In [52]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
import random
import json
import nltk
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
nltk.download('stopwords')
import re
import seaborn as sns
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/volthai7us/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
def clean_and_split(text):
    words = re.findall(r'\b\w+\b', text.lower())
    return words

def preprocess_speaker(speaker): 
    file_path = f'./data/transcripts/friends-1-227-{speaker}-pair.json'

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
        
        
    all_answers = [item['answer'] for item in data]

    all_answers = [clean_and_split(answer) for answer in all_answers]

    stop_words = set(stopwords.words('english'))
    
    
    def remove_stopwords(answer):
        filtered_words = [word for word in answer if word.lower() not in stop_words]
        return ' '.join(filtered_words)

    all_answers = [remove_stopwords(answer) for answer in all_answers]

    return all_answers

In [54]:
speakers = ("Rachel", "Chandler", "Phoebe", "Monica", "Ross", "Joey")

all_data = []

for index, speaker in enumerate(speakers):
    all_answers = preprocess_speaker(speaker=speaker)
    speaker_data = [(answer, index) for answer in all_answers]
    sample_size = min(5000, len(speaker_data))
    random_sample = random.sample(speaker_data, sample_size)
    all_data.extend(random_sample)
    
speaker_combinations = []

for speaker1 in speakers:
    for speaker2 in speakers:
        combination = f"{speaker1}-{speaker2}"
        speaker_combinations.append(combination)

df = pd.DataFrame(all_data, columns=['text', 'speaker'])
df = df.sample(frac=1)

In [55]:
rachel_df = df[df['speaker'] == 0]
other_df = df[df['speaker'] != 0]
other_df = other_df.sample(frac=1)
other_df = other_df[:rachel_df.shape[0]]
other_df['speaker'] = 1
df = pd.concat([rachel_df, other_df])
df = df.sample(frac=1)
speakers = ["Rachel", "Other"]
df = df.reset_index()

In [56]:
from bs4 import BeautifulSoup

def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text) 
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')
    return text
df['text'] = df['text'].apply(cleanText)

In [57]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            #if len(word) < 0:
            if len(word) <= 0:
                continue
            tokens.append(word.lower())
    return tokens

train, test = train_test_split(df, test_size=0.00001 , random_state=42)

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r['speaker']]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r['speaker']]), axis=1)

In [58]:
max_fatures = 500000
MAX_SEQUENCE_LENGTH = 30

In [59]:
tokenizer = Tokenizer(num_words=max_fatures, split=' ', filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)
print('Found %s unique tokens.' % len(X))

Found 9094 unique tokens.


In [60]:
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (9094, 30)


In [61]:
train_tagged.values

array([TaggedDocument(words=['brooklyn'], tags=[0]),
       TaggedDocument(words=['fine', 'right', 'let', 'ah', 'let', 'take', 'break', 'let', 'cool', 'okay', 'let', 'get', 'frozen', 'yoghurt', 'something'], tags=[1]),
       TaggedDocument(words=['really', 'think', 'say', 'goodbye', 'care'], tags=[0]),
       ..., TaggedDocument(words=['tired'], tags=[1]),
       TaggedDocument(words=['wow', 'tough', 'one', 'think', 'gon', 'na', 'go', 'dog'], tags=[0]),
       TaggedDocument(words=['lookin', 'upside', 'know', 'matter'], tags=[0])],
      dtype=object)

In [62]:
d2v_model = Doc2Vec(dm=1, dm_mean=1, window=8, min_count=1, workers=1, alpha=0.065, min_alpha=0.065)
d2v_model.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 9093/9093 [00:00<00:00, 4450788.46it/s]


In [63]:
%%time
for epoch in range(30):
    d2v_model.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    d2v_model.alpha -= 0.002
    d2v_model.min_alpha = d2v_model.alpha

100%|██████████| 9093/9093 [00:00<00:00, 6042269.69it/s]
100%|██████████| 9093/9093 [00:00<00:00, 6480680.76it/s]
100%|██████████| 9093/9093 [00:00<00:00, 6134599.69it/s]
100%|██████████| 9093/9093 [00:00<00:00, 6613283.56it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7065358.70it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7119433.69it/s]
100%|██████████| 9093/9093 [00:00<00:00, 6025087.88it/s]
100%|██████████| 9093/9093 [00:00<00:00, 3288678.65it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7098233.07it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7189218.90it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7114121.67it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7099554.41it/s]
100%|██████████| 9093/9093 [00:00<00:00, 6972359.46it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7021135.18it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7171644.65it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7159528.12it/s]
100%|██████████| 9093/9093 [00:00<00:00, 7069287.54it/s]
100%|██████████| 9093/9093 [00:

CPU times: user 3.8 s, sys: 56.4 ms, total: 3.86 s
Wall time: 3.87 s


In [64]:
embedding_matrix = np.zeros((len(d2v_model.wv.index_to_key)+ 1, 30))

In [65]:
for i, vec in enumerate(d2v_model.dv.vectors):
    while i in vec <= 1000:
          embedding_matrix[i]=vec

In [66]:
d2v_model.wv.most_similar(positive=['phoebe'], topn=10)

[('tim', 0.5800897479057312),
 ('sous', 0.5617893934249878),
 ('goodie', 0.551600992679596),
 ('buffay', 0.5263261198997498),
 ('steer', 0.5143299698829651),
 ('intentions', 0.5115000605583191),
 ('phaybobo', 0.49510785937309265),
 ('quantity', 0.4938683807849884),
 ('sock', 0.4937157928943634),
 ('ann', 0.4837372899055481)]

In [67]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding
import tensorflow as tf

# init layer
model = Sequential()

# emmbed word vectors
model.add(Embedding(len(d2v_model.wv.index_to_key)+1,30,input_length=X.shape[1],weights=[embedding_matrix],trainable=True))

# learn the correlations
def split_input(sequence):
     return sequence[:-1], tf.reshape(sequence[1:], (-1,1))
model.add(LSTM(30,return_sequences=False))
model.add(Dense(12,activation="softmax"))
model.add(Dense(2,activation="softmax"))

# output model skeleton
model.summary()
model.compile(optimizer="adam",loss="binary_crossentropy",metrics=['acc'])

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 30, 30)            164820    
                                                                 
 lstm_7 (LSTM)               (None, 30)                7320      
                                                                 
 dense_5 (Dense)             (None, 12)                372       
                                                                 
 dense_6 (Dense)             (None, 2)                 26        
                                                                 
Total params: 172538 (673.98 KB)
Trainable params: 172538 (673.98 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [68]:
Y = pd.get_dummies(df['speaker']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7729, 30) (7729, 2)
(1365, 30) (1365, 2)


In [71]:
batch_size = 32
history=model.fit(X_train, Y_train, epochs=50, batch_size=batch_size, verbose = 2)

Epoch 1/50
242/242 - 1s - loss: 0.3783 - acc: 0.8586 - 1s/epoch - 6ms/step
Epoch 2/50
242/242 - 1s - loss: 0.3654 - acc: 0.8660 - 1s/epoch - 5ms/step
Epoch 3/50
242/242 - 1s - loss: 0.3586 - acc: 0.8675 - 1s/epoch - 5ms/step
Epoch 4/50
242/242 - 1s - loss: 0.3532 - acc: 0.8702 - 1s/epoch - 5ms/step
Epoch 5/50
242/242 - 1s - loss: 0.3431 - acc: 0.8753 - 1s/epoch - 5ms/step
Epoch 6/50
242/242 - 1s - loss: 0.3382 - acc: 0.8749 - 1s/epoch - 5ms/step
Epoch 7/50
242/242 - 1s - loss: 0.3301 - acc: 0.8793 - 1s/epoch - 5ms/step
Epoch 8/50
242/242 - 1s - loss: 0.3256 - acc: 0.8794 - 1s/epoch - 5ms/step
Epoch 9/50
242/242 - 1s - loss: 0.3177 - acc: 0.8828 - 1s/epoch - 5ms/step
Epoch 10/50
242/242 - 1s - loss: 0.3142 - acc: 0.8814 - 1s/epoch - 5ms/step
Epoch 11/50
242/242 - 1s - loss: 0.3120 - acc: 0.8837 - 1s/epoch - 5ms/step
Epoch 12/50
242/242 - 1s - loss: 0.3066 - acc: 0.8863 - 1s/epoch - 5ms/step
Epoch 13/50
242/242 - 1s - loss: 0.2980 - acc: 0.8898 - 1s/epoch - 5ms/step
Epoch 14/50
242/242 -

In [72]:
# evaluate the model
_, train_acc = model.evaluate(X_train, Y_train, verbose=2)
_, test_acc = model.evaluate(X_test, Y_test, verbose=2)
print('Train: %.3f, Test: %.4f' % (train_acc, test_acc))

242/242 - 0s - loss: 0.1849 - acc: 0.9238 - 296ms/epoch - 1ms/step
43/43 - 0s - loss: 1.5099 - acc: 0.5451 - 61ms/epoch - 1ms/step
Train: 0.924, Test: 0.5451
