In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [35]:
data = pd.read_csv('simpsons_dataset.csv')

In [20]:
data.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [38]:
data.duplicated().sum()

5207

In [39]:
data.drop_duplicates(inplace=True)

In [40]:
data.shape

(126646, 2)

In [36]:
data.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [37]:
data.dropna(inplace=True)

In [28]:
data.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [41]:
data.shape

(126646, 2)

In [42]:
data.head(50)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
10,Landlady,"Hey, hey, he Moved out this morning. He must h..."


In [43]:
sw = stopwords.words('english')
lem = WordNetLemmatizer()

def clean(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z]',' ',text)
    text = re.sub(r'\b\w{1}\b','',text)
    text = re.sub(r'\s+',' ',text)
    text = [lem.lemmatize(words) for words in text.split() if words not in sw]
    return text

In [44]:
data['spoken_words'] = data['spoken_words'].apply(lambda x:clean(x))

In [45]:
data.isnull().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [46]:
data.head(50)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"[actually, little, sometimes, disease, magazin..."
1,Lisa Simpson,"[mr, bergstrom]"
2,Miss Hoover,"[know, although, sure, like, talk, touch, less..."
3,Lisa Simpson,"[life, worth, living]"
4,Edna Krabappel-Flanders,"[poll, open, end, recess, case, decided, put, ..."
5,Martin Prince,"[think, anything, left, say]"
6,Edna Krabappel-Flanders,[bart]
7,Bart Simpson,"[victory, party, slide]"
9,Lisa Simpson,"[mr, bergstrom, mr, bergstrom]"
10,Landlady,"[hey, hey, moved, morning, must, new, job, too..."


In [47]:
data.reset_index(inplace=True,drop=True)

In [48]:
data.columns

Index(['raw_character_text', 'spoken_words'], dtype='object')

In [49]:
data.shape

(126646, 2)

In [50]:
sentence = data['spoken_words'].tolist()

In [51]:
len(sentence)

126646

In [52]:
w2v = Word2Vec(sentences=sentence,
                     min_count=20,
                     window=2,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

In [72]:
w2v.save('w2v.h5')

In [53]:
w2v.corpus_count

126646

In [54]:
X = w2v.wv.index_to_key

In [55]:
len(X)

3804

In [17]:
y = w2v.wv[X]

In [52]:
pc = PCA(n_components=2)
result = pc.fit_transform(y)

In [54]:
result

array([[-2.2488168e-02, -1.1559102e-03],
       [-9.3922643e-03,  4.7666801e-04],
       [-1.6811268e-02,  2.1724417e-03],
       ...,
       [ 1.8918341e-02,  8.4056528e-03],
       [ 2.1092586e-02,  1.2560648e-02],
       [-6.0689174e-05,  5.0108572e-03]], dtype=float32)

In [59]:
w2v.wv.most_similar(positive=["bart"], topn=3)

[('something', 0.9989799857139587),
 ('homer', 0.998867928981781),
 ('part', 0.9988269209861755)]

In [71]:
data[data.spoken_words.str.contains('bart', regex=False)].spoken_words

6                                                    [bart]
23                                             [bart, vote]
52        [bart, get, one, vote, oh, worst, thing, ever,...
135                                            [bart, come]
139                                     [bart, thing, cute]
                                ...                        
126533    [child, bart, need, someone, reliable, deliver...
126536    [people, told, encouraging, bart, win, approva...
126537                                          [yay, bart]
126581    [popularity, contest, excuse, important, popul...
126637                                               [bart]
Name: spoken_words, Length: 3312, dtype: object