This Notebook is a demonstration on how word2vec was trained on the imdb dataset

In [2]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors

pd.set_option('display.max_colwidth', 100)

In [3]:
# Data Exploration
main_data = pd.read_csv('/Users/andrewsimon/Desktop/IMDBDataset.csv.zip')
main_data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked....,positive
1,A wonderful little production. <br /><br />The filming technique is very unassuming- very old-ti...,positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei off...",positive
...,...,...
49995,"I thought this movie did a down right good job. It wasn't as creative or original as the first, ...",positive
49996,"Bad plot, bad dialogue, bad acting, idiotic directing, the annoying porn groove soundtrack that ...",negative
49997,"I am a Catholic taught in parochial elementary schools by nuns, taught by Jesuit priests in high...",negative
49998,I'm going to have to disagree with the previous comment and side with Maltin on this one. This i...,negative


In [4]:
# Clean data using the built in cleaner in gensim
main_data['review_clean'] = main_data['review'].apply(lambda x: gensim.utils.simple_preprocess(x))
main_data.head()

Unnamed: 0,review,sentiment,review_clean
0,One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked....,positive,"[one, of, the, other, reviewers, has, mentioned, that, after, watching, just, oz, episode, you, ..."
1,A wonderful little production. <br /><br />The filming technique is very unassuming- very old-ti...,positive,"[wonderful, little, production, br, br, the, filming, technique, is, very, unassuming, very, old..."
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,"[thought, this, was, wonderful, way, to, spend, time, on, too, hot, summer, weekend, sitting, in..."
3,Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his...,negative,"[basically, there, family, where, little, boy, jake, thinks, there, zombie, in, his, closet, his..."
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei off...",positive,"[petter, mattei, love, in, the, time, of, money, is, visually, stunning, film, to, watch, mr, ma..."


In [45]:
# Encoding the label column
main_data['sentiment']=main_data['sentiment'].map({'positive':1,'negative':0})
# Split data into train and test sets
main_data_sample = main_data.sample(n=5000)

X_train, X_test, y_train, y_test = train_test_split (main_data_sample['review_clean'], main_data_sample['sentiment'] , test_size=0.2)

In [46]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [47]:
# Creating a dictionary of words for vectorization
words = list(set(w2v_model.wv.index_to_key))

word_vec_dict = {}

for word in words:
    word_vec_dict[word] = w2v_model.wv[word]

word_df = pd.DataFrame(data=word_vec_dict)
word_df


Unnamed: 0,aames,foolish,havoc,emerge,dish,humorous,sails,junk,mesopotamia,moranis,...,mission,yin,hickok,anyhoo,recap,skews,wider,dystopian,task,homicide
0,-0.006637,-0.026095,-0.012659,0.001410,-0.025536,-0.107040,-0.001525,-0.147518,0.007695,-0.017134,...,0.018161,0.004444,0.012015,-0.016125,-0.032323,-0.002046,-0.001967,-0.034614,-0.018277,-0.031589
1,0.067785,0.045657,0.159102,0.089327,0.028035,0.220085,0.034864,0.114705,0.008472,0.029219,...,0.284352,0.029659,0.062548,0.005541,0.049966,0.001175,0.125714,0.050639,0.233882,0.098056
2,0.018856,0.011911,0.046187,0.029989,-0.004700,0.092782,0.009914,0.139140,0.026340,0.030850,...,0.115005,0.020529,0.054774,0.003760,0.018405,-0.008875,0.031547,0.024183,0.095049,-0.003251
3,0.043726,0.000471,0.126216,0.129696,0.016226,0.198319,0.037983,-0.045065,0.016763,0.039102,...,0.247125,0.017928,0.100974,-0.016878,0.019286,0.009859,0.071981,0.015736,0.123026,0.047865
4,0.028143,0.044383,0.117305,0.050213,0.015893,0.098992,0.020870,0.064228,0.032951,0.026383,...,0.351536,0.015126,0.044486,0.000724,0.005644,-0.026118,0.034986,-0.002758,0.202832,0.062489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.013646,0.034839,0.063074,0.019462,0.019806,0.099964,0.017541,0.234336,0.014021,0.021947,...,0.319364,0.028939,-0.025614,0.005032,0.007253,0.017249,0.094943,0.010957,0.053895,0.021588
96,0.008054,-0.011834,0.097680,0.032446,0.007809,-0.017835,0.008198,-0.017809,0.016668,-0.012668,...,0.099531,-0.012437,-0.000956,0.030480,0.007611,0.008184,0.025032,0.022327,0.066675,0.017440
97,-0.029598,-0.029505,-0.083333,-0.073562,-0.026861,-0.118996,-0.029455,-0.094915,-0.025469,-0.042264,...,-0.224752,-0.015967,-0.029181,-0.007631,-0.015234,-0.008785,-0.022896,-0.020112,-0.094105,-0.049982
98,-0.008325,0.015349,0.010150,-0.029213,0.014798,-0.048759,0.000878,0.146990,-0.012613,-0.000840,...,-0.037780,-0.009520,-0.053148,0.024327,0.023400,0.010065,0.047195,0.007382,-0.010285,0.025119


In [48]:
# Vectorization of the word dictionary for training of our model
X_train_vect = []

train_counter = 0

for review in X_train:
    train_counter = train_counter + 1
    print(str(train_counter) + '/' + str(len(X_train)))
    count = 0
    word_vec_avg = np.zeros(100)
    for word in review:
        if word in words:
            count = count + 1
            word_vec_avg = word_vec_avg + word_df[word]
    word_vec_avg = word_vec_avg / count
    X_train_vect.append(word_vec_avg)

X_test_vect = []
test_counter = 0

for review in X_test:
    test_counter = test_counter + 1
    print(str(test_counter) + '/' + str(len(X_test)))
    count = 0
    word_vec_avg = np.zeros(100)
    for word in review:
        if word in words:
            count = count + 1
            word_vec_avg = word_vec_avg + word_df[word]
    word_vec_avg = word_vec_avg / count
    X_test_vect.append(word_vec_avg)


1/4000
2/4000
3/4000
4/4000
5/4000
6/4000
7/4000
8/4000
9/4000
10/4000
11/4000
12/4000
13/4000
14/4000
15/4000
16/4000
17/4000
18/4000
19/4000
20/4000
21/4000
22/4000
23/4000
24/4000
25/4000
26/4000
27/4000
28/4000
29/4000
30/4000
31/4000
32/4000
33/4000
34/4000
35/4000
36/4000
37/4000
38/4000
39/4000
40/4000
41/4000
42/4000
43/4000
44/4000
45/4000
46/4000
47/4000
48/4000
49/4000
50/4000
51/4000
52/4000
53/4000
54/4000
55/4000
56/4000
57/4000
58/4000
59/4000
60/4000
61/4000
62/4000
63/4000
64/4000
65/4000
66/4000
67/4000
68/4000
69/4000
70/4000
71/4000
72/4000
73/4000
74/4000
75/4000
76/4000
77/4000
78/4000
79/4000
80/4000
81/4000
82/4000
83/4000
84/4000
85/4000
86/4000
87/4000
88/4000
89/4000
90/4000
91/4000
92/4000
93/4000
94/4000
95/4000
96/4000
97/4000
98/4000
99/4000
100/4000
101/4000
102/4000
103/4000
104/4000
105/4000
106/4000
107/4000
108/4000
109/4000
110/4000
111/4000
112/4000
113/4000
114/4000
115/4000
116/4000
117/4000
118/4000
119/4000
120/4000
121/4000
122/4000
123/4000
1

In [49]:
# Importing classifier
from sklearn.tree import DecisionTreeClassifier
clf_decision_word2vec = DecisionTreeClassifier()

clf_decision_word2vec.fit(X_train_vect, y_train)



DecisionTreeClassifier()

In [50]:
# Making predictions from classifier
from sklearn.metrics import accuracy_score
predictions = clf_decision_word2vec.predict(X_test_vect)
print(accuracy_score(y_test, predictions))



0.595


In [109]:
# Unit testing
df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])

assert isinstance(df, pd.DataFrame) == True

In [114]:
# Unit testing 
df = pd.DataFrame([[-1, 0, 1], [1, 0, -1], [.5, 0, .5]])
df = df.sample(n=1)
assert len(df) == 1