In [14]:
import gensim
import warnings
warnings.filterwarnings('ignore')

**Movie review data**

In [15]:
import pandas as pd

# Tab-separated file whose first row is the header. quoting=3 is
# csv.QUOTE_NONE: quote characters are kept as literal text in the fields.
df = pd.read_csv(
    'unlabeledTrainData.tsv',
    sep="\t",
    header=0,
    quoting=3,
)
df.head()

Unnamed: 0,id,review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was..."
1,"""45057_0""","""I saw this film about 20 years ago and rememb..."
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B..."
3,"""7161_0""","""I went to see this film with a great deal of ..."
4,"""43971_0""","""Yes, I agree with everyone on this site this ..."


**Function to clean up the data**

In [16]:
import re, string

def clean_str(string):
  """
  Normalise raw review text into lower-case, space-separated alphabetic words.

  Steps: drop URLs, replace every character outside A-Z/a-z with a space,
  lower-case the text and collapse whitespace runs into single spaces.

  Args:
    string: Raw review text. The parameter name shadows the imported
      ``string`` module inside this function; it is kept unchanged so that
      existing keyword callers keep working.

  Returns:
    The cleaned text, or "" when the input is not a ``str`` (for example
    ``None`` or a float NaN).
  """
  # Explicit type check instead of a bare ``except:``, which would also hide
  # KeyboardInterrupt and genuine programming errors.
  if not isinstance(string, str):
    return ""
  # Remove URLs anywhere in the text. The match stops at whitespace or an
  # angle bracket so adjacent HTML markup is handled by the next step exactly
  # as before.
  string = re.sub(r'https?://[^\s<>]+', ' ', string)
  # Everything that is not an ASCII letter becomes a separator.
  string = re.sub(r"[^A-Za-z]", " ", string)
  # split() with no argument never yields empty tokens, so no extra filtering
  # is needed before re-joining.
  return " ".join(string.lower().split())

In [17]:
# Store a cleaned copy of each review next to the raw text.
df['clean_review'] = df['review'].map(clean_str)
df.head()

Unnamed: 0,id,review,clean_review
0,"""9999_0""","""Watching Time Chasers, it obvious that it was...",watching time chasers it obvious that it was m...
1,"""45057_0""","""I saw this film about 20 years ago and rememb...",i saw this film about years ago and remember i...
2,"""15561_0""","""Minor Spoilers<br /><br />In New York, Joan B...",minor spoilers br br in new york joan barnard ...
3,"""7161_0""","""I went to see this film with a great deal of ...",i went to see this film with a great deal of e...
4,"""43971_0""","""Yes, I agree with everyone on this site this ...",yes i agree with everyone on this site this mo...


**Review to Word List**

In [18]:
# One token list per cleaned review, in DataFrame order.
# split() without an argument is used instead of split(' ') so that an empty
# cleaned review produces [] rather than [''], which would otherwise feed an
# empty-string "word" into the model's training data.
documents = [doc.split() for doc in df['clean_review']]

print(len(documents))
print(documents[0])

50000
['watching', 'time', 'chasers', 'it', 'obvious', 'that', 'it', 'was', 'made', 'by', 'a', 'bunch', 'of', 'friends', 'maybe', 'they', 'were', 'sitting', 'around', 'one', 'day', 'in', 'film', 'school', 'and', 'said', 'hey', 'let', 's', 'pool', 'our', 'money', 'together', 'and', 'make', 'a', 'really', 'bad', 'movie', 'or', 'something', 'like', 'that', 'what', 'ever', 'they', 'said', 'they', 'still', 'ended', 'up', 'making', 'a', 'really', 'bad', 'movie', 'dull', 'story', 'bad', 'script', 'lame', 'acting', 'poor', 'cinematography', 'bottom', 'of', 'the', 'barrel', 'stock', 'music', 'etc', 'all', 'corners', 'were', 'cut', 'except', 'the', 'one', 'that', 'would', 'have', 'prevented', 'this', 'film', 's', 'release', 'life', 's', 'like', 'that']


In [19]:
# Train a Word2Vec model on the tokenised reviews
model = gensim.models.Word2Vec(
    sentences=documents,  # one list of tokens per review
    vector_size=300,      # dimensionality of each word embedding
    window=5,             # maximum distance between current and predicted word
    min_count=5,          # ignore words whose total frequency is lower than this
    epochs=10,            # number of passes over the corpus
    workers=6,            # worker threads used for training
)

**Embedding Vectors for Individual Words**

In [20]:
# Learned embedding for the word 'fantastic'
model.wv.get_vector('fantastic')

array([-0.6129727 ,  0.03158724, -0.76126605,  0.07857632,  1.0769459 ,
       -0.3723051 , -0.8580457 , -2.116642  , -0.8613273 ,  1.167159  ,
       -0.19922747,  0.45473295,  0.48717636, -0.96262646,  0.4570663 ,
       -0.33170342, -0.5648217 ,  0.55435133,  0.04163346,  0.49153295,
       -0.7493846 , -0.32686207, -0.27267358, -0.20102808, -1.2726067 ,
        0.00394259, -1.0117022 , -0.44632164,  0.5445328 , -0.68986213,
       -0.01636921, -0.03826097, -0.60647863, -0.5248712 , -0.44557777,
        0.3645087 , -0.95385087,  1.4731786 ,  0.29206336, -0.529396  ,
       -1.092436  ,  0.4813883 , -0.20871745,  0.13180968, -1.2102152 ,
        0.40938166,  0.3339924 ,  1.4949929 , -0.3253896 , -0.23088454,
       -0.6295665 , -0.4684444 ,  0.5617911 , -1.2629836 , -0.17673829,
        0.17421454, -0.27977252,  0.13977978,  0.2053538 ,  0.88568103,
       -0.8179156 ,  0.06108671,  1.0412977 ,  0.9926023 ,  0.36246687,
       -1.048939  , -0.6633383 ,  0.38076702, -0.06178578, -0.89

In [21]:
# Learned embedding for the word 'pathetic'
model.wv.get_vector('pathetic')

array([-4.78994995e-01,  2.33480036e-02,  1.59555420e-01, -9.31204110e-02,
        4.06616479e-01,  7.12589204e-01, -5.07033288e-01, -7.87479758e-01,
        3.73401046e-01,  4.26930666e-01, -4.03651237e-01, -1.91450262e+00,
        2.95414031e-01, -1.32307768e+00, -4.90595132e-01,  2.41224051e-01,
       -1.63986766e+00, -5.10002553e-01, -4.90193861e-03,  1.42719567e+00,
        2.86566734e-01,  7.63135374e-01, -9.05259490e-01, -1.03871214e+00,
       -5.23563564e-01,  9.06619728e-01,  4.05460387e-01, -7.88633347e-01,
        1.09709188e-01,  4.60618228e-01, -9.12347555e-01, -1.48872480e-01,
        2.01003027e+00, -2.95650005e-01, -9.56210732e-01,  1.20557201e+00,
       -7.28900373e-01,  1.38988972e+00, -1.06541443e+00,  6.53742015e-01,
        1.01946628e+00, -9.44862328e-03,  2.80512542e-01,  1.10697949e+00,
       -1.58003315e-01,  1.11936353e-01, -7.06149518e-01, -6.55488074e-01,
        6.03442430e-01,  7.39882946e-01, -8.24782729e-01, -1.54603496e-01,
        4.17431295e-01,  

**Similarity of the Words**

In [22]:
# Ten nearest neighbours of 'fantastic' in the embedding space
model.wv.most_similar(positive=['fantastic'], topn=10)

[('terrific', 0.7814262509346008),
 ('fabulous', 0.7690280079841614),
 ('wonderful', 0.7650262117385864),
 ('superb', 0.7426376938819885),
 ('phenomenal', 0.7359784245491028),
 ('great', 0.7234622836112976),
 ('brilliant', 0.7121571898460388),
 ('marvelous', 0.7044367790222168),
 ('excellent', 0.6931370496749878),
 ('stunning', 0.673054039478302)]

In [23]:
# Ten nearest neighbours of 'pathetic' in the embedding space
model.wv.most_similar(positive=['pathetic'], topn=10)

[('pitiful', 0.6980125904083252),
 ('laughable', 0.6591284871101379),
 ('lame', 0.653693437576294),
 ('ridiculous', 0.6456886529922485),
 ('stupid', 0.6244801878929138),
 ('horrible', 0.6083274483680725),
 ('horrendous', 0.6050158143043518),
 ('atrocious', 0.5868836641311646),
 ('dreadful', 0.581160306930542),
 ('terrible', 0.5809403657913208)]

In [24]:
# Ten nearest neighbours of 'bollywood' in the embedding space
model.wv.most_similar(positive=['bollywood'], topn=10)

[('hindi', 0.6735062003135681),
 ('hollywood', 0.5615265965461731),
 ('mainstream', 0.5589401125907898),
 ('hk', 0.5137766599655151),
 ('tamil', 0.49169084429740906),
 ('blaxploitation', 0.4800868332386017),
 ('filipino', 0.47959044575691223),
 ('nowadays', 0.47797441482543945),
 ('genre', 0.4712223708629608),
 ('sf', 0.47045210003852844)]

In [None]:
# Words seen fewer than min_count times are dropped from the vocabulary, and
# most_similar raises KeyError for a missing word. Check membership first and
# show an empty result instead of a traceback if 'mumbai' was pruned.
model.wv.most_similar('mumbai') if 'mumbai' in model.wv.key_to_index else []

**Saving the Model**

In [None]:
# Persist the trained model under the relative path 'word2vec-movie-dataset'
model.save('word2vec-movie-dataset')

**Loading the Model**

In [None]:
# Load the previously saved model back from disk (same path used by model.save)
model = gensim.models.Word2Vec.load('word2vec-movie-dataset')

**Relationship**

1. Equation: king + man = queen + ?, i.e. ? = king + man − queen
2. In this case there may not be enough data for this equation to resolve cleanly

In [None]:
# Nearest neighbours of the combined vector king + man - queen
model.wv.most_similar(
    positive=['king', 'man'],
    negative=['queen'],
    topn=10,
)

[('master', 0.3433687388896942),
 ('filmmaker', 0.3381901681423187),
 ('joker', 0.3330538272857666),
 ('ness', 0.3229394555091858),
 ('men', 0.317975789308548),
 ('genius', 0.30485010147094727),
 ('batman', 0.3044992685317993),
 ('actor', 0.3033349812030792),
 ('soderbergh', 0.2990267872810364),
 ('boy', 0.28895556926727295)]

1. We do not get the right mapping here because there is not enough data for this relationship to produce better results.
2. We can try training the model for more epochs.

In [None]:
# Nearest neighbours of the combined vector woman + king - man
model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=10,
)

[('princess', 0.4584004580974579),
 ('prince', 0.4026755094528198),
 ('mistress', 0.4006645381450653),
 ('antoinette', 0.4002799689769745),
 ('queen', 0.3930467963218689),
 ('margaret', 0.3915702998638153),
 ('countess', 0.38586390018463135),
 ('mai', 0.38177311420440674),
 ('norma', 0.37973058223724365),
 ('marie', 0.37907224893569946)]

In [None]:
# Nearest neighbours of the combined vector father + mother - daughter
model.wv.most_similar(
    positive=['father', 'mother'],
    negative=['daughter'],
    topn=10,
)

[('dad', 0.5750482678413391),
 ('grandfather', 0.5738402009010315),
 ('mom', 0.5605438947677612),
 ('son', 0.5217747092247009),
 ('grandmother', 0.5186290144920349),
 ('parents', 0.47951704263687134),
 ('brother', 0.47604134678840637),
 ('husband', 0.46357232332229614),
 ('uncle', 0.45768845081329346),
 ('grandma', 0.45311906933784485)]

In [None]:
# Raw vector arithmetic: mother + father - daughter (same left-to-right order)
(
    model.wv.get_vector('mother')
    + model.wv.get_vector('father')
    - model.wv.get_vector('daughter')
)

array([ 1.25944877e+00,  2.25785971e+00,  9.96634126e-01,  1.13741517e+00,
        1.44470322e+00, -5.36391020e-01, -1.65538669e-01, -9.36322331e-01,
        4.59440351e-01,  2.11135268e+00, -2.76659489e-01, -4.69175965e-01,
        1.53987372e+00, -1.82529414e+00, -3.28232855e-01, -5.25665760e-01,
        2.02382088e+00, -8.79832134e-02,  3.12511325e-01, -9.10004258e-01,
       -1.72134686e+00,  2.30993295e+00, -2.03554296e+00, -7.00923085e-01,
        2.22210944e-01,  1.25676513e+00, -1.60634518e-02, -3.19600224e-01,
        5.74525714e-01, -1.60235155e+00, -9.03960586e-01, -1.95198870e+00,
       -7.01595068e-01, -4.41825151e-01, -7.09805012e-01, -4.89496469e-01,
       -3.67840767e-01,  3.37533712e-01,  1.36266041e+00, -5.35406232e-01,
       -2.55608678e-01,  2.36379027e-01, -2.68875504e+00,  5.55121422e-01,
        3.38687837e-01, -1.08207679e+00,  3.23990345e-01,  2.58333087e-01,
        1.53264880e+00, -1.35585809e+00,  1.11913896e+00,  1.27048445e+00,
       -4.73133475e-01,  