## Word Embedding Models Comparison

In [1]:
import pandas as pd
import nltk
from gensim.models import Word2Vec, KeyedVectors



### First Model - Large PreTrained Google News bin

In [2]:
GoogleModel = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Testing the model

In [37]:
GoogleModel.most_similar('girl')

[('boy', 0.8543272018432617),
 ('teenage_girl', 0.7927976846694946),
 ('woman', 0.7494641542434692),
 ('teenager', 0.7172499299049377),
 ('schoolgirl', 0.7075953483581543),
 ('teenaged_girl', 0.6650916934013367),
 ('daughter', 0.6489864587783813),
 ('mother', 0.64781653881073),
 ('toddler', 0.6473966240882874),
 ('girls', 0.6154742240905762)]

Placing the results of the most_similar into a dataframe

In [28]:
manList = GoogleModel.most_similar(positive=['man'], topn=15)
dfman = pd.DataFrame(manList, columns = ['Most Similar' , 'Vector Accuracy'])
print (dfman['Most Similar'].head(10))

0                       woman
1                         boy
2                    teenager
3                teenage_girl
4                        girl
5    suspected_purse_snatcher
6                      robber
7             Robbery_suspect
8                   teen_ager
9                         men
Name: Most Similar, dtype: object


Getting the most similar words from a list of words using a loop

In [35]:
words = ["Trump", "Obama", "Clinton" ]
for word in words:
    print("-- Word: %s" % word)
    result = GoogleModel.most_similar(positive=word)
    dfloop = pd.DataFrame(result, columns = ['Most Similar' , 'Vector Accuracy'])
    print (dfloop['Most Similar'])

-- Word: Trump
0                Donald_Trump
1    impersonator_entertained
2                Ivanka_Trump
3                      Ivanka
4          mogul_Donald_Trump
5                 Trump_Tower
6                     Kepcher
7    billionaire_Donald_Trump
8                   Trumpster
9         tycoon_Donald_Trump
Name: Most Similar, dtype: object
-- Word: Obama
0              Barack_Obama
1    President_Barack_Obama
2                    McCain
3                   Clinton
4          Illinois_senator
5                     Biden
6                      Bush
7                    Barack
8               White_House
9        elect_Barack_Obama
Name: Most Similar, dtype: object
-- Word: Clinton
0            Hillary_Clinton
1                      Obama
2               Bill_Clinton
3     Hillary_Rodham_Clinton
4       Sen._Hillary_Clinton
5                    Hillary
6    Senator_Hillary_Clinton
7                     McCain
8                   Clintons
9               Barack_Obama
Name: Most Simi

In [36]:
vec2 = GoogleModel['music'] - GoogleModel['piano'] + GoogleModel['drum']
GoogleModel.most_similar([vec2])

[('drum', 0.6940563321113586),
 ('drumming', 0.513245701789856),
 ('music', 0.5001342296600342),
 ('Drum', 0.40797555446624756),
 ('hip_hop', 0.4001891314983368),
 ('bands', 0.3928840458393097),
 ('bhangra', 0.3885735273361206),
 ('reggae_dancehall', 0.38298097252845764),
 ('reggae', 0.38292473554611206),
 ('reggae_hip_hop', 0.3807390034198761)]

## Model 2 - Self Trained Reddit Model

Reading in a csv of a medium size to a dataframe, and sepreating out the required 'Title' column to be used to train the model. 

In [39]:
dfReddit = pd.read_csv('RedditNews.csv')
dfReddit.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [41]:
newTitles = dfReddit["title"]
newTitles.shape

(509236,)

Tokenize the newTitles list and print the first 10 objects in the list. 

In [45]:
newsVec = [nltk.word_tokenize(title) for title in newTitles]
newsVec[:10]

[['Scores', 'killed', 'in', 'Pakistan', 'clashes'],
 ['Japan', 'resumes', 'refuelling', 'mission'],
 ['US', 'presses', 'Egypt', 'on', 'Gaza', 'border'],
 ['Jump-start', 'economy', ':', 'Give', 'health', 'care', 'to', 'all'],
 ['Council', 'of', 'Europe', 'bashes', 'EU', '&', 'UN', 'terror', 'blacklist'],
 ['Hay',
  'presto',
  '!',
  'Farmer',
  'unveils',
  'the',
  'illegal',
  'mock-Tudor',
  'castle',
  'he',
  'tried',
  'to',
  'hide',
  'behind',
  '40ft',
  'hay',
  'bales'],
 ['Strikes',
  ',',
  'Protests',
  'and',
  'Gridlock',
  'at',
  'the',
  'Poland-Ukraine',
  'Border'],
 ['The', 'U.N', '.', 'Mismanagement', 'Program'],
 ['Nicolas', 'Sarkozy', 'threatens', 'to', 'sue', 'Ryanair'],
 ['US',
  'plans',
  'for',
  'missile',
  'shields',
  'in',
  'Polish',
  'town',
  'met',
  'with',
  'resistance',
  '[',
  'video',
  ']']]

### Using Word2Vec, assign each word in the model a vector of size 100

In [46]:
RedditModel = Word2Vec(newsVec,min_count=1,size=100)

In [47]:
RedditModel.wv.most_similar('man')

[('woman', 0.898088812828064),
 ('teenager', 0.8351012468338013),
 ('boy', 0.8271929025650024),
 ('girl', 0.8181493282318115),
 ('couple', 0.7930596470832825),
 ('teen', 0.752565324306488),
 ('mother', 0.7476451992988586),
 ('doctor', 0.7413562536239624),
 ('teacher', 0.7396202087402344),
 ('policeman', 0.7272433638572693)]

## Model 3 - Small Wiki Dump bin

In [48]:
WikiModel = KeyedVectors.load_word2vec_format('wiki100k-w2v.cbow.bin', binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [49]:
WikiModel.most_similar('girl')

[('boy', 0.7120599746704102),
 ('destiny', 0.7029043436050415),
 ('love', 0.6940176486968994),
 ('lovers', 0.6887222528457642),
 ('sorry', 0.6821417808532715),
 ('teenage', 0.6736853718757629),
 ('beasts', 0.667998194694519),
 ('mercy', 0.6590928435325623),
 ('hello', 0.657588005065918),
 ('restless', 0.6566181182861328)]

## Simple Comparisons of the 3 Models

In [52]:
GoogleModel.most_similar('dog')

[('dogs', 0.868048906326294),
 ('puppy', 0.8106427192687988),
 ('pit_bull', 0.7803961038589478),
 ('pooch', 0.7627377510070801),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500902414321899),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437615394592285),
 ('beagle', 0.7418622374534607),
 ('pup', 0.7406911253929138)]

In [53]:
RedditModel.wv.most_similar('dog')

[('horse', 0.7701406478881836),
 ('pet', 0.770033597946167),
 ('cat', 0.7622345685958862),
 ('baby', 0.76194167137146),
 ('naked', 0.752766489982605),
 ('corpse', 0.7394227981567383),
 ('crocodile', 0.7339233756065369),
 ('penis', 0.7293257713317871),
 ('pig', 0.7262090444564819),
 ('lion', 0.7227792739868164)]

In [54]:
WikiModel.most_similar('dog')

[('dogs', 0.7068554162979126),
 ('rabbits', 0.6879082918167114),
 ('cat', 0.6614759564399719),
 ('cats', 0.6565462350845337),
 ('anthropomorphic', 0.6554036140441895),
 ('t_shirts', 0.6497201919555664),
 ('beast', 0.6489153504371643),
 ('coyote', 0.6422343254089355),
 ('flesh', 0.6411483883857727),
 ('beasts', 0.6408825516700745)]