## Word Embedding Models Comparison

In [1]:
import pandas as pd
import nltk
import numpy
from gensim.models import Word2Vec, KeyedVectors, FastText



In [2]:
# Load the FastText vectors stored in word2vec text format.
# NOTE(review): FastModel is not referenced by any later cell visible here —
# confirm it is still needed before paying the load cost.
FastModel = KeyedVectors.load_word2vec_format('FastText.vec', binary=False)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Model 1 - Large Pretrained Google News bin

In [3]:
# Load the Google News vectors from the binary word2vec file.
GoogleModel = KeyedVectors.load_word2vec_format(
    fname='Google.bin',
    binary=True,
)

### Testing the model

In [4]:
# Sanity check: the ten nearest neighbours of "girl".
GoogleModel.most_similar(positive=['girl'], topn=10)

[('boy', 0.8543272018432617),
 ('teenage_girl', 0.7927976250648499),
 ('woman', 0.7494640946388245),
 ('teenager', 0.717249870300293),
 ('schoolgirl', 0.7075953483581543),
 ('teenaged_girl', 0.6650916337966919),
 ('daughter', 0.6489864587783813),
 ('mother', 0.6478164196014404),
 ('toddler', 0.6473966836929321),
 ('girls', 0.6154742240905762)]

Placing the results of `most_similar` into a DataFrame

In [6]:
# Fetch the 30 nearest neighbours of "man", then keep only the neighbours that
# contain no underscore and do not contain the query word itself.
word = ["man"]
string = ' '.join(word).lower()
manList = GoogleModel.most_similar(positive=word, topn=30)
dfman = pd.DataFrame(manList, columns=['Most Similar', 'Vector Accuracy'])
# Lower-case only the word column; the similarity column stays numeric
# instead of being cast to str along with it.
dfman['Most Similar'] = dfman['Most Similar'].str.lower()
# regex=False: both filters are literal substring checks, so the query word
# is never interpreted as a regular expression.
is_phrase = dfman['Most Similar'].str.contains("_", regex=False)
has_query = dfman['Most Similar'].str.contains(string, regex=False)
dfman = dfman[~(is_phrase | has_query)]
print(dfman['Most Similar'].head(10))

1              boy
2         teenager
4             girl
6           robber
9              men
11             guy
12          person
15    motorcyclist
21         suspect
24          victim
Name: Most Similar, dtype: object


Getting the most similar words from a list of words using a loop

In [7]:
# For each name, print the header line and then its ten nearest neighbours.
words = ["Trump", "Obama", "Clinton"]
for word in words:
    print("-- Word: %s" % (word,))
    result = GoogleModel.most_similar(positive=[word], topn=10)
    dfloop = pd.DataFrame(
        data=result,
        columns=['Most Similar', 'Vector Accuracy'],
    )
    # Only the neighbour words are shown; the scores stay in dfloop.
    print(dfloop['Most Similar'])

-- Word: Trump
0                Donald_Trump
1    impersonator_entertained
2                Ivanka_Trump
3                      Ivanka
4          mogul_Donald_Trump
5                 Trump_Tower
6                     Kepcher
7    billionaire_Donald_Trump
8                   Trumpster
9         tycoon_Donald_Trump
Name: Most Similar, dtype: object
-- Word: Obama
0              Barack_Obama
1    President_Barack_Obama
2                    McCain
3                   Clinton
4          Illinois_senator
5                     Biden
6                      Bush
7                    Barack
8               White_House
9        elect_Barack_Obama
Name: Most Similar, dtype: object
-- Word: Clinton
0            Hillary_Clinton
1                      Obama
2               Bill_Clinton
3     Hillary_Rodham_Clinton
4       Sen._Hillary_Clinton
5                    Hillary
6    Senator_Hillary_Clinton
7                     McCain
8                   Clintons
9               Barack_Obama
Name: Most Simi

In [8]:
# Vector arithmetic on the raw embeddings: music - piano + drum.
vec2 = GoogleModel['music'] - GoogleModel['piano'] + GoogleModel['drum']
# NOTE(review): the query is a raw vector, and the recorded output lists the
# input words 'drum' and 'music' among the neighbours — confirm that is intended.
GoogleModel.most_similar(positive=[vec2])

[('drum', 0.6940563321113586),
 ('drumming', 0.5132456421852112),
 ('music', 0.5001342296600342),
 ('Drum', 0.40797561407089233),
 ('hip_hop', 0.4001891016960144),
 ('bands', 0.3928840160369873),
 ('bhangra', 0.3885735273361206),
 ('reggae_dancehall', 0.38298097252845764),
 ('reggae', 0.38292473554611206),
 ('reggae_hip_hop', 0.3807390630245209)]

## Model 2 - Self Trained Reddit Model

Reading a medium-sized CSV into a DataFrame and separating out the 'title' column, which is used to train the model.

In [9]:
# Read the Reddit news CSV and preview the first five rows.
dfReddit = pd.read_csv(filepath_or_buffer='RedditNews.csv')
dfReddit.head(n=5)

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,subreddit
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [10]:
# Keep only the headline text; it is the training corpus for the Reddit model.
newTitles = dfReddit.loc[:, "title"]
# Show (rows, columns) of the full frame.
dfReddit.shape

(509236, 8)

Tokenize each title in `newTitles` and show the first 10 tokenized titles.

In [11]:
# Split every title into a list of tokens (one token list per title).
newsVec = list(map(nltk.word_tokenize, newTitles))
newsVec[:10]

[['Scores', 'killed', 'in', 'Pakistan', 'clashes'],
 ['Japan', 'resumes', 'refuelling', 'mission'],
 ['US', 'presses', 'Egypt', 'on', 'Gaza', 'border'],
 ['Jump-start', 'economy', ':', 'Give', 'health', 'care', 'to', 'all'],
 ['Council', 'of', 'Europe', 'bashes', 'EU', '&', 'UN', 'terror', 'blacklist'],
 ['Hay',
  'presto',
  '!',
  'Farmer',
  'unveils',
  'the',
  'illegal',
  'mock-Tudor',
  'castle',
  'he',
  'tried',
  'to',
  'hide',
  'behind',
  '40ft',
  'hay',
  'bales'],
 ['Strikes',
  ',',
  'Protests',
  'and',
  'Gridlock',
  'at',
  'the',
  'Poland-Ukraine',
  'Border'],
 ['The', 'U.N', '.', 'Mismanagement', 'Program'],
 ['Nicolas', 'Sarkozy', 'threatens', 'to', 'sue', 'Ryanair'],
 ['US',
  'plans',
  'for',
  'missile',
  'shields',
  'in',
  'Polish',
  'town',
  'met',
  'with',
  'resistance',
  '[',
  'video',
  ']']]

### Using Word2Vec, assign each word in the model a vector of size 100

In [12]:
# Train a Word2Vec model on the tokenized titles with 100-dimensional vectors.
# min_count=1 sets the frequency cut-off to a single occurrence.
# NOTE(review): no seed is passed — confirm whether repeatable vectors matter here.
# NOTE(review): `size` is the gensim 3.x spelling; verify against the installed
# gensim version before upgrading.
RedditModel = Word2Vec(
    sentences=newsVec,
    size=100,
    min_count=1,
    workers=6,
)

In [13]:
# Ten nearest neighbours of "man" in the self-trained Reddit model.
RedditModel.wv.most_similar(positive=['man'], topn=10)

[('woman', 0.9013823866844177),
 ('teenager', 0.8415164947509766),
 ('boy', 0.810286819934845),
 ('girl', 0.808212399482727),
 ('couple', 0.7884554862976074),
 ('doctor', 0.7665772438049316),
 ('mother', 0.7430440187454224),
 ('teacher', 0.7342153787612915),
 ('policeman', 0.7302061319351196),
 ('teen', 0.7246348261833191)]

## Model 3 - Small Wiki Dump bin

In [14]:
# Load the Wiki vectors from the binary word2vec file.
WikiModel = KeyedVectors.load_word2vec_format(
    fname='wiki.bin',
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [15]:
# Top-30 neighbours of "girl" in the Wiki model, shown as a table.
# The query runs once: a bare most_similar call that is not the cell's last
# expression displays nothing, so the earlier duplicate call was wasted work.
# The name `manList` is reused from the earlier "man" cell and now holds the
# neighbours of "girl".
manList = WikiModel.most_similar(positive=['girl'], topn=30)
df = pd.DataFrame(manList, columns=['Most Similar', 'Vector Accuracy'])
print(df.head(10))

  Most Similar  Vector Accuracy
0          boy         0.712060
1      destiny         0.702904
2         love         0.694018
3       lovers         0.688722
4        sorry         0.682142
5      teenage         0.673685
6       beasts         0.667998
7        mercy         0.659093
8        hello         0.657588
9     restless         0.656618


## Simple Comparisons of the 3 Models

In [16]:
# Google News model: ten nearest neighbours of "dog".
GoogleModel.most_similar(positive=['dog'], topn=10)

[('dogs', 0.8680489659309387),
 ('puppy', 0.8106428384780884),
 ('pit_bull', 0.780396044254303),
 ('pooch', 0.7627377510070801),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500902414321899),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437614798545837),
 ('beagle', 0.7418621778488159),
 ('pup', 0.740691065788269)]

In [17]:
# Reddit model: ten nearest neighbours of "dog".
RedditModel.wv.most_similar(positive=['dog'], topn=10)

[('cat', 0.7921118140220642),
 ('pig', 0.7833889126777649),
 ('baby', 0.7613813281059265),
 ('horse', 0.7580305337905884),
 ('tiger', 0.7519763708114624),
 ('pet', 0.7497408390045166),
 ('crocodile', 0.7390192747116089),
 ('penis', 0.738898754119873),
 ('bullet', 0.7388013005256653),
 ('naked', 0.7386067509651184)]

In [18]:
# Wiki model: ten nearest neighbours of "dog".
WikiModel.most_similar(positive=['dog'], topn=10)

[('dogs', 0.7068554162979126),
 ('rabbits', 0.687908411026001),
 ('cat', 0.6614758968353271),
 ('cats', 0.6565461158752441),
 ('anthropomorphic', 0.6554036736488342),
 ('t_shirts', 0.6497201919555664),
 ('beast', 0.6489152908325195),
 ('coyote', 0.6422343850135803),
 ('flesh', 0.6411483883857727),
 ('beasts', 0.6408825516700745)]

In [19]:
# Side-by-side comparison: the nearest neighbours of one word in all three
# models, one column per model. Each cell holds a (word, similarity) tuple.
word = 'farm'
result1 = GoogleModel.most_similar(positive=word)
# Query the trained model through `.wv`, as the other Reddit cells do; the
# model-level `most_similar` is deprecated.
result2 = RedditModel.wv.most_similar(positive=word)
result3 = WikiModel.most_similar(positive=word)
# Build the table directly instead of creating placeholder columns and then
# dropping them (the positional-axis form of `drop` is also deprecated).
dftable = pd.DataFrame({
    'Most Similar Google': result1,
    'Most Similar Reddit': result2,
    'Most Similar Wiki': result3,
})
print(dftable)


                  Most Similar Google                Most Similar Reddit  \
0         (farms, 0.7599002122879028)        (farms, 0.7457308769226074)   
1    (dairy_farm, 0.7567876577377319)         (pool, 0.7318962216377258)   
2       (farming, 0.7305764555931091)  (supermarket, 0.7294552326202393)   
3        (farmer, 0.7109191417694092)         (mine, 0.7275694012641907)   
4  (DUANE_HOWELL, 0.6612095832824707)       (copper, 0.7254145741462708)   
5          (Farm, 0.6409205198287964)         (gold, 0.7237563133239746)   
6  (agricultural, 0.6379073262214661)        (coffee, 0.721081018447876)   
7  (dairy_farmer, 0.6344161033630371)        (sewage, 0.714181661605835)   
8       (farmers, 0.6342907547950745)        (steel, 0.6917004585266113)   
9   (v._Sos_prov, 0.6199461221694946)       (forest, 0.6916866302490234)   

                  Most Similar Wiki  
0  (plantation, 0.6966841220855713)  
1   (farmhouses, 0.683635950088501)  
2       (farms, 0.6830114722251892)  
3     (farm

  This is separate from the ipykernel package so we can avoid doing imports until
