In [103]:
import pandas as pd
import numpy as np
import gensim
import re

Downloading the Twitts dataset from my own Github repo

In [104]:
ds = pd.read_csv("https://github.com/FarshadAmiri/Learning-NLP/blob/main/Word%20Vectors%20(Embeddings)/Word2Vec%20on%20Twitts/train.csv?raw=true")
ds = ds.Text
ds_org = ds
ds

0         Reuters - Short-sellers, Wall Street's dwindli...
1         Reuters - Private investment firm Carlyle Grou...
2         Reuters - Soaring crude prices plus worries\ab...
3         Reuters - Authorities have halted oil export\f...
4         AFP - Tearaway world oil prices, toppling reco...
                                ...                        
119995     KARACHI (Reuters) - Pakistani President Perve...
119996    Red Sox general manager Theo Epstein acknowled...
119997    The Miami Dolphins will put their courtship of...
119998    PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...
119999    INDIANAPOLIS -- All-Star Vince Carter was trad...
Name: Text, Length: 120000, dtype: object

Pre-processing uisng Gensim tools

In [105]:
crp = ds.apply(gensim.utils.simple_preprocess)

In [106]:
crp

0         [reuters, short, sellers, wall, street, dwindl...
1         [reuters, private, investment, firm, carlyle, ...
2         [reuters, soaring, crude, prices, plus, worrie...
3         [reuters, authorities, have, halted, oil, expo...
4         [afp, tearaway, world, oil, prices, toppling, ...
                                ...                        
119995    [karachi, reuters, pakistani, president, perve...
119996    [red, sox, general, manager, theo, epstein, ac...
119997    [the, miami, dolphins, will, put, their, court...
119998    [pittsburgh, at, ny, giants, time, line, steel...
119999    [indianapolis, all, star, vince, carter, was, ...
Name: Text, Length: 120000, dtype: object

Training the model

In [107]:
model = gensim.models.Word2Vec(
    window = 10,
    min_count = 2,
    workers = 4
)

In [108]:
model.build_vocab(crp, progress_per = 1000)

In [109]:
model.train(crp, total_examples=model.corpus_count, epochs = 5)

(14849745, 17922940)

Testing

In [110]:
model.wv.most_similar('milk')

[('metals', 0.6721031069755554),
 ('industrial', 0.6686393022537231),
 ('steel', 0.6660748720169067),
 ('copper', 0.6623167991638184),
 ('deliveries', 0.6513888239860535),
 ('sabmiller', 0.639258861541748),
 ('gasoline', 0.6369499564170837),
 ('unilever', 0.6340436339378357),
 ('commodity', 0.6328487396240234),
 ('supply', 0.6320254802703857)]

In [111]:
print(model.wv.similarity('computer', 'carpet'))
print(model.wv.similarity('wood', 'space'))
print(model.wv.similarity('car', 'bicycle'))
print(model.wv.similarity('iran', 'tehran'))
print(model.wv.similarity('school', 'student'))

0.059948467
-0.085951276
0.4196517
0.94159776
0.5775516


**Continue training the model on new data**

These new files are in txt format, so we should do some additional preprocessing before passing them to the model

In [112]:
import urllib.request

url = 'https://github.com/FarshadAmiri/Learning-NLP/raw/main/Word%20Vectors%20(Embeddings)/Word2Vec%20on%20Twitts/english-web.txt'
req = urllib.request.Request(url)
ds2 = urllib.request.urlopen(req)

url2 = 'https://github.com/FarshadAmiri/Learning-NLP/raw/main/Word%20Vectors%20(Embeddings)/Word2Vec%20on%20Twitts/english-kjv.txt'
req2 = urllib.request.Request(url)
ds3 = urllib.request.urlopen(req)

In [113]:
crp2 = []
for line in ds2:
  crp2.append(line)
  
for line in ds3:
  crp2.append(line)

In [114]:
crp2

[b'In the beginning God created the heavens and the earth.\n',
 b'Now the earth was formless and empty.  Darkness was on the surface\n',
 b"of the deep.  God's Spirit was hovering over the surface\n",
 b'of the waters.\n',
 b'God said, "Let there be light," and there was light.\n',
 b'God saw the light, and saw that it was good.  God divided\n',
 b'the light from the darkness.\n',
 b'God called the light Day, and the darkness he called Night.\n',
 b'There was evening and there was morning, one day.\n',
 b'God said, "Let there be an expanse in the middle of the waters,\n',
 b'and let it divide the waters from the waters."\n',
 b'God made the expanse, and divided the waters which were under\n',
 b'the expanse from the waters which were above the expanse;\n',
 b'and it was so.\n',
 b'God called the expanse sky.  There was evening and there\n',
 b'was morning, a second day.\n',
 b'God said, "Let the waters under the sky be gathered together\n',
 b'to one place, and let the dry land appear;

In [115]:
# Convert bytes like objects to strings
for index, line in enumerate(crp2):
    crp2[index] = line.decode('utf-8')

crp2

['In the beginning God created the heavens and the earth.\n',
 'Now the earth was formless and empty.  Darkness was on the surface\n',
 "of the deep.  God's Spirit was hovering over the surface\n",
 'of the waters.\n',
 'God said, "Let there be light," and there was light.\n',
 'God saw the light, and saw that it was good.  God divided\n',
 'the light from the darkness.\n',
 'God called the light Day, and the darkness he called Night.\n',
 'There was evening and there was morning, one day.\n',
 'God said, "Let there be an expanse in the middle of the waters,\n',
 'and let it divide the waters from the waters."\n',
 'God made the expanse, and divided the waters which were under\n',
 'the expanse from the waters which were above the expanse;\n',
 'and it was so.\n',
 'God called the expanse sky.  There was evening and there\n',
 'was morning, a second day.\n',
 'God said, "Let the waters under the sky be gathered together\n',
 'to one place, and let the dry land appear;" and it was so.\n

In [116]:
crp2_m = pd.Series([])

for line in crp2:
  crp2_m = crp2_m.append(pd.Series(line))

crp2_m

  """Entry point for launching an IPython kernel.


0    In the beginning God created the heavens and t...
0    Now the earth was formless and empty.  Darknes...
0    of the deep.  God's Spirit was hovering over t...
0                                     of the waters.\n
0    God said, "Let there be light," and there was ...
                           ...                        
0    which he swore to Abraham, to Isaac, and to Ja...
0    Joseph took an oath of the children of Israel,...
0    surely visit you, and you shall carry up my bo...
0    So Joseph died, being one hundred ten years ol...
0    embalmed him, and he was put in a coffin in Eg...
Length: 7388, dtype: object

In [117]:
crp2_ppcd = crp2_m.apply(gensim.utils.simple_preprocess)

model.build_vocab(crp2_ppcd, update=True)


In [118]:
model.train(crp2_ppcd, total_examples=model.corpus_count, epochs = 5)

(234341, 352500)

In [119]:
# To compare with primary result
print(model.wv.similarity('computer', 'carpet'))
print(model.wv.similarity('wood', 'space'))
print(model.wv.similarity('car', 'bicycle'))
print(model.wv.similarity('iran', 'tehran'))
print(model.wv.similarity('school', 'student'))

0.059948467
0.05029596
0.4196517
0.94159776
0.5775516


In [126]:
print(model.wv.similarity('milk','cow'))
print(model.wv.similarity('good', 'bad'))
print(model.wv.similarity('good','best'))
print(model.wv.similarity('bicycle','car'))
print(model.wv.similarity('desk','bird'))

0.08474127
0.79787225
0.53889495
0.4196517
-0.06851443
