In [1]:
import gensim

In [2]:
# Google drive mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Setup the train and test directories
text_file = "drive/MyDrive/Colab Notebooks/dataset.txt"

print(text_file)

drive/MyDrive/Colab Notebooks/dataset.txt


In [9]:
# read all lines at once
with open(text_file, 'r', encoding= 'unicode_escape') as f:
    lines = f.readlines()
print(lines[2100:2103])

["'But it's never cared,' said Skimmer. 'Up until now. And now it wants to rip the top off the country and take what's underneath, mmph, mmhm.'\n", 'Ah, thought Vimes, our killer clerk does have more than one emotion.\n', "'Ankh-Morpork has always tried to get on well with other nations,' said Sybil. 'Well, these days, at least.'\n"]


In [10]:
# process sentences to tokens
processed_lines = [gensim.utils.simple_preprocess(sentence) for sentence in lines]

In [11]:
print(processed_lines[2100:2105])

[['but', 'it', 'never', 'cared', 'said', 'skimmer', 'up', 'until', 'now', 'and', 'now', 'it', 'wants', 'to', 'rip', 'the', 'top', 'off', 'the', 'country', 'and', 'take', 'what', 'underneath', 'mmph', 'mmhm'], ['ah', 'thought', 'vimes', 'our', 'killer', 'clerk', 'does', 'have', 'more', 'than', 'one', 'emotion'], ['ankh', 'morpork', 'has', 'always', 'tried', 'to', 'get', 'on', 'well', 'with', 'other', 'nations', 'said', 'sybil', 'well', 'these', 'days', 'at', 'least'], ['don', 'think', 'we', 'exactly', 'try', 'dear', 'said', 'vimes', 'it', 'just', 'that', 'we', 'found', 'that', 'why', 're', 'we', 'stopping'], ['he', 'pulled', 'down', 'the', 'window', 'what', 'happening', 'sergeant']]


In [12]:
# create word list from token using utf8 encoding
word_list = [word for words in processed_lines for word in words]

In [13]:
# check the length of the list
print('Length: ', len(word_list))

Length:  101044


In [15]:
# check five words
print(word_list[2100:2105])

['he', 'shouted', 'aw', 'must', 'we']


In [16]:
model = gensim.models.Word2Vec(
    [word_list],
    negative = 10, # negative sampling how many "noise words" should be drawn
    iter = 100,
    min_count = 1, # ignores all words with total frequency lower than this
    window = 7, # maximum distance between the current and predicted word
    size = 40 # dimension of the word vector
    )

In [18]:
print(model.wv['he'])

[ 0.09000594  2.3026462  -1.6785553  -0.56642425  0.9963058   0.7567505
 -0.28026736  1.0093896  -0.17157541 -0.90932167  0.35123393 -0.97855073
 -0.5086288   1.0880272  -0.0650894  -0.12275539  0.5658572   1.7014415
 -0.71361613  1.6612295  -1.59704    -0.17297661  0.5013771   1.3327591
  0.5883837   0.25945544 -1.8313568   0.26079226 -0.8424301   3.0796604
 -1.3353548   1.9789115  -0.4807216  -1.8339282   0.00654534  0.96218735
  0.7048919   1.9878603   0.08331054 -1.1993016 ]


In [23]:
print(model.wv['and'])

[ 1.5329118   0.74594086 -1.6432936  -1.1940209   0.5076787   0.89406383
  0.12276209 -0.06524452  0.09255071  1.5539902   0.8052821   0.85143703
  0.794774   -0.09461673 -0.1700565  -1.09236    -0.411806    1.2547437
  1.3144072   0.8496525  -0.00752541 -1.4032689   0.608398   -0.50958747
 -1.530054    0.17319436 -0.74233454 -0.74684167  1.1676917   1.7259591
 -1.2296201   1.3267307  -0.1219243  -0.8517497  -1.0871166  -1.1684201
 -1.7563635   0.8421795   1.3309423  -0.4744256 ]


In [24]:
print(model.wv.most_similar('elephant', topn=5))  # get similar words

[('fifth', 0.9847407341003418), ('angry', 0.966874897480011), ('millions', 0.9501312375068665), ('tons', 0.9499959945678711), ('whalebone', 0.9475234150886536)]


In [25]:
model.wv.similarity('mrs', 'she')

0.3877531

In [28]:
model.wv.similarity('sir', 'elephant')

0.14341593

In [29]:
model.wv.similarity('elephant', 'elephant')

1.0

In [30]:
model.most_similar(positive=['woman', 'king'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('principalities', 0.8147658109664917),
 ('wave', 0.8001747131347656),
 ('rhysson', 0.7934685945510864),
 ('senior', 0.7921526432037354),
 ('rhys', 0.7842260003089905),
 ('frothy', 0.7837520241737366),
 ('engineer', 0.7815768718719482),
 ('daughters', 0.7735103368759155),
 ('prosperity', 0.7728802561759949),
 ('though', 0.7693667411804199)]

In [33]:
model.most_similar_cosmul(positive=['she', 'sir'], negative=['he'])

  """Entry point for launching an IPython kernel.


[('check', 1.0196564197540283),
 ('flint', 0.9843168258666992),
 ('elegant', 0.9775308966636658),
 ('firepower', 0.9583713412284851),
 ('permission', 0.9550015926361084),
 ('whine', 0.9268373847007751),
 ('upgrade', 0.9090000987052917),
 ('assisted', 0.8977051377296448),
 ('nudging', 0.8919122219085693),
 ('pooled', 0.888483464717865)]

**SpaCy**

In [41]:
import spacy

In [47]:
sp = spacy.load('en_core_web_sm')
nlp = spacy.load('en')

In [48]:
doc = nlp("""But it's never cared,' said Skimmer. 'Up until now. And now it wants to rip the top off the country and take what's underneath, mmph, mmhm.""")

In [50]:
print([w.text for w in doc])

['But', 'it', "'s", 'never', 'cared', ',', "'", 'said', 'Skimmer', '.', "'", 'Up', 'until', 'now', '.', 'And', 'now', 'it', 'wants', 'to', 'rip', 'the', 'top', 'off', 'the', 'country', 'and', 'take', 'what', "'s", 'underneath', ',', 'mmph', ',', 'mmhm', '.']


In [52]:
for token in doc:
  print(token.text, token.lemma_)

But but
it -PRON-
's be
never never
cared care
, ,
' '
said say
Skimmer Skimmer
. .
' '
Up up
until until
now now
. .
And and
now now
it -PRON-
wants want
to to
rip rip
the the
top top
off off
the the
country country
and and
take take
what what
's be
underneath underneath
, ,
mmph mmph
, ,
mmhm mmhm
. .


In [53]:
for token in doc:
  if token.ent_type != 0:
    print(token.text, token.ent_type_)

Skimmer PERSON


In [54]:
import pandas as pd
result_list = []
for token in doc:
  result_list.append([token.text, token.pos_, token.tag_, token.dep_])

df = pd.DataFrame(result_list, columns=['TEXT', 'POS_', 'TAG_', 'DEP_'])
df

Unnamed: 0,TEXT,POS_,TAG_,DEP_
0,But,CCONJ,CC,cc
1,it,PRON,PRP,nsubjpass
2,'s,AUX,VBZ,auxpass
3,never,ADV,RB,neg
4,cared,VERB,VBN,ccomp
5,",",PUNCT,",",punct
6,',PUNCT,'',punct
7,said,VERB,VBD,ROOT
8,Skimmer,PROPN,NNP,nsubj
9,.,PUNCT,.,punct


In [72]:
Dup_Rows = df[df.duplicated()]
Dup_Rows

Unnamed: 0,TEXT,POS_,TAG_,DEP_
10,',PUNCT,'',punct
14,.,PUNCT,.,punct
24,the,DET,DT,det
31,",",PUNCT,",",punct
33,",",PUNCT,",",punct
35,.,PUNCT,.,punct


In [74]:
Dup_Rows.to_excel('drive/MyDrive/Colab Notebooks/lesson_21.xlsx')

In [73]:
Dup_Rows.to_excel('C:\\ML\\Lesson_21\\lesson_21.xlsx')