### Initialization

In [None]:
pip install glove-python-binary



In [None]:
from glove import Corpus, Glove
import pandas as pd

In [None]:
# Corpus object from the glove package; it is populated further down by
# corpus.fit(...) once the sentences have been tokenized.
corpus = Corpus() 

In [None]:
# header=None tells pandas not to treat the first row as column names, so the
# columns get integer labels (0, 1, 2 in the preview below).
# NOTE: despite its name, `line` is the whole DataFrame, not a single line.
line = pd.read_csv('/content/train.csv',header=None)
# Preview the first 100 rows of the frame.
line.head(100)

Unnamed: 0,0,1,2
0,,it's suliban.,スリバン人です
1,0.0,nothing thrills me more than to see kids polli...,生徒がお互いの受精じゃなくて 植物の受粉に熱中してくれてるよ!
2,1.0,do you want to spend all night at the cemetery...,この雨の中 一晩中 墓地にいたい?
3,2.0,our ships are in attack position.,船はもう攻撃発起位置にある。
4,3.0,what about the original sam? huh?,オリジナルの サムが居るんだぞ
...,...,...,...
95,94.0,rose.,ローズ!
96,95.0,"and if i see you flinch, you're out.",もしお前がだじろだら、脱落だ
97,96.0,give me my daughters back.,娘達を返して欲しい
98,97.0,why is it that when we had rotary phones,なぜ 電話機がダイヤル式だったり


In [None]:
# Pull the sentences out of column 1 as a plain Python list.
# Missing cells (NaN) are dropped and every remaining value is coerced to str,
# so the later `.split()` calls cannot fail on a float or other non-string value.
lines = line[1].dropna().astype(str).tolist()
print(len(lines))

2694511


### Removing the Stop Words

In [None]:

import string

import nltk
from nltk.corpus import stopwords

# Load the English stop-word list. On a fresh environment the NLTK corpus is
# not installed yet, so download it once and retry instead of failing.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# Drop stop words from every sentence.
# Leading/trailing punctuation is stripped from each token first: otherwise
# tokens such as "it," or "you?" never match the stop-word set, and "rain?" /
# "rain" end up as two different vocabulary entries.
# The loop variable is deliberately NOT called `line`, so the DataFrame named
# `line` stays intact and the earlier cells remain re-runnable.
lines_without_stopwords = []
for sentence in lines:
    kept_words = []
    for raw_word in sentence.split():
        word = raw_word.strip(string.punctuation)
        # `word` is empty when the token was pure punctuation (e.g. "-" or "...").
        if word and word not in stop_words:
            kept_words.append(word)
    lines_without_stopwords.append(' '.join(kept_words))

print(lines_without_stopwords[:10])

['suliban.', 'nothing thrills see kids pollinating plants instead other.', 'want spend night cemetery rain?', 'ships attack position.', 'original sam? huh?', 'even though, like it, quit job!', 'goku continued journey alone, became steadily stronger!', "begin emergency captains' meeting!", 'words, lights projectors', 'funny, david. hell you?']


### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Needs the NLTK 'wordnet' corpus: run nltk.download('wordnet') once per environment.
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize every whitespace-separated token of each stop-word-filtered
# sentence and glue the lemmas back together with single spaces.
lines_with_lemmas = [
    ' '.join(wordnet_lemmatizer.lemmatize(token) for token in sentence.split())
    for sentence in lines_without_stopwords
]

# From here on `lines` refers to the lemmatized sentences.
lines = lines_with_lemmas
print(lines[:10])

['suliban.', 'nothing thrill see kid pollinating plant instead other.', 'want spend night cemetery rain?', 'ship attack position.', 'original sam? huh?', 'even though, like it, quit job!', 'goku continued journey alone, became steadily stronger!', "begin emergency captains' meeting!", 'words, light projector', 'funny, david. hell you?']


In [None]:
# Turn each sentence into its list of whitespace-separated tokens.
final_lines = [sentence.split() for sentence in lines]

In [None]:
# Spot-check three of the tokenized sentences (indices 2, 3 and 4).
final_lines[2:5]

[['want', 'spend', 'night', 'cemetery', 'rain?'],
 ['ship', 'attack', 'position.'],
 ['original', 'sam?', 'huh?']]

### Train GloVe model


In [None]:
# Fit the corpus on the token lists; the training cell below reads the
# resulting corpus.matrix and corpus.dictionary.
corpus.fit(final_lines, window=4)

In [None]:
import os
import time

# Training hyper-parameters, gathered in one place so they are easy to tune.
N_COMPONENTS = 5
LEARNING_RATE = 0.05
EPOCHS = 100
# Seed for the embedding initialisation so reruns start from the same state.
SEED = 42
# Use the cores this machine actually has instead of a hard-coded 32 threads.
N_THREADS = os.cpu_count() or 1

print("The time used to execute this is given below")

# perf_counter() is a monotonic clock intended for measuring elapsed intervals.
start = time.perf_counter()

glove = Glove(no_components=N_COMPONENTS, learning_rate=LEARNING_RATE, random_state=SEED)

glove.fit(corpus.matrix, epochs=EPOCHS, no_threads=N_THREADS, verbose=True)
# Attach the word -> id mapping so vectors can be looked up by word.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')

end = time.perf_counter()

print('\nruning time:',end - start,' s')

The time used to execute this is given below
Performing 100 training epochs with 32 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29
Epoch 30
Epoch 31
Epoch 32
Epoch 33
Epoch 34
Epoch 35
Epoch 36
Epoch 37
Epoch 38
Epoch 39
Epoch 40
Epoch 41
Epoch 42
Epoch 43
Epoch 44
Epoch 45
Epoch 46
Epoch 47
Epoch 48
Epoch 49
Epoch 50
Epoch 51
Epoch 52
Epoch 53
Epoch 54
Epoch 55
Epoch 56
Epoch 57
Epoch 58
Epoch 59
Epoch 60
Epoch 61
Epoch 62
Epoch 63
Epoch 64
Epoch 65
Epoch 66
Epoch 67
Epoch 68
Epoch 69
Epoch 70
Epoch 71
Epoch 72
Epoch 73
Epoch 74
Epoch 75
Epoch 76
Epoch 77
Epoch 78
Epoch 79
Epoch 80
Epoch 81
Epoch 82
Epoch 83
Epoch 84
Epoch 85
Epoch 86
Epoch 87
Epoch 88
Epoch 89
Epoch 90
Epoch 91
Epoch 92
Epoch 93
Epoch 94
Epoch 95
Epoch 96
Epoch 97
Epoch 98
Epoch 99

runing time: 350.

### Result

In [None]:
# Show the learned embedding of a few probe words, one vector per line.
for probe_word in ('ice', 'solid', 'water', 'steam', 'gas'):
    print(glove.word_vectors[glove.dictionary[probe_word]])

[-0.49698661  0.17355897 -0.8763322   0.18034416  0.55585528]
[ 0.02113601 -0.01558317 -0.12137971  0.06967532 -0.09278328]
[ 0.43206666 -0.17013498 -0.50695423 -0.95981384 -0.03750297]
[-0.06227461  0.00831281  0.16673608 -0.13151228  0.08683154]
[ 0.43870039 -0.37934762  0.0171744  -0.48198384 -0.1782471 ]


In [None]:
from scipy import spatial

# Probe words that are compared against each other.
l = ['ice','solid','water','steam','gas','fashion','random']

print('cosine simility: ')
# Visit every unordered pair once: each word is paired with the words after it.
for first_pos in range(len(l)):
    first_word = l[first_pos]
    for second_word in l[first_pos + 1:]:
        first_vec = glove.word_vectors[glove.dictionary[first_word]]
        second_vec = glove.word_vectors[glove.dictionary[second_word]]
        # scipy returns the cosine *distance*; the similarity is 1 minus that.
        result = 1 - spatial.distance.cosine(first_vec, second_vec)
        print(first_word,'--',second_word,': ',result)


cosine simility: 
ice -- solid :  0.2705795944977627
ice -- water :  0.004355012130448754
ice -- steam :  -0.3183679732926483
ice -- gas :  -0.5313199647848101
ice -- fashion :  0.1360633464799471
ice -- random :  0.4552037341709345
solid -- water :  0.04942093992153218
solid -- steam :  -0.9622865744014089
solid -- gas :  -0.029946270613192993
solid -- fashion :  0.7998214022669012
solid -- random :  0.8012909588551248
water -- steam :  0.03602476060615556
water -- gas :  0.7806190152296062
water -- fashion :  0.14257311484257207
water -- random :  0.44559489110177686
steam -- gas :  0.11010323317709869
steam -- fashion :  -0.8829763831064166
steam -- random :  -0.8232132609166667
gas -- fashion :  0.22507281431887904
gas -- random :  0.24597132228986784
fashion -- random :  0.8709398499141391
