# Global Vectors for Word Representation, GloVe
- https://wikidocs.net/22885
- Count based word representation과 Prediction based word representation을 모두 사용하는 방법
- Window based Co-occurrence Matrix: i 단어의 윈도우 크기(Window Size) 내에서 k 단어가 등장한 횟수를 i행 k열에 기재한 행렬
- Co-occurrence Probability: P(k | i), 단어 i가 등장했을 때 그 윈도우 안에 단어 k가 함께 등장한 횟수를 단어 i의 전체 동시 등장 횟수로 나눈 조건부 확률
- 목표: 임베딩 된 중심 단어와 주변 단어 벡터의 내적이 전체 코퍼스에서의 동시 등장 확률(정확히는 그 로그값)에 가까워지도록 만드는 것

## Import

In [1]:
# !pip install glove_python

In [1]:
from glove import Corpus, Glove

import re
from lxml import etree
import urllib.request
import zipfile
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
# import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to /home1/irteam/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Load training dataset

In [2]:
# Download the ted_en-20160408 archive into the working directory.
# The cell's value is urlretrieve's (local filename, response headers) pair.
urllib.request.urlretrieve(
    url="https://wit3.fbk.eu/get.php?path=XML_releases/xml/ted_en-20160408.zip&filename=ted_en-20160408.zip",
    filename="ted_en-20160408.zip",
)

('ted_en-20160408.zip', <http.client.HTTPMessage at 0x7f54bfcf7668>)

In [3]:
# Parse the XML member directly out of the downloaded archive.
# Both the archive and the member stream are context-managed, so each handle
# is closed as soon as lxml has finished reading (the original left the
# stream returned by z.open() unclosed).
with zipfile.ZipFile('ted_en-20160408.zip', 'r') as z:
    with z.open('ted_en-20160408.xml', 'r') as xml_file:
        target_text = etree.parse(xml_file)

# The parsed tree is fully in memory, so the join no longer needs the archive
# open: concatenate the text of every <content> element, one per line.
parse_text = '\n'.join(target_text.xpath('//content/text()'))

## Text preprocessing

In [4]:
# Delete every parenthesised span -- "(" up to the next ")" -- from the raw
# text, then show the remaining character count.
content_text = re.compile(r'\([^)]*\)').sub('', parse_text)
len(content_text)

24062319

In [7]:
# Split the cleaned text into sentences with NLTK's sentence tokenizer
# (English is the tokenizer's default language, spelled out here).
sent_text = sent_tokenize(content_text, language="english")
len(sent_text)

273424

In [8]:
# Lower-case each sentence and collapse every run of characters outside
# a-z / 0-9 (punctuation, apostrophes, whitespace, ...) into a single space.
normalized_text = [
    re.sub(r"[^a-z0-9]+", " ", sentence.lower())
    for sentence in sent_text
]
len(normalized_text)

273424

In [9]:
# Turn each normalised sentence into a list of word tokens;
# `result` holds one token list per sentence.
result = list(map(word_tokenize, normalized_text))
len(result)

273424

In [13]:
# Peek at the first five tokenised sentences.
# Contractions show up split ('what', 's' / 'i', 'm') because the
# normalisation step replaced the apostrophe with a space.
result[0:5]

[['here',
  'are',
  'two',
  'reasons',
  'companies',
  'fail',
  'they',
  'only',
  'do',
  'more',
  'of',
  'the',
  'same',
  'or',
  'they',
  'only',
  'do',
  'what',
  's',
  'new'],
 ['to',
  'me',
  'the',
  'real',
  'real',
  'solution',
  'to',
  'quality',
  'growth',
  'is',
  'figuring',
  'out',
  'the',
  'balance',
  'between',
  'two',
  'activities',
  'exploration',
  'and',
  'exploitation'],
 ['both',
  'are',
  'necessary',
  'but',
  'it',
  'can',
  'be',
  'too',
  'much',
  'of',
  'a',
  'good',
  'thing'],
 ['consider', 'facit'],
 ['i', 'm', 'actually', 'old', 'enough', 'to', 'remember', 'them']]

## Get co-occurrence matrix

In [10]:
# Build the glove-python Corpus from the tokenised sentences in `result`.
corpus = Corpus()
# window=5 is the context window passed to glove-python -- presumably the
# maximum token distance that counts as a co-occurrence; confirm in library docs.
corpus.fit(result, window=5)
# Displays only the object's repr; the matrix itself is inspected in the next cell.
corpus

<glove.corpus.Corpus at 0x7f543ccd8320>

In [11]:
# Shape of the fitted co-occurrence matrix. The recorded output is square,
# presumably one row/column per vocabulary word -- confirm against corpus.dictionary.
corpus.matrix.shape

(54775, 54775)

## Train GloVe

In [16]:
# Train 100-dimensional GloVe vectors on the co-occurrence matrix.
# random_state seeds the model's random number generator so that re-running
# the notebook starts from the same initial vectors (the original was unseeded).
# NOTE(review): with no_threads > 1 the parallel updates may still be applied
# in a nondeterministic order -- use no_threads=1 if bit-exact reruns are needed.
glove = Glove(no_components=100, learning_rate=0.05, random_state=42)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
# Attach the corpus dictionary so words can be queried by string (e.g. most_similar).
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


## Predict

In [17]:
# Nearest neighbours of "man" in the trained embedding space
# (number=5 is the library default; the recorded output lists four neighbours).
glove.most_similar("man", number=5)

[('woman', 0.96153452805664),
 ('guy', 0.8923528297497354),
 ('girl', 0.8562141550892345),
 ('young', 0.8449564111199589)]

In [18]:
# Nearest neighbours of "apple" (number=5 is the library default).
# The recorded neighbours ('actual', 'institution', 'elephant', ...) are not
# obviously related to the query word.
glove.most_similar("apple", number=5)

[('actual', 0.8605317603547695),
 ('institution', 0.8522237975944981),
 ('elephant', 0.8311307014216432),
 ('instrument', 0.8276125952599354)]

In [19]:
# Nearest neighbours of "king" (number=5 is the library default);
# the recorded output returns leadership roles such as 'captain' and 'chief'.
glove.most_similar("king", number=5)

[('captain', 0.8740235251753843),
 ('owner', 0.8695533239597025),
 ('chief', 0.8647650381972076),
 ('director', 0.8540623972254942)]

In [20]:
# Nearest neighbours of "clean" (number=5 is the library default);
# the recorded output mixes a near-synonym ('fresh') with co-occurring nouns ('water', 'air').
glove.most_similar("clean", number=5)

[('fresh', 0.8432154639705923),
 ('water', 0.8384026739902367),
 ('heat', 0.8230607669308025),
 ('air', 0.7960530781740318)]

In [21]:
# Nearest neighbours of "virus" (number=5 is the library default);
# the recorded output returns nouns such as 'molecule', 'device' and 'machine'.
glove.most_similar("virus", number=5)

[('molecule', 0.8760577893420111),
 ('device', 0.8713630886539139),
 ('machine', 0.8603706031535792),
 ('procedure', 0.8442879646633301)]