## Using Dictionary to Embedding

In [5]:
text = "We have to do it right now and it will be yours"

tokens = [x for x in text.split(" ")]
unique = list(set(tokens))


print(tokens, len(tokens))
print(unique, len(unique))

token2idx = {}
for i in range(len(unique)):
    token2idx[unique[i]] = i
encode = [token2idx[x] for x in tokens]

print(token2idx)
print(encode)

['We', 'have', 'to', 'do', 'it', 'right', 'now', 'and', 'it', 'will', 'be', 'yours'] 12
['and', 'We', 'have', 'right', 'now', 'it', 'do', 'yours', 'be', 'to', 'will'] 11
{'and': 0, 'We': 1, 'have': 2, 'right': 3, 'now': 4, 'it': 5, 'do': 6, 'yours': 7, 'be': 8, 'to': 9, 'will': 10}
[1, 2, 9, 6, 5, 3, 4, 0, 5, 10, 8, 7]


## Sparse Representation Based Embedding

### Using One-hot Encoding to Embedding

In [6]:
import numpy as np

one_hot = []
for i in range(len(encode)):
    temp = []
    for j in range(max(encode)):
        if j == (encode[i] - 1):
            temp.append(1)
        else:
            temp.append(0)
    one_hot.append(temp)

np.array(one_hot)

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

## Count Based Embedding

### Counter Vector

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["When I'm away from you, I miss your touch (ooh)",
        "You're the reason I believe in love",
        "It's been difficult for me to trust (ooh)",
        "And I'm afraid that I'ma fuck it up",
        "Ain't no way that I can leave you stranded",
        "'Cause you ain't never left me empty-handed",
        "And you know that I know that I can't live without you"]
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
vectorizer.vocabulary_

{'when': 37,
 'away': 3,
 'from': 11,
 'you': 39,
 'miss': 23,
 'your': 40,
 'touch': 33,
 'ooh': 26,
 're': 27,
 'the': 31,
 'reason': 28,
 'believe': 5,
 'in': 14,
 'love': 20,
 'it': 15,
 'been': 4,
 'difficult': 8,
 'for': 10,
 'me': 22,
 'to': 32,
 'trust': 34,
 'and': 2,
 'afraid': 0,
 'that': 30,
 'ma': 21,
 'fuck': 12,
 'up': 35,
 'ain': 1,
 'no': 25,
 'way': 36,
 'can': 6,
 'leave': 17,
 'stranded': 29,
 'cause': 7,
 'never': 24,
 'left': 18,
 'empty': 9,
 'handed': 13,
 'know': 16,
 'live': 19,
 'without': 38}

### TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
print(tfidf_matrix.shape)
doc_dist = (tfidf_matrix * tfidf_matrix.T)
print(f"유사도 행렬 : {doc_dist.get_shape()[0]} x {doc_dist.get_shape()[1]}")
print(doc_dist.toarray())

(7, 41)
유사도 행렬 : 7 x 7
[[1.         0.04394083 0.09810469 0.         0.04436054 0.04267714
  0.06784273]
 [0.04394083 1.         0.         0.         0.04672673 0.04495354
  0.07146146]
 [0.09810469 0.         1.         0.10687592 0.         0.10036572
  0.        ]
 [0.         0.         0.10687592 1.         0.08355241 0.
  0.21522683]
 [0.04436054 0.04672673 0.         0.08355241 1.         0.15277424
  0.28223309]
 [0.04267714 0.04495354 0.10036572 0.         0.15277424 1.
  0.06940631]
 [0.06784273 0.07146146 0.         0.21522683 0.28223309 0.06940631
  1.        ]]


## Prediction Based Embedding

### Word2Vec (Sentence Tokenizer -> Word Tokenizer)

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings('ignore')
import gensim
from gensim.models import Word2Vec

sample = open('peter.txt', 'r', encoding='utf-8')
s = sample.read()

f = s.replace('\n', ' ')
data = []

for i in sent_tokenize(f):
    temp=[]
    for j in word_tokenize(i):
        temp.append(j.lower())

    data.append(temp)

In [6]:
data

[['once',
  'upon',
  'a',
  'time',
  'in',
  'london',
  ',',
  'the',
  'darlings',
  'went',
  'out',
  'to',
  'a',
  'dinner',
  'party',
  'leaving',
  'their',
  'three',
  'children',
  'wendy',
  ',',
  'jhon',
  ',',
  'and',
  'michael',
  'at',
  'home',
  '.'],
 ['after',
  'wendy',
  'had',
  'tucked',
  'her',
  'younger',
  'brothers',
  'jhon',
  'and',
  'michael',
  'to',
  'bed',
  ',',
  'she',
  'went',
  'to',
  'read',
  'a',
  'book',
  '.'],
 ['she', 'heard', 'a', 'boy', 'sobbing', 'outside', 'her', 'window', '.'],
 ['he', 'was', 'flying', '.'],
 ['there', 'was', 'little', 'fairy', 'fluttering', 'around', 'him', '.'],
 ['wendy', 'opened', 'the', 'window', 'to', 'talk', 'to', 'him', '.'],
 ['“', 'hello', '!'],
 ['who', 'are', 'you', '?'],
 ['why', 'are', 'you', 'crying', '”', ',', 'wendy', 'asked', 'him', '.'],
 ['“', 'my', 'name', 'is', 'peter', 'pan', '.'],
 ['my',
  'shadow',
  'wouldn',
  '’',
  't',
  'stock',
  'to',
  'me.',
  '”',
  ',',
  'he',
  'rep

### Word2Vec (CBOW) -> Context to Target Word

In [10]:
model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=0)
# sg = 0 -> CBOW & sg = 1 -> Skip Gram

print(f"[CBOW] Cosine similarity : {model.wv.similarity('leaving', 'went')}")
print(f"[CBOW] Cosine similarity : {model.wv.similarity('dinner', 'party')}")

[CBOW] Cosine similarity : -0.028390146791934967
[CBOW] Cosine similarity : -0.019526802003383636


### Word2Vec (Skip Gram) -> Target to Context

In [12]:
model = gensim.models.Word2Vec(data, min_count=1, vector_size=100, window=5, sg=1)
# sg = 0 -> CBOW & sg = 1 -> Skip Gram

print(f"[skip-gram] Cosine similarity : {model.wv.similarity('leaving', 'went')}")
print(f"[skip-gram] Cosine similarity : {model.wv.similarity('dinner', 'party')}")

[skip-gram] Cosine similarity : 0.17423781752586365
[skip-gram] Cosine similarity : 0.01690060645341873


### FastText -> Word2Vec 사전에 없는 단어 벡터 값 부여 불가 및 자주 사용되지 않는 단어 학습 불안정

In [15]:
from gensim.test.utils import common_texts
from gensim.models import FastText

model = FastText('peter.txt', vector_size=4, window=3, min_count=1)
sim_score1 = model.wv.similarity('leaving', 'went')
sim_score2 = model.wv.similarity('dinner', 'party')

print(sim_score1)
print(sim_score2)

0.49374622
0.19928932


## Count & Prediction Based Embedding

### GloVe -> Statistical Information + Skip-gram