#횟수 기반 임베딩

카운터 벡터

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is last chance.',
    'and if you do not have this chanc.te.',
    'you will never get any chance',
    'will you do get this one?',
    'please, get this chance'
]
vect = CountVectorizer()
vect.fit(corpus)
vect.vocabulary_

{'this': 15,
 'is': 8,
 'last': 9,
 'chance': 3,
 'and': 0,
 'if': 7,
 'you': 17,
 'do': 4,
 'not': 11,
 'have': 6,
 'chanc': 2,
 'te': 14,
 'will': 16,
 'never': 10,
 'get': 5,
 'any': 1,
 'one': 12,
 'please': 13}

countvectorizer의 적용 결과를 배열로 반환

In [2]:
vect.transform(['you will never get any chance']).toarray()

array([[0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]])

불용어를 제거한 카운터 벡터

In [3]:
vect = CountVectorizer(stop_words = ['and', 'is', 'please', 'this']).fit(corpus)
vect.vocabulary_

{'last': 7,
 'chance': 2,
 'if': 6,
 'you': 13,
 'do': 3,
 'not': 9,
 'have': 5,
 'chanc': 1,
 'te': 11,
 'will': 12,
 'never': 8,
 'get': 4,
 'any': 0,
 'one': 10}

#TF-IDF

TF-IDF를 적용한 후 행렬로 표현

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
doc = ['I like machine learning', 'I love deep learning', 'I run everyday']
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(doc)
doc_distance = (tfidf_matrix * tfidf_matrix.T)

print('유사도를 위한', str(doc_distance.get_shape()[0]), 'x', str(doc_distance.get_shape()[1]), '행렬을 만들었습니다.')
print(doc_distance.toarray())

유사도를 위한 3 x 3 행렬을 만들었습니다.
[[1.       0.224325 0.      ]
 [0.224325 1.       0.      ]
 [0.       0.       1.      ]]


# 예측 기반 임베딩(Word2Vec)

데이터셋을 메모리로 로딩하고 토큰화 적용

In [20]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
sample = open('/peter.txt', 'r', encoding='UTF8') # 피터펜 데이터셋 로딩
s = sample.read()

f = s.replace("\n", " ") #줄바꿈을 공백으로 변환
data = []

for i in sent_tokenize(f):
  temp = []
  for j in word_tokenize(i):
    temp.append(j.lower())
  data.append(temp)

data

[['once',
  'upon',
  'a',
  'time',
  'in',
  'london',
  ',',
  'the',
  'darlings',
  'went',
  'out',
  'to',
  'a',
  'dinner',
  'party',
  'leaving',
  'their',
  'three',
  'children',
  'wendy',
  ',',
  'jhon',
  ',',
  'and',
  'michael',
  'at',
  'home',
  '.'],
 ['after',
  'wendy',
  'had',
  'tucked',
  'her',
  'younger',
  'brothers',
  'jhon',
  'and',
  'michael',
  'to',
  'bed',
  ',',
  'she',
  'went',
  'to',
  'read',
  'a',
  'book',
  '.'],
 ['she', 'heard', 'a', 'boy', 'sobbing', 'outside', 'her', 'window', '.'],
 ['he', 'was', 'flying', '.'],
 ['there', 'was', 'little', 'fairy', 'fluttering', 'around', 'him', '.'],
 ['wendy', 'opened', 'the', 'window', 'to', 'talk', 'to', 'him', '.'],
 ['“', 'hello', '!'],
 ['who', 'are', 'you', '?'],
 ['why', 'are', 'you', 'crying', '”', ',', 'wendy', 'asked', 'him', '.'],
 ['“', 'my', 'name', 'is', 'peter', 'pan', '.'],
 ['my',
  'shadow',
  'wouldn',
  '’',
  't',
  'stock',
  'to',
  'me.',
  '”',
  ',',
  'he',
  'rep

## CBOW

데이터셋에 CBOW 적용 후 'peter'와 'wendy'의 유사성 확인

In [23]:
model1 = gensim.models.Word2Vec(data, min_count = 1, vector_size = 100, window = 5, sg = 0) #sg=0 ->CBOW

print("Cosine similarity between 'peter' 'wendy' - CBOW : ", model1.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - CBOW :  0.074393824


'peter'와 'hook'의 유사성 확인

In [24]:
print("Cosine similarity between 'peter' 'hook' - CBOW : ", model1.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - CBOW :  0.027709836


##Skip-gram

데이터셋에 skip-gram을 적용 후 'peter'와 'wendy'의 유사성 확인

In [26]:
model2 = gensim.models.Word2Vec(data, min_count = 1, vector_size = 100, window = 5, sg = 1) #sg=1 ->skip-gram
print("Cosine similarity between 'peter' 'wendy' - Skip Gram : ", model2.wv.similarity('peter', 'wendy'))

Cosine similarity between 'peter' 'wendy' - Skip Gram :  0.40088683


In [27]:
print("Cosine similarity between 'peter' 'hook' - Skip Gram : ", model2.wv.similarity('peter', 'hook'))

Cosine similarity between 'peter' 'hook' - Skip Gram :  0.5201673


#패스트텍스트

라이브러리 및 데이터 호출

In [28]:
from gensim.test.utils import common_texts
from gensim.models import FastText

model = FastText('/peter.txt', vector_size = 4, window = 3, min_count = 1, epochs = 10)



'peter'와 'wendy'에 대한 코사인 유사도

In [29]:
sim_score = model.wv.similarity('peter', 'wendy')
print(sim_score)

0.4592452


'peter'와 'hook'에 대한 코사인 유사도

In [30]:
sim_score = model.wv.similarity('peter', 'hook')
print(sim_score)

0.043825686
