In [13]:
import os
import pandas as pd
import numpy as np
import math
import re
import nltk
from sklearn.datasets import load_files
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from gensim.models import Word2Vec, FastText

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\82109\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\82109\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\82109\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\82109\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Load data

In [4]:
# Directory that holds the news-script text files
# (presumably one file per year-month, judging by the file names).
data_path = "../data/news_script_integrated/"

# File names in lexicographic order, so "202010.txt" sorts before "20202.txt".
news_list = os.listdir(data_path)
news_list.sort()
news_list

['20201.txt',
 '202010.txt',
 '202011.txt',
 '20202.txt',
 '20203.txt',
 '20204.txt',
 '20205.txt',
 '20206.txt',
 '20207.txt',
 '20208.txt',
 '20209.txt']

## Text preprocessing

In [34]:
stemmer = WordNetLemmatizer()

# Built once instead of on every call: the stop-word list never changes and
# text_preprocess runs for every line of the corpus.
ENGLISH_STOPS = frozenset(stopwords.words('english'))


def text_preprocess(text):
    """Normalise one raw piece of text into a space-separated string of tokens.

    Steps: replace non-word characters with spaces, drop stand-alone single
    letters, collapse whitespace, lowercase, remove English stop words,
    lemmatize each remaining word with ``stemmer`` (a WordNetLemmatizer called
    with its default part of speech), then keep only purely alphabetic tokens
    longer than one character.

    Parameters
    ----------
    text : str
        Raw input text (the notebook passes one line of a news script).

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' if nothing survives.
    """
    # Replace every non-word character (punctuation, symbols) with a space.
    document = re.sub(r'\W', ' ', text)

    # Remove single letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single letter at the very start of the text. The anchor must be
    # an unescaped '^'; the previous pattern r'\^...' searched for a literal
    # caret, which cannot occur here because '^' was already replaced above.
    # This also strips a stray leading 'b' (e.g. left over from a bytes repr),
    # so no separate rule for that prefix is needed.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Collapse runs of whitespace into a single space.
    document = re.sub(r'\s+', ' ', document)

    # Lowercase so stop-word matching and the vocabulary are case-insensitive.
    document = document.lower()

    # Tokenise, drop stop words, then lemmatize what is left.
    tokens = [word for word in document.split() if word not in ENGLISH_STOPS]
    tokens = [stemmer.lemmatize(word) for word in tokens]

    # Discard one-character leftovers and anything containing digits or
    # underscores (only purely alphabetic tokens are kept).
    tokens = [word for word in tokens if len(word) > 1 and word.isalpha()]

    return ' '.join(tokens)

In [39]:
# Tokenised corpus: one list of preprocessed tokens per non-empty input line.
text = []

for file_name in news_list:
    # `with` closes the file handle even if preprocessing raises.
    with open(data_path + file_name, 'r', encoding='UTF-8') as f:
        for line in f:
            # Preprocess each line exactly once and reuse the result.
            tokens = text_preprocess(line).split()
            if tokens:
                text.append(tokens)

## Embedding

In [41]:
time Skip_gram_model = Word2Vec(text, size=16, window=3, min_count=1,  workers=1, sg=1, iter=10)

Wall time: 10.7 s


In [42]:
time Cbow_model = Word2Vec(text, size=16, window=3, min_count=1,  workers=1, sg=0, iter=10)

Wall time: 4.59 s


In [43]:
# Ten nearest neighbours of "idol" in the CBOW embedding space.
Cbow_model.wv.most_similar(positive=["idol"], topn=10)

[('encountering', 0.8823961019515991),
 ('unattractive', 0.8812627792358398),
 ('star', 0.8755040168762207),
 ('debated', 0.8751051425933838),
 ('esthetic', 0.8624680042266846),
 ('younger', 0.8557512760162354),
 ('derision', 0.8514301776885986),
 ('today', 0.8482885360717773),
 ('blesses', 0.8473474383354187),
 ('older', 0.8456236124038696)]

In [44]:
# Ten nearest neighbours of "idol" in the skip-gram embedding space.
Skip_gram_model.wv.most_similar(positive=["idol"], topn=10)

[('star', 0.9019167423248291),
 ('performer', 0.8826932311058044),
 ('talking', 0.8822098970413208),
 ('celebrity', 0.8752883076667786),
 ('musician', 0.87342768907547),
 ('older', 0.8584058284759521),
 ('extremely', 0.852790355682373),
 ('mostly', 0.8489020466804504),
 ('seeing', 0.8444784283638),
 ('millennial', 0.8388079404830933)]

In [45]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [46]:
# Interactive docstring lookup (IPython `?` syntax): shows that `wv.distance`
# is the cosine distance, i.e. 1 - similarity. Exploration only; this cell can
# be deleted from a finished notebook.
?Skip_gram_model.wv.distance

[1;31mSignature:[0m [0mSkip_gram_model[0m[1;33m.[0m[0mwv[0m[1;33m.[0m[0mdistance[0m[1;33m([0m[0mw1[0m[1;33m,[0m [0mw2[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute cosine distance between two words.
Calculate 1 - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`.

Parameters
----------
w1 : str
    Input word.
w2 : str
    Input word.

Returns
-------
float
    Distance between `w1` and `w2`.
[1;31mFile:[0m      c:\users\82109\anaconda3\lib\site-packages\gensim\models\keyedvectors.py
[1;31mType:[0m      method


In [48]:
def get_cos_euc_dist(w1, w2, model=Skip_gram_model):
    """Print the Euclidean and cosine distance between two word embeddings.

    Parameters
    ----------
    w1, w2 : str
        Words to compare; both are looked up with ``model.wv.get_vector``, so
        they are expected to be in the model's vocabulary.
    model : trained embedding model exposing ``.wv``, optional
        Defaults to ``Skip_gram_model``. The default is bound when this cell
        is executed, so re-run the cell after retraining that model.

    Returns
    -------
    None
        The two distances are printed, followed by a separator line.
    """
    w1_vector, w2_vector = model.wv.get_vector(w1), model.wv.get_vector(w2)
    print(f"{w1} <--> {w2}")
    # reshape(1, -1) makes each vector a single-row matrix whatever the
    # embedding width is, instead of assuming the model was trained with 16.
    print('Euclidean: ', euclidean_distances(w1_vector.reshape(1, -1), w2_vector.reshape(1, -1))[0][0])
    print('Cosine   : ', model.wv.distance(w1, w2))
    print('-'*50)

In [49]:
get_cos_euc_dist("song", "idol")

song <--> idol
Euclidean:  2.573868
Cosine   :  0.43313950300216675
--------------------------------------------------


In [50]:
# Compare the distances between a word and its most similar words.
def get_top_similar_dist(word, model, topn=10):
    """Print distance reports between ``word`` and its nearest neighbours.

    Parameters
    ----------
    word : str
        Query word.
    model : trained embedding model exposing ``.wv``
        Used both to find the neighbours and to compute the distances.
    topn : int, optional
        Number of neighbours to report (default 10, as before).

    Returns
    -------
    None
        One ``get_cos_euc_dist`` report is printed per neighbour.
    """
    # Query through `model.wv`, consistent with the rest of the notebook;
    # calling `most_similar` on the model object itself is deprecated in
    # gensim 3.x and no longer available in gensim 4.
    for neighbour, _similarity in model.wv.most_similar(word, topn=topn):
        get_cos_euc_dist(word, neighbour, model=model)

In [None]:
# The trained model is named `Skip_gram_model` (lower-case "g"); the previous
# spelling `Skip_Gram_model` is undefined. The query must also be a word that
# can exist in the vocabulary: text_preprocess strips characters such as "/",
# so a POS-tagged token like '손길/NNG' can never be found.
get_top_similar_dist("idol", Skip_gram_model)