In [None]:
import os
import pandas as pd
import numpy as np
import math
import re
import nltk
from sklearn.datasets import load_files
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from gensim.models import Word2Vec, FastText

## Data load

In [2]:
data_path = "../data/news_script_individual-200/"
news_list = sorted(os.listdir(data_path))
len(news_list)

2200

## Text preprocess

In [3]:
def build_dictionary(eng_sentence):
    dictionary = {}
    for sent in eng_sentence:
        pos_tags = nltk.pos_tag(nltk.word_tokenize(sent))
        for tag in pos_tags:
            value = tag[0]
            pos = tag[1]
            dictionary[value] = pos
    return dictionary

In [4]:
stemmer = WordNetLemmatizer()

def text_preprocess(text):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', text)
    
    document = document.replace('.', '')
    
    document = re.sub('[^a-zA]',' ',document).strip()
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()
    english_stops = set(stopwords.words('english'))
    document = [word for word in document if word not in english_stops]
    document = [stemmer.lemmatize(word) for word in document]
    document = [word for word in document if len(word) > 1]
    document = [word for word in document if word.isalpha()]
    pos_dict = build_dictionary(document)
    document = [n for n, tag in pos_dict.items() if tag in ["NN","NNP"] ]
    document = ' '.join(document)
    
    return document    

In [5]:
text = []

for i in range(len(news_list)):
    f = open(data_path + news_list[i], 'r', encoding='UTF-8')
    lines = f.readlines()
    for line in lines:
        tmp = text_preprocess(line).split()
        if len(tmp) > 0:
            text.append(text_preprocess(line).split())

## Embedding

In [6]:
time Skip_gram_model = Word2Vec(text, size=16, window=3, min_count=1,  workers=1, sg=1, iter=10)

Wall time: 30.1 s


In [7]:
time Cbow_model = Word2Vec(text, size=16, window=3, min_count=1,  workers=1, sg=0, iter=10)

Wall time: 13.6 s


In [None]:
Skip_gram_model.wv.vocab.keys()

In [None]:
Cbow_model.wv.most_similar("blackpink", topn=10)

In [None]:
Skip_gram_model.wv.most_similar("singe", topn=10)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [14]:
?Skip_gram_model.wv.distance

[1;31mSignature:[0m [0mSkip_gram_model[0m[1;33m.[0m[0mwv[0m[1;33m.[0m[0mdistance[0m[1;33m([0m[0mw1[0m[1;33m,[0m [0mw2[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute cosine distance between two words.
Calculate 1 - :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity`.

Parameters
----------
w1 : str
    Input word.
w2 : str
    Input word.

Returns
-------
float
    Distance between `w1` and `w2`.
[1;31mFile:[0m      c:\users\82109\anaconda3\lib\site-packages\gensim\models\keyedvectors.py
[1;31mType:[0m      method


In [15]:
def get_cos_euc_dist(w1, w2, model=Skip_gram_model):
    w1_vector, w2_vector = model.wv.get_vector(w1), model.wv.get_vector(w2)
    print(f"{w1} <--> {w2}")
    print('Euclidean: ', euclidean_distances(w1_vector.reshape(1,16), w2_vector.reshape(1,16))[0][0])
    print('Cosine   : ', model.wv.distance(w1,w2))
    print('-'*50)

In [49]:
get_cos_euc_dist("kpop", "idol")
get_cos_euc_dist("dynamite", "idol")
get_cos_euc_dist("dynamite", "song")
get_cos_euc_dist("dynamite", "kpop")
get_cos_euc_dist("blackpink", "idol")
get_cos_euc_dist("blackpink", "song")
get_cos_euc_dist("blackpink", "kpop")
get_cos_euc_dist("exo", "idol")
get_cos_euc_dist("exo", "song")
get_cos_euc_dist("exo", "kpop")

kpop <--> idol
Euclidean:  2.7254503
Cosine   :  0.5781855583190918
--------------------------------------------------
dynamite <--> idol
Euclidean:  2.2868202
Cosine   :  0.3212130069732666
--------------------------------------------------
dynamite <--> song
Euclidean:  2.327134
Cosine   :  0.19012415409088135
--------------------------------------------------
dynamite <--> kpop
Euclidean:  1.2445022
Cosine   :  0.22412264347076416
--------------------------------------------------
blackpink <--> idol
Euclidean:  2.3498466
Cosine   :  0.3392786383628845
--------------------------------------------------
blackpink <--> song
Euclidean:  2.3747857
Cosine   :  0.17819619178771973
--------------------------------------------------
blackpink <--> kpop
Euclidean:  1.2773558
Cosine   :  0.22552794218063354
--------------------------------------------------
exo <--> idol
Euclidean:  2.4998212
Cosine   :  0.4068801999092102
--------------------------------------------------
exo <--> song
Eucli

In [25]:
def get_cos_euc_dist2(w1, w2, model=Cbow_model):
    w1_vector, w2_vector = model.wv.get_vector(w1), model.wv.get_vector(w2)
    print(f"{w1} <--> {w2}")
    print('Euclidean: ', euclidean_distances(w1_vector.reshape(1,16), w2_vector.reshape(1,16))[0][0])
    print('Cosine   : ', model.wv.distance(w1,w2))
    print('-'*50)

In [50]:
get_cos_euc_dist2("kpop", "idol")
get_cos_euc_dist2("dynamite", "idol")
get_cos_euc_dist2("dynamite", "song")
get_cos_euc_dist2("dynamite", "kpop")
get_cos_euc_dist2("blackpink", "idol")
get_cos_euc_dist2("blackpink", "song")
get_cos_euc_dist2("blackpink", "kpop")
get_cos_euc_dist2("exo", "idol")
get_cos_euc_dist2("exo", "song")
get_cos_euc_dist2("exo", "kpop")

kpop <--> idol
Euclidean:  5.747503
Cosine   :  0.5080408453941345
--------------------------------------------------
dynamite <--> idol
Euclidean:  6.4452796
Cosine   :  0.5170215368270874
--------------------------------------------------
dynamite <--> song
Euclidean:  7.8626
Cosine   :  0.34231799840927124
--------------------------------------------------
dynamite <--> kpop
Euclidean:  3.1474574
Cosine   :  0.2638216018676758
--------------------------------------------------
blackpink <--> idol
Euclidean:  6.48288
Cosine   :  0.5280620753765106
--------------------------------------------------
blackpink <--> song
Euclidean:  7.904689
Cosine   :  0.31452876329421997
--------------------------------------------------
blackpink <--> kpop
Euclidean:  3.1659849
Cosine   :  0.1345774531364441
--------------------------------------------------
exo <--> idol
Euclidean:  6.3987827
Cosine   :  0.4781457185745239
--------------------------------------------------
exo <--> song
Euclidean:  7

In [17]:
# 해당 단어의 비슷한 단어 들의 거리 비교
def get_top_similar_dist(word, model):
    for word2, dist in model.wv.most_similar(word,topn=10):
        get_cos_euc_dist(word, word2, model=model)

In [None]:
def plot_2d_graph(vocabs, xs, ys):
  plt.figure(figsize=(10, 8))
  plt.scatter(xs, ys, marker='o')
  for i,v in enumerate(vocabs):
    plt.annotate(v, xy=(xs[i], ys[i]))

In [None]:
Skip_vectors = Skip_gram_model.wv
Cbow_vectors = Cbow_model.wv

Skip_vocabs = Skip_vectors.vocab.keys()
Cbow_vocabs = Cbow_vectors.vocab.keys()
Skip_vectors_list = [Skip_vectors[v] for v in Skip_vocabs]
Cbow_vectors_list = [Cbow_vectors[v] for v in Cbow_vocabs]

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
xys = pca.fit_transform(Skip_vectors_list[:15])
xs = xys[:,0]
ys = xys[:,1]

plot_2d_graph(Skip_vocabs, xs, ys)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
xys = pca.fit_transform(Cbow_vectors_list[:15])
xs = xys[:,0]
ys = xys[:,1]

plot_2d_graph(Cbow_vocabs, xs, ys)