## Embedding

In [None]:
import os
import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm
import urllib.request
from sentence_transformers import SentenceTransformer

In [None]:
# Sentence-embedding checkpoint used by get_embeddings below.
# Alternative tried earlier: 'bert-base-multilingual-uncased'
checkpoint_name = 'sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens'
model = SentenceTransformer(checkpoint_name)

In [None]:
# Helper that turns a piece of text into its sentence embedding
def get_embeddings(text):
    """Encode ``text`` with the notebook-level ``model``.

    ``text`` is handed to ``model.encode`` unchanged and the value that call
    produces is returned as-is.
    """
    embedding = model.encode(text)
    return embedding

# Build one row per document: the file name (stored in the 'text' column, as
# before) and the sentence embedding of the file's contents ('emb').
# Alternative location used earlier:
#   "/content/drive/MyDrive/쳇, 6pt: 비스킷(biskuit)/dataset/"
data_path = '../../../data/content/'

# Upper bound on how many files are embedded. The previous loop tested
# `i > 100` only after processing an entry, which let 102 entries through;
# the same limit is kept here, just stated explicitly.
MAX_FILES = 102

# Collect plain records first and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies every previous row each time.
records = []
with os.scandir(data_path) as entries:  # context manager closes the directory iterator
    for entry in entries:
        if len(records) >= MAX_FILES:
            break
        # is_file must be *called*; the bare attribute is always truthy, so
        # directories used to fall through to open() and fail there.
        if not entry.is_file():
            continue
        # Explicit encoding so the result does not depend on the platform
        # default; the `with` block closes the handle.
        with open(entry.path, encoding='utf-8') as fh:
            text = fh.read()
        records.append({'text': entry.name, 'emb': get_embeddings(text).tolist()})

# `columns=` keeps the expected schema even when no file was found.
data = pd.DataFrame(records, columns=['text', 'emb'])
data

Unnamed: 0,text,emb
0,"2020-08-05_크몽_구글 애널리틱스 데모, 웹ᄉ...","[-0.4812588095664978, 0.4819827675819397, -0.0..."
1,2020-08-02_루닛_Cognitive biases and augmente...,"[-0.22332094609737396, 0.50008225440979, -0.27..."
2,2020-07-31_포스타입_두근두근! 포스타입 신...,"[-0.331694632768631, -0.014439956285059452, 1...."
3,2020-07-30_포스타입_포스타입의 iOS 앱은 어...,"[-0.28599151968955994, 0.1372424215078354, 0.7..."
4,2020-07-29_라인_개선된 페이지 스택으로 LIN...,"[-0.5113919377326965, 0.0555141419172287, 0.44..."
...,...,...
97,2020-06-04_루닛_Learning with Average Top-k L...,"[-0.429318904876709, 0.08358487486839294, 0.93..."
98,2020-06-04_루닛_Review Enhanced Deep Residual...,"[-0.24212583899497986, 0.4102625548839569, 1.3..."
99,2020-06-04_루닛_Review Deep Laplacian Pyramid...,"[-0.21833647787570953, 0.603190004825592, 0.70..."
100,2020-06-04_루닛_Review Deep Multi-Scale Video...,"[0.2423110157251358, 0.432127982378006, 0.8025..."


## K-Means Clustering

In [None]:
from nltk.cluster import KMeansClusterer
import nltk
from scipy.spatial import distance_matrix

from yellowbrick.cluster import KElbowVisualizer

In [None]:
def clustering_question(data, NUM_CLUSTERS = 15, seed=None):
    """Cluster the rows of ``data`` by their embeddings with NLTK k-means.

    The vectors in ``data['emb']`` are clustered using cosine distance with
    25 repeats and ``avoid_empty_clusters=True``. Two columns are then
    written onto ``data`` itself (the frame is modified in place, not copied):

    * ``cluster``  -- the cluster index assigned to each row
    * ``centroid`` -- the mean vector of that row's cluster

    Parameters
    ----------
    data : pandas.DataFrame
        Needs an ``emb`` column; its values are stacked with
        ``np.array(data['emb'].tolist())`` (assumed to be equal-length
        numeric sequences).
    NUM_CLUSTERS : int, default 15
        Number of clusters passed to ``KMeansClusterer``.
    seed : int or None, default None
        When given, a ``random.Random(seed)`` is passed as the clusterer's
        ``rng`` so repeated runs give the same assignment. ``None`` leaves
        the clusterer on its own default generator, as before.

    Returns
    -------
    tuple
        ``(data, assigned_clusters)`` -- the same frame that was passed in,
        and the value returned by ``kclusterer.cluster(..., assign_clusters=True)``.
    """
    import random  # local import: only needed for the optional seeded generator

    X = np.array(data['emb'].tolist())

    kclusterer = KMeansClusterer(
        NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
        repeats=25, avoid_empty_clusters=True,
        rng=random.Random(seed) if seed is not None else None)

    assigned_clusters = kclusterer.cluster(X, assign_clusters=True)

    # Look the centroids up once rather than calling means() for every row.
    means = kclusterer.means()

    data['cluster'] = pd.Series(assigned_clusters, index=data.index)
    data['centroid'] = data['cluster'].apply(lambda x: means[x])

    return data, assigned_clusters

In [None]:
# clustering_question writes the 'cluster' and 'centroid' columns onto `data`
# and returns that same frame, so `df` and `data` refer to one object here.
df, assigned_clusters = clustering_question(data)

In [None]:
# KMeans was used here without ever being imported, which fails on a fresh kernel.
from sklearn.cluster import KMeans

# Use a dedicated name: rebinding `model` would replace the SentenceTransformer
# that get_embeddings() reads, breaking any later re-run of the embedding cell.
kmeans_model = KMeans(random_state=0)  # fixed seed so the elbow curve is repeatable
visualizer = KElbowVisualizer(kmeans_model, k=(2,20))
visualizer.fit(np.array(data['emb'].tolist()))
visualizer.show();  # finalize and render the elbow plot

In [None]:
def distance_from_centroid(row):
    """Return the distance between a row's embedding and its cluster centroid.

    ``row['emb']`` is used as-is and ``row['centroid']`` is converted with
    ``.tolist()`` so both inputs have the same type. Each is wrapped in a
    one-element list, giving a 1x1 ``distance_matrix`` result whose single
    entry is returned.
    """
    point = [row['emb']]
    center = [row['centroid'].tolist()]
    pairwise = distance_matrix(point, center)
    return pairwise[0][0]

# Add each row's distance to its cluster centroid as a new column on `data`
# (row-wise apply; relies on the 'emb' and 'centroid' columns being present).
data['distance_from_centroid'] = data.apply(distance_from_centroid, axis=1)

In [None]:
data

Unnamed: 0,text,emb,cluster,centroid,distance_from_centroid
0,2020-08-02_루닛_Cognitive biases and augmente...,"[-0.22332094609737396, 0.50008225440979, -0.27...",5,"[-0.16448878953663917, 0.30018228544105724, 0....",9.636630
1,2020-07-31_포스타입_두근두근! 포스타입 신...,"[-0.331694632768631, -0.014439956285059452, 1....",11,"[-0.1581072530450488, 0.18226117940020248, 0.7...",8.210659
2,2020-07-30_포스타입_포스타입의 iOS 앱은 어...,"[-0.28599151968955994, 0.1372424215078354, 0.7...",6,"[-0.2653249025948753, 0.19974408953456582, 0.6...",8.694017
3,2020-07-29_라인_개선된 페이지 스택으로 LIN...,"[-0.5113919377326965, 0.0555141419172287, 0.44...",1,"[-0.20896379238309645, 0.059328042962710215, 0...",6.675155
4,2020-07-29_스마일게이트AI_인간의 뇌 vs. AI ...,"[-0.26579350233078003, -0.31898099184036255, 0...",7,"[-0.20975078827445817, -0.03281381082193739, 0...",7.279147
...,...,...,...,...,...
97,2020-06-04_루닛_Review Enhanced Deep Residual...,"[-0.24212583899497986, 0.4102625548839569, 1.3...",6,"[-0.2653249025948753, 0.19974408953456582, 0.6...",7.244352
98,2020-06-04_루닛_Review Deep Laplacian Pyramid...,"[-0.21833647787570953, 0.603190004825592, 0.70...",11,"[-0.1581072530450488, 0.18226117940020248, 0.7...",6.586092
99,2020-06-04_루닛_Review Deep Multi-Scale Video...,"[0.2423110157251358, 0.432127982378006, 0.8025...",11,"[-0.1581072530450488, 0.18226117940020248, 0.7...",6.696493
100,2020-06-04_루닛_Geometry-aware CNN Features.txt,"[-0.5235992670059204, -0.21078307926654816, 0....",14,"[-0.3427318669616827, 0.1249989776387366, 0.40...",7.717373


## Evaluation

In [None]:
import matplotlib.pyplot as plt

# The previous version of this cell referenced an undefined `k` and two columns
# ('Annual Income (k$)', 'Spending Score (1-100)') that `df` does not have.
# The embeddings are high-dimensional, so project them onto their first two
# principal components (SVD of the mean-centred matrix) to get a 2-D scatter.
emb_matrix = np.array(df['emb'].tolist())
emb_centered = emb_matrix - emb_matrix.mean(axis=0)
_u, _s, components = np.linalg.svd(emb_centered, full_matrices=False)
coords = emb_centered @ components[:2].T

# Derive the cluster ids from the data instead of relying on a global `k`.
cluster_ids = sorted(df['cluster'].unique())
k = len(cluster_ids)

plt.figure(figsize = (8, 8))

for cluster_id in cluster_ids:
    # Boolean mask in row order, matching the rows of `coords`.
    mask = (df['cluster'] == cluster_id).to_numpy()
    plt.scatter(coords[mask, 0], coords[mask, 1],
                label = 'cluster ' + str(cluster_id))

plt.legend()
plt.title('K = %d results' % k, size = 15)  # the %d placeholder was never filled in before
plt.xlabel('Principal component 1', size = 12)
plt.ylabel('Principal component 2', size = 12)
plt.show()