# Step 11: Unsupervised k-means clustering

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

In [3]:
# Load the saved Word2Vec model from "word2vec.model" and keep only its `.wv`
# attribute (the trained word vectors); the model object itself is not retained.
word_vectors = (
    Word2Vec
    .load("word2vec.model")
    .wv
)

In [4]:
# Partition the word vectors into two clusters with k-means.
# - The vectors are cast to double precision before fitting.
# - `random_state` is an explicit integer seed. The value used before, `True`,
#   is a bool that scikit-learn accepts as the integer 1, so the seed itself is
#   unchanged; writing 1 makes it clear this is a seed, not an on/off flag.
# - `n_init=50` runs k-means from 50 different initialisations and keeps the
#   best result; `max_iter=1000` caps the iterations of each run.
model = KMeans(
    n_clusters=2,
    max_iter=1000,
    random_state=1,
    n_init=50,
).fit(X=word_vectors.vectors.astype('double'))

In [5]:
# Show the 10 vocabulary entries most similar to the first cluster centre,
# with no `restrict_vocab` limit applied. Used to eyeball what cluster 0 holds.
word_vectors.similar_by_vector(
    model.cluster_centers_[0],
    topn=10,
    restrict_vocab=None,
)

[('suffici', 0.9916220307350159),
 ('next', 0.9907612800598145),
 ('remiss', 0.9899042248725891),
 ('glucocorticoid', 0.9898771643638611),
 ('make', 0.9898754358291626),
 ('intern', 0.9892132878303528),
 ('we_found', 0.988933801651001),
 ('teprotumumab', 0.9882659912109375),
 ('esophag', 0.9881330728530884),
 ('thought', 0.9880006313323975)]

In [6]:
# Name the two fitted cluster centres.
# NOTE(review): treating cluster 0 as "positive" and cluster 1 as "negative" is
# a manual labelling choice (presumably based on inspecting the nearest words
# of cluster 0) -- confirm it still holds whenever the model is refitted.
positive_cluster_center, negative_cluster_center = (
    model.cluster_centers_[0],
    model.cluster_centers_[1],
)

In [7]:
# Build a lookup table with one row per vocabulary word.
words = pd.DataFrame({'words': list(word_vectors.index_to_key)})

# Attach each word's embedding (one array per row).
words['vectors'] = words['words'].apply(lambda word: word_vectors[word])

# Assign every word to a cluster with a single batched `predict` call instead
# of one call per word. The matrix is cast to double precision so that
# prediction sees the same dtype the model was fitted on; mixing float32 input
# with a float64-fitted model can be rejected by some scikit-learn versions.
words['cluster'] = model.predict(
    np.stack(words['vectors'].tolist()).astype('double')
)

In [8]:
# Sign of the score: words in cluster 0 get +1, words in any other cluster -1.
words['cluster_value'] = np.where(words['cluster'] == 0, 1, -1)

# `model.transform` returns each word's distance to every cluster centre; the
# row-wise minimum is the distance to the nearest centre. Its inverse is used
# as the closeness score, so words nearer a centre score higher. This is done
# in one batched call rather than one `transform` call per row.
# NOTE: a word lying exactly on a centre has distance 0 and yields `inf`.
words['closeness_score'] = 1 / model.transform(
    np.stack(words['vectors'].tolist())
).min(axis=1)

# Signed score: magnitude from closeness, sign from cluster membership.
words['sentiment_coeff'] = words['closeness_score'] * words['cluster_value']

# Persist only the word -> score mapping (no index column).
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [24]:
# Display the scored vocabulary. `cluster_value` is only ever +1 or -1, so the
# former `!= 0` filter kept every row; showing the frame directly is equivalent.
words

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,or,"[0.10209823, -0.07570479, -0.011496712, -0.066...",1,-1,0.772296,-0.772296
1,pressur,"[0.047711316, -0.07002459, -0.024057176, -0.08...",1,-1,0.770109,-0.770109
2,train,"[0.0758576, -0.10417662, -0.020020593, -0.0693...",1,-1,0.769759,-0.769759
3,hypertens,"[0.119882256, -0.086356066, -0.060781825, -0.0...",1,-1,0.779188,-0.779188
4,anti,"[0.08376609, -0.07109665, -0.06981209, 0.01331...",1,-1,0.805614,-0.805614
...,...,...,...,...,...,...
5418,sodium_bicarbon,"[-0.06830417, 0.01134732, 0.020890148, -0.0582...",0,1,2.997169,2.997169
5419,convert,"[-0.06554523, 0.021923034, 0.020463716, -0.071...",0,1,3.261610,3.261610
5420,axi,"[-0.06985295, 0.020333901, 0.022981338, -0.083...",0,1,3.463481,3.463481
5421,special,"[-0.05797433, 0.0030707512, 0.009448027, -0.03...",0,1,3.752625,3.752625
