# 3D Word Embedding

## Parameters

In [1]:
import sys

# Make the packages installed in the project's virtualenv importable from this kernel.
# NOTE(review): machine-specific absolute path — adjust it for your environment.
VENV_SITE_PACKAGES = '/Users/efraflores/Desktop/hub/playground/venv/lib/python3.9/site-packages'
# Guard against duplicates so that re-running this cell does not keep growing sys.path.
if VENV_SITE_PACKAGES not in sys.path:
    sys.path.append(VENV_SITE_PACKAGES)

In [2]:
import os
import pandas as pd

# Size of the GloVe vectors to load (selects the `glove.6B.<dim>d.txt` file).
EMBEDDING_DIM = 300
# Working folder: the GloVe files are read from here and the CSV/PNG outputs are written here.
BASE_DIR = '/Users/efraflores/Desktop/EF/Diplo/Asimov'

# Show what is available in the working folder, alphabetically.
dir_contents = os.listdir(BASE_DIR)
dir_contents.sort()
print(dir_contents)

['.DS_Store', '3D_WordEmbedding.csv', '3D_WordEmbedding.png', 'The_Last_Question.pdf', 'asimov_features.csv', 'corpus.txt', 'doc', 'glove.6B.100d.txt', 'glove.6B.200d.txt', 'glove.6B.300d.txt', 'glove.6B.50d.txt', 'glove.6B.zip', 'project', 'rnn_asimov_architecture.json', 'rnn_asimov_tokenizer.pickle', 'rnn_asimov_weights.h5']


## GloVe

In [3]:
import numpy as np

# Parse the pre-trained GloVe vectors into a {word: vector} dict.
glove_path = os.path.join(BASE_DIR, f'glove.6B.{EMBEDDING_DIM}d.txt')
emb_dict = {}
# Explicit encoding: the file is expected to be UTF-8 text (TODO confirm for other
# embedding files); without it open() falls back to the platform's default encoding.
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        # Each line is a word followed by its coefficients, separated by whitespace
        word, coefs = line.split(maxsplit=1)
        # Parse the space-separated coefficients into a float32 array
        emb_dict[word] = np.fromstring(coefs, 'f', sep=' ')

# One row per word, one column per embedding dimension. Stacking the vectors
# directly avoids building a frame with one column per word and transposing it.
df = pd.DataFrame(np.vstack(list(emb_dict.values())), index=list(emb_dict))
df.sample()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
eyo,-0.14179,-0.1476,-0.51338,0.27316,-0.075388,0.14448,0.92536,-0.11734,-0.034651,0.59225,...,0.31893,0.39141,-0.58556,0.31501,-0.30712,-0.83642,-0.14884,0.16472,0.16469,-0.11288


## Dimensionality reduction

In [4]:
from sklearn.decomposition import PCA

# Project every word vector onto its first 3 principal components.
# random_state pins the result in case scikit-learn picks a randomized SVD solver,
# so a fresh "Run All" produces the same coordinates.
pca = PCA(n_components=3, random_state=22)
X = pd.DataFrame(pca.fit_transform(df), index=df.index)
X.sample(4)

Unnamed: 0,0,1,2
mahur,-0.564124,-0.743216,-0.323968
dakroub,-0.19692,-0.708499,0.480787
rodenberg,-0.012771,1.505456,-0.10446
grinstein,0.42448,1.255726,0.145018


## Clustering

In [5]:
# Requires scikit-learn (`%pip install scikit-learn`; the `sklearn` name on PyPI is not the real package)
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Rescale each component with MinMaxScaler, then group the words into 9 clusters.
# The second step is named after what it actually is (it was mislabelled 'PCA').
clustering = Pipeline(steps=[('MinMax', MinMaxScaler()),
                             ('KMeans', KMeans(9, random_state=22))])
# Leave out a previously assigned 'cluster' column so re-running the cell
# still fits on the coordinates only.
feature_cols = [col for col in X.columns if col != 'cluster']
X['cluster'] = clustering.fit_predict(X[feature_cols])
# Share of words that falls in each cluster
X['cluster'].value_counts(normalize=True)

2    0.145363
6    0.143885
1    0.139253
7    0.134405
8    0.125685
5    0.116795
0    0.074203
4    0.062043
3    0.058370
Name: cluster, dtype: float64

In [6]:
# Persist the reduced coordinates and cluster labels; the frame's index is written as the first column.
embedding_csv = os.path.join(BASE_DIR, '3D_WordEmbedding.csv')
X.to_csv(embedding_csv)

## 3D plot

In [3]:
# Reload the saved coordinates, using the first CSV column as the index.
# keep_default_na=False: with the default NA parsing, any index value equal to one
# of pandas' NA markers (e.g. 'nan', 'null', 'NA') would be read as a missing value
# and that word's label would be lost.
X = pd.read_csv(os.path.join(BASE_DIR, '3D_WordEmbedding.csv'),
                index_col=0,
                keep_default_na=False)
X.sample()

Unnamed: 0,0,1,2,cluster
hanshan,-1.137823,-0.014532,-0.40373,2


In [4]:
'''!pip install ipympl'''
'''!pip install seaborn'''
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap
%matplotlib widget

In [8]:
# Plot only a tiny random subset of the rows (presumably to keep the labels legible).
# random_state makes the chosen words, and therefore the saved figure, reproducible.
X_sample = X.sample(frac=.0002, random_state=22)

cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())

fig = plt.figure(figsize=(7,6))
ax = fig.add_subplot(111, projection='3d')
# The three coordinate columns are addressed by their string labels '0', '1', '2'.
xs, ys, zs = (X_sample[col].values for col in ('0', '1', '2'))
sc = ax.scatter(xs, ys, zs,
                cmap=cmap,
                c=X_sample['cluster'],
                marker='x')
# One legend entry per cluster value, placed to the right of the axes.
ax.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)
# Annotate every point with its index label.
for label, x, y, z in zip(X_sample.index, xs, ys, zs):
    ax.text(x, y, z, str(label), size=5, zorder=3, color='k')

plt.savefig(os.path.join(BASE_DIR,'3D_WordEmbedding.png'), transparent=True)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Nicer look: 2D view with named clusters

In [42]:
# Small random subset; random_state keeps the figure reproducible across runs.
X_sample = X.sample(frac=.00015, random_state=22)

# Display names, matched to cluster ids by position (id 0 -> 'pets', id 1 -> 'numbers', ...).
# NOTE(review): 11 names are listed; names beyond the highest cluster id are never used —
# confirm the names still describe the clusters after any re-fit.
cluster_names = ['pets','numbers','months','jobs',
                 'tools','sports','minerals',
                 'countries','verbs','travel',
                 'home']
# Key the mapping by cluster id. It used to be keyed by range(<number of distinct
# clusters in the sample>), so a sample that happened to miss one cluster left the
# highest ids unmapped and those points were labelled NaN.
X_sample.index = X_sample['cluster'].map(dict(enumerate(cluster_names)))

cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())

fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)
# 2D view: only the first two coordinate columns are drawn.
xs, ys = X_sample['0'].values, X_sample['1'].values
sc = ax.scatter(xs, ys,
                cmap=cmap,
                c=X_sample['cluster'],
                marker='o')
# Label every point with its cluster name (the index was replaced above).
for label, x, y in zip(X_sample.index, xs, ys):
    ax.text(x, y, str(label), size=7, zorder=3, color='k')
# NOTE(review): this writes to the same file name as the 3D figure and overwrites it.
plt.savefig(os.path.join(BASE_DIR,'3D_WordEmbedding.png'), transparent=True)
plt.show()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …