In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tqdm
import os
import glob

In [None]:
import spacy

In [None]:
# Load the spaCy pipeline; later cells read its word-vector table through
# nlp.vocab.vectors and use it to annotate individual vocabulary entries.
nlp = spacy.load('en_core_web_lg')

In [None]:
def cosine_similarity(matrix, vector):
    """Cosine similarity between every row of `matrix` and `vector`.

    Parameters
    ----------
    matrix : array-like, shape (n, d)
        One candidate vector per row.
    vector : array-like, shape (d,)
        The query vector.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Similarity of each row to `vector`. Entries where the row or the
        query has zero norm are 0.0 instead of NaN, so that a later
        ``argsort`` does not push undefined values to the "most similar" end.
    """
    dots = np.dot(matrix, vector)
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    cos_sim = np.zeros(dots.shape, dtype=np.result_type(dots, norms))
    # Divide only where the denominator is non-zero; the rest keep the 0.0 fill.
    np.divide(dots, norms, out=cos_sim, where=norms > 0)
    return cos_sim

## Load vocab (all)

In [None]:
# Gather one vector per key, in the order that iterating `nlp.vocab.vectors`
# yields its keys. The raw `.data` table is not guaranteed to have exactly one
# row per key (several keys may share a row, and rows may be unassigned), so
# taking `.data` as-is can misalign vectors with the words derived from the keys.
vocab_vectors = nlp.vocab.vectors.data[
    [nlp.vocab.vectors.key2row[key] for key in nlp.vocab.vectors]
]

In [None]:
# Iterating the vectors table yields hash keys; resolve each one back to its
# string form through the vocabulary's string store.
vocab_words = np.array(list(map(nlp.vocab.strings.__getitem__, nlp.vocab.vectors)))

In [None]:
# Per-word token attributes, cached on disk because running the pipeline over
# the whole vocabulary is slow. The table is rebuilt only when the cache file
# is missing (previously this was a hard-coded `if False:` toggle and the
# result was never written out).
vocab_csv = 'spacy_vocab.csv'
if not os.path.exists(vocab_csv):
    cols = [
        'text', 'lemma', 'pos', 'tag', 'dep',
        'shape', 'alpha', 'stop', 'morph',
    ]
    rows = []
    for word in tqdm.tqdm(vocab_words):
        doc = nlp(str(word))
        if len(doc) == 1:
            token = doc[0]
            rows.append([
                token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop, token.morph,
            ])
        else:
            # The entry did not come back as exactly one token: keep a
            # placeholder row so the table stays row-aligned with vocab_words.
            rows.append([word] + [np.nan] * (len(cols) - 1))
    pd.DataFrame(rows, columns=cols).to_csv(vocab_csv, index=False)

# Always load through the CSV so a fresh build and a cached run produce the
# same column dtypes downstream.
df_vocab = pd.read_csv(vocab_csv)

In [None]:
# Sanity check: words, vectors and the attribute table should have matching
# leading dimensions.
tuple(table.shape for table in (vocab_words, vocab_vectors, df_vocab))

## Load vocab (English)

In [None]:
from string import ascii_lowercase

In [None]:
# The ASCII lowercase letters as a list of single-character strings.
lowercase = [letter for letter in ascii_lowercase]

In [None]:
# True for entries whose text consists only of characters from `lowercase`
# (an empty string passes vacuously).
mask_lower = np.array([
    all(ch in lowercase for ch in text)
    for text in tqdm.tqdm(df_vocab['text'].astype(str))
])

In [None]:
from nltk.corpus import words

In [None]:
# Materialise the NLTK `words` corpus as an array for the membership test below.
eng_arr = np.array(words.words())

In [None]:
# True for entries whose text appears in the NLTK word list.
text_values = df_vocab['text'].values.astype(str)
mask_eng = np.isin(text_values, eng_arr)

In [None]:
# True for entries that are already in lemma form (text equals lemma).
mask_lemma = df_vocab['text'].eq(df_vocab['lemma'])

In [None]:
# Keep an entry only if it passes all three filters: lowercase-only text,
# present in the NLTK word list, and text equal to its lemma.
mask = mask_lower & mask_eng & mask_lemma

In [None]:
# Restrict the attribute table to the selected entries.
# NOTE: this overwrites df_vocab, so the cell is not idempotent - re-run from
# the loading cells before executing it a second time.
df_vocab = df_vocab.loc[mask]

In [None]:
# Apply the same row selection to the word array (overwrites vocab_words).
vocab_words = vocab_words[np.asarray(mask)]

In [None]:
# Apply the same row selection to the vector matrix (overwrites vocab_vectors).
vocab_vectors = vocab_vectors[np.asarray(mask)]

In [None]:
# After filtering, the three containers should still agree on the row count.
tuple(table.shape for table in (vocab_words, vocab_vectors, df_vocab))

## EDA

In [None]:
# Character length of every remaining word.
vocab_words_len = np.array(list(map(len, vocab_words)))

In [None]:
# Histogram of word lengths with log-scaled counts.
# Alternative kept for reference: plt.hist(np.log10(vocab_words_len), bins=100)
ax = plt.gca()
ax.hist(vocab_words_len, bins=100)
ax.set_yscale('log')

In [None]:
# Mean and standard deviation of the word lengths.
np.mean(vocab_words_len), np.std(vocab_words_len)

In [None]:
# Every character of every word, in order, as one flat list.
vocab_chars = [ch for word in vocab_words for ch in word]

In [None]:
# Distinct characters across the vocabulary and how often each one occurs.
char, cnt_char = np.unique(vocab_chars, return_counts=True)

In [None]:
# Distribution of per-character counts (log10 of the count on x, log-scaled y).
ax = plt.gca()
ax.hist(np.log10(cnt_char), bins=50)
ax.set_yscale('log')

In [None]:
# One bar per character seen more than 100 times; all remaining characters are
# pooled into a single "Other" bar whose label shows how many were pooled.
mask = cnt_char > 100
rare = ~mask
plt.figure(figsize=[15, 5])
plt.bar(char[mask], cnt_char[mask])
plt.bar(f'\nOther:\n{rare.sum()}', cnt_char[rare].sum())
plt.yscale('log')
plt.tight_layout()

In [None]:
# Per-word mean and standard deviation over the embedding dimensions.
vocab_vectors_mean = vocab_vectors.mean(axis=1)
vocab_vectors_std = vocab_vectors.std(axis=1)

In [None]:
# 2x2 overview: all vector components, per-word mean vs. std, and the
# distributions of the per-word mean and std.
fig, axs = plt.subplots(2, 2, figsize=[10, 10])
(ax_values, ax_mean_std), (ax_mean, ax_std) = axs
ax_values.hist(vocab_vectors.ravel(), bins=200)
ax_mean_std.scatter(vocab_vectors_mean, vocab_vectors_std, alpha=0.1, s=1)
ax_mean.hist(vocab_vectors_mean, bins=100)
ax_std.hist(vocab_vectors_std, bins=100)
# The scatter panel deliberately stays on a linear scale.
for hist_ax in (ax_values, ax_mean, ax_std):
    hist_ax.set_yscale('log')

In [None]:
# Pearson correlation for every pair of embedding dimensions (i < j), stored
# as rows of [i, j, r] in the same order as a nested `for i / for j > i` loop.
# One corrcoef call over the columns replaces one call per pair, and the
# number of dimensions is read from the data instead of being fixed at 300.
n_dims = vocab_vectors.shape[1]
corr_matrix = np.corrcoef(vocab_vectors, rowvar=False)
dim_i, dim_j = np.triu_indices(n_dims, k=1)  # strict upper triangle, row-major
corr = np.column_stack([dim_i, dim_j, corr_matrix[dim_i, dim_j]])

In [None]:
# Turn the collected [i, j, r] rows into a 2-D array so columns can be sliced.
corr = np.array(corr)

In [None]:
# Pick out the dimension pairs with the lowest and the highest correlation.
pair_corr = corr[:, 2]
row_lowest = np.argmin(pair_corr)
row_highest = np.argmax(pair_corr)
corr_min_0, corr_min_1, corr_min = corr[row_lowest]
corr_max_0, corr_max_1, corr_max = corr[row_highest]

In [None]:
# Left: distribution of all pairwise correlations. Right: the two extreme
# dimension pairs plotted against each other, labelled with their correlation.
fig, axs = plt.subplots(1, 2, figsize=[10, 5])
ax_hist, ax_pairs = axs
ax_hist.hist(corr[:, 2], bins=100)
ax_hist.set_yscale('log')
extremes = (
    (corr_min_0, corr_min_1, corr_min),
    (corr_max_0, corr_max_1, corr_max),
)
for dim_a, dim_b, value in extremes:
    # Indices are stored as floats in `corr`, hence the int() casts.
    ax_pairs.scatter(
        vocab_vectors[:, int(dim_a)], vocab_vectors[:, int(dim_b)],
        s=1, alpha=0.1, label=f'{value:.3f}',
    )
ax_pairs.legend()

In [None]:
# Number of rows in excess of the number of distinct lemmas.
len(df_vocab) - len(set(df_vocab['lemma']))

In [None]:
# Distinct values of the 'pos' column (as strings) and how many rows carry each.
# `pos` is reused by the per-POS scatter loops further down.
pos, cnt_pos = np.unique(df_vocab['pos'].astype(str), return_counts=True)

In [None]:
# Bar chart of row counts per 'pos' value, log-scaled.
fig, ax = plt.subplots(figsize=[15, 5])
ax.bar(pos, cnt_pos)
ax.set_yscale('log')
fig.tight_layout()

In [None]:
# Distinct values of the 'tag' column (as strings) and how many rows carry each.
tag, cnt_tag = np.unique(df_vocab['tag'].astype(str), return_counts=True)

In [None]:
# Bar chart of row counts per 'tag' value, log-scaled (wider: more categories).
fig, ax = plt.subplots(figsize=[20, 5])
ax.bar(tag, cnt_tag)
ax.set_yscale('log')
fig.tight_layout()

In [None]:
# Distinct values of the 'shape' column (as strings) and how many rows carry each.
shape, cnt_shape = np.unique(df_vocab['shape'].astype(str), return_counts=True)

In [None]:
# One bar per word shape seen more than 10 times; the remaining shapes are
# pooled into a single "Other" bar whose label shows how many were pooled.
mask = cnt_shape > 10
rare = ~mask
plt.figure(figsize=[15, 5])
plt.bar(shape[mask], cnt_shape[mask])
plt.bar(f'\nOther:\n{rare.sum()}', cnt_shape[rare].sum())
plt.yscale('log')
plt.tight_layout()

In [None]:
# Fraction of rows whose 'alpha' flag is True.
df_vocab['alpha'].eq(True).sum() / len(df_vocab)

In [None]:
# Fraction of rows whose 'stop' flag is True.
df_vocab['stop'].eq(True).sum() / len(df_vocab)

In [None]:
# Distinct values of the 'morph' column (as strings) and how many rows carry each.
morph, cnt_morph = np.unique(df_vocab['morph'].astype(str), return_counts=True)

In [None]:
# One bar per morphology string seen more than 100 times; the remaining ones
# are pooled into a single "Other" bar whose label shows how many were pooled.
mask = cnt_morph > 100
frequent_morph = morph[mask]
morph_mask = []
for i, value in enumerate(frequent_morph):
    # An empty feature string would give an invisible tick label, so it is
    # shown as '()'. This is decided by value: the previous code relabelled
    # whichever entry happened to sort first, even when it was not empty.
    # NOTE(review): if missing morphs arrive as the string 'nan' (e.g. after a
    # CSV round-trip) they keep that label - confirm whether that is desired.
    label = value if value else '()'
    # Stagger labels over three text rows so neighbouring ticks do not overlap.
    morph_mask.append('\n' * (i % 3) + label)
plt.figure(figsize=[15, 5])
plt.bar(morph_mask, cnt_morph[mask])
plt.bar(f'\nOther:\n{(~mask).sum()}', cnt_morph[~mask].sum())
plt.yscale('log')
plt.tight_layout()

## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Two whitened principal components. The seed makes the projection repeatable
# when scikit-learn selects a randomized/arpack SVD solver for this data size;
# it has no effect with the exact solvers.
pca = PCA(n_components=2, whiten=True, random_state=42)

In [None]:
# Fit the 2-component PCA on the filtered vectors and project every word.
pca_words = pca.fit_transform(vocab_vectors)

In [None]:
# Expect (number of filtered words, 2).
pca_words.shape

In [None]:
# Share of the total variance captured by the two components together.
np.sum(pca.explained_variance_ratio_)

In [None]:
# Integer plot limits with one unit of padding around the projected points,
# shared by all PCA scatter plots below.
xmin, ymin = (int(low) - 1 for low in pca_words[:, :2].min(axis=0))
xmax, ymax = (int(high) + 1 for high in pca_words[:, :2].max(axis=0))

In [None]:
# Display the padded axis limits computed for the PCA plots.
xmin, xmax, ymin, ymax

In [None]:
# All words in the PCA plane; small translucent markers to show density.
fig, ax = plt.subplots(figsize=[10, 10])
ax.grid()
ax.scatter(pca_words[:, 0], pca_words[:, 1], s=1, alpha=0.1)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

In [None]:
# Reminder of the attribute columns available for colouring/splitting the plots.
df_vocab.columns

In [None]:
# One PCA scatter per 'pos' value, all on the same axis limits so the panels
# are directly comparable; the title shows the label and its row count.
pos_labels = df_vocab['pos'].astype(str)
for i in pos:
    mask = pos_labels == i
    selected = mask.to_numpy()
    plt.title(f'{i}: {mask.sum()}')
    plt.scatter(pca_words[selected, 0], pca_words[selected, 1], s=1, alpha=0.5)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    plt.show()

In [None]:
# Embedding dimension with the largest (signed) loading on the second component.
ind = pca.components_[1].argmax()

In [None]:
# The 100 words with the highest value along that embedding dimension
# (ascending order, so the strongest are last).
top_rows = np.argsort(vocab_vectors[:, ind])[-100:]
vocab_words[top_rows]

## UMAP

In [None]:
from umap import UMAP

In [None]:
# 2-D UMAP embedding. The layout is stochastic, so the seed is fixed to make
# the embedding - and every plot built on it - repeat across Restart & Run All.
# NOTE(review): umap-learn documents that fixing random_state limits
# parallelism; confirm the run time is still acceptable.
umap = UMAP(n_components=2, random_state=42)

In [None]:
#umap.fit(vocab_vectors[::5])

In [None]:
#skip = len(vocab_vectors)//7
#umap_words = np.concatenate([umap.transform(vocab_vectors[i:i+skip]) for i in tqdm.tqdm(range(0, len(vocab_vectors), skip))])

In [None]:
# Fit UMAP on all filtered vectors and keep the resulting 2-D coordinates.
# (The commented-out cells above sketch a fit-on-subsample / transform-in-chunks variant.)
umap_words = umap.fit_transform(vocab_vectors)

In [None]:
# Expect (number of filtered words, 2).
umap_words.shape

In [None]:
# Integer plot limits with one unit of padding around the embedded points.
# These overwrite the PCA limits and are shared by all UMAP plots below.
xmin, ymin = (int(low) - 1 for low in umap_words[:, :2].min(axis=0))
xmax, ymax = (int(high) + 1 for high in umap_words[:, :2].max(axis=0))

In [None]:
# Display the padded axis limits computed for the UMAP plots.
xmin, xmax, ymin, ymax

In [None]:
# All words in the UMAP plane; small translucent markers to show density.
fig, ax = plt.subplots(figsize=[10, 10])
ax.grid()
ax.scatter(umap_words[:, 0], umap_words[:, 1], s=1, alpha=0.1)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

In [None]:
# Reminder of the attribute columns available for colouring/splitting the plots.
df_vocab.columns

In [None]:
# One UMAP scatter per 'pos' value, all on the same axis limits so the panels
# are directly comparable; the title shows the label and its row count.
pos_labels = df_vocab['pos'].astype(str)
for i in pos:
    mask = pos_labels == i
    selected = mask.to_numpy()
    plt.title(f'{i}: {mask.sum()}')
    plt.scatter(umap_words[selected, 0], umap_words[selected, 1], s=1, alpha=0.5)
    plt.xlim(xmin, xmax)
    plt.ylim(ymin, ymax)
    plt.show()

## Similarity

In [None]:
# Read the 'vls' array from the archive. np.load on an .npz returns a lazy
# archive object holding an open file handle, so it is used as a context
# manager to make sure the handle is closed once the array has been read.
with np.load('vocab_lemma_small.npz') as npz_file:
    vocab_small = npz_file['vls']

In [None]:
# Peek at the loaded array (NumPy abbreviates long arrays in the repr).
vocab_small

In [None]:
# Vector of the query word 'one': first position at which it occurs in
# vocab_words (raises IndexError if the word was filtered out).
target_row = np.flatnonzero(vocab_words == 'one')[0]
target_vector = vocab_vectors[target_row]

In [None]:
# Similarity of every word to the query, and row indices ordered from least
# to most similar.
cos_sim = cosine_similarity(vocab_vectors, target_vector)
inds_sort = cos_sim.argsort()

In [None]:
# The 100 words ranked highest by cosine similarity to the query
# (ascending order, so the closest matches are at the end).
vocab_words[inds_sort[-100:]]

In [None]:
# Highlight the query's nearest neighbours on the UMAP map: all words as a
# faint background, ranks 2-100 in one colour, and the top-ranked row
# (presumably the query word itself) in another.
fig, ax = plt.subplots(figsize=[10, 10])
ax.grid()
neighbour_rows = inds_sort[-100:-1]
top_row = inds_sort[-1]
ax.scatter(umap_words[:, 0], umap_words[:, 1], s=1, alpha=0.1)
ax.scatter(umap_words[neighbour_rows, 0], umap_words[neighbour_rows, 1], s=10, alpha=1)
ax.scatter(umap_words[top_row, 0], umap_words[top_row, 1], s=10, alpha=1)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)