# Train embeddings

In [1]:
import os
import numpy as np
from gensim.models import Word2Vec

unable to import 'smart_open.gcs', disabling that module


In [2]:
model = Word2Vec(corpus_file="data/imdb/imdb_dataset.csv", sg=0, min_count=5, size=100, window=4, negative=5, hs=0, sample=1e-5, workers=12, iter=20)

## Save embeddings

In [3]:
if not os.path.isdir("emb_dir"):
    os.mkdir("emb_dir") 

model.save("emb_dir/emb.bin")

In [4]:
w2v = Word2Vec.load("emb_dir/emb.bin")

In [5]:
with open("emb_dir/vocab.txt", "w", encoding='utf-8') as f:
    vocab = sorted([(model.wv.vocab[x].index, x, model.wv.vocab[x].count) for x in model.wv.vocab.keys()], key=lambda x: x[0])
    vocab = [f"{v[1]} {v[2]}" for v in vocab]
    vocab = "\n".join(vocab)
    f.write(vocab)

np.save("emb_dir/emb", model.wv.vectors)

## Load embeddings

In [6]:
vocab = [v.split()[0] for v in open("emb_dir/vocab.txt").read().split("\n")]
emb = np.load("emb_dir/emb.npy")

In [9]:
len(vocab)

25940

In [10]:
emb.shape

(25940, 100)

In [11]:
from numpy import dot
from numpy.linalg import norm

def cosine_similarity(a, b):
    return dot(a, b)/(norm(a)*norm(b))

In [12]:
cosine_similarity(emb[vocab.index("green")], emb[vocab.index("red")])

0.9859541

In [15]:
vocab[11123]

'base,'

In [13]:
cosine_similarity(emb[vocab.index("green")], emb[11123])

0.99732995

## Pretrained embeddings

[RusVectōrēs](https://rusvectores.org/ru/models/)

In [16]:
w2v_p = Word2Vec.load("pretrained/model.bin")

UnpicklingError: could not find MARK

In [17]:
w2v_p = open("pretrained/model.txt").read().split("\n")[1:]

In [18]:
len(w2v_p)

189193

In [20]:
w2v_p[1]

'быть_VERB 0.56827927 -0.3209115 -0.019124394 -1.9356475 -0.8124491 1.657973 -1.3159128 1.1186373 -2.0223322 -3.2186239 -0.8087832 -1.6359693 0.30781016 -1.9637547 1.1508301 0.09154594 1.499977 -1.9233339 -1.395698 0.4522272 1.3875449 -0.6991994 1.1241058 -0.8872212 1.5669478 0.105251916 -0.72891355 -1.7446259 -0.39187282 -0.21783888 0.99993414 -2.8352168 0.28585172 -1.2888701 -0.7390094 0.7050435 2.6338942 0.37333074 2.078776 -0.19290456 -0.73848265 -1.7742666 1.6497376 3.7614036 -1.2978705 -0.00728586 1.3725481 -1.5491337 -0.81334364 -0.7748071 -0.61844414 0.7076229 0.53189814 -0.519196 1.2863954 0.99337226 1.1154282 -0.930117 1.572236 -1.2262355 -0.86334157 0.61980397 -0.09515494 0.5440264 -0.50054914 0.32205996 -1.49439 0.63581914 2.5604725 0.62158006 -0.9675487 1.7475013 -1.055982 1.8056836 0.29194352 0.72323376 -0.2652101 -1.5487951 -1.2479311 0.6863535 -0.5887072 -0.25386146 -1.0167173 -2.9647908 -0.96875453 -1.6287156 -0.60758674 -0.5515135 -0.4176201 0.6098959 -2.8192685 2.585

In [19]:
len(w2v_p[0].split())

301

In [21]:
vocab = []
emb = []
for line in w2v_p:
    l = line.split()
    word = l[0].split("_")[0]
    
    vec = list(map(float, l[1:]))
    
    vocab += [word]
    emb += [vec]

emb = np.array(emb)

In [22]:
len(vocab)

189193

In [23]:
emb.shape

(189193, 300)

In [24]:
cosine_similarity(emb[vocab.index("красный")], emb[vocab.index("зеленый")])

0.5960777700118979

In [25]:
cosine_similarity(emb[vocab.index("красный")], emb[vocab.index("человек")])

-0.19100238054656765

# Using with ML algorithm

In [26]:
import csv
import re

In [27]:
vocab = [v.split()[0] for v in open("emb_dir/vocab.txt").read().split("\n")]
emb = np.load("emb_dir/emb.npy")

vocab = ["<PAD>", "<UNK>"] + vocab

emb = np.insert(emb, 0, np.random.uniform(-1, 1, emb.shape[1]), axis=0)
emb = np.insert(emb, 0, np.zeros(emb.shape[1]), axis=0)

In [28]:
len(vocab)

25942

In [29]:
emb.shape

(25942, 100)

In [31]:
emb[1]

array([ 0.41583523,  0.50968206, -0.13879314, -0.16804345, -0.747004  ,
       -0.91946787,  0.6386806 , -0.22171985,  0.50008035, -0.89180565,
        0.61508083,  0.8735436 ,  0.36179537,  0.36583492, -0.11727881,
        0.38155738, -0.56035167,  0.58727276,  0.56286174,  0.35117456,
        0.87490153, -0.320434  , -0.10026085, -0.3016947 , -0.89546376,
        0.07421842,  0.45389298, -0.86732966, -0.42485312,  0.51647836,
        0.4295946 ,  0.6681166 , -0.3370198 ,  0.7573049 ,  0.16937827,
        0.76184666,  0.97491103, -0.0215477 ,  0.5299386 , -0.19537358,
        0.37821028, -0.788024  ,  0.22741619, -0.88069034, -0.16316468,
        0.07166386,  0.73954093,  0.25548416,  0.38152096,  0.9114591 ,
        0.34309018,  0.73081404, -0.78640765, -0.03425251,  0.42943048,
       -0.28751266,  0.14582887, -0.7280036 , -0.97134376,  0.24407373,
        0.3497146 , -0.18667412,  0.00170363, -0.22577421, -0.14483874,
       -0.75114655,  0.75909424,  0.6838565 , -0.0205434 , -0.13

In [32]:
def load_data(filename, delimiter, vocab, emb):
    samples = []
    labels = []
    with open(filename, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='"')
        next(reader) #skip csv header
        for row in reader:
            samples += [row[0]]
            labels += [row[1]]

    preprocessed_samples = []
    for sample in samples:
        s = sample.lower()
        s = re.sub("[^а-яА-Яa-zA-Z0-9]", " ", s)
        s = re.sub("\s+", " ", s)
        s = s.strip()
        preprocessed_samples += [s]
    
    tokenized_samples = []
    for sample in preprocessed_samples:
        s = sample.split()
        tokenized_samples += [s]
    
    digitized_samples = []
    for sample in tokenized_samples:
        s = [vocab.index(token) if token in vocab else vocab.index("<UNK>") for token in sample]
            
        digitized_samples += [s]
        
    correct_samples = []
    max_len = 128

    for sample in digitized_samples:
        if len(sample) < max_len:
            sample += [vocab.index("<PAD>")] * (max_len - len(sample))

        correct_samples += [sample[:max_len]]
        
    labels_dict = {"negative": 0, "positive": 1}

    correct_labels = [labels_dict[label] for label in labels]
    
    train_data = correct_samples[:4000]
    train_labels = correct_labels[:4000]
    test_data = correct_samples[4000:]
    test_labels = correct_labels[4000:]
    
    return train_data, train_labels, test_data, test_labels

In [33]:
train_data, train_labels, test_data, test_labels = load_data('data/imdb/imdb_dataset.csv', ',', vocab, emb)

In [34]:
len(train_data[0])

128