In [1]:
import io
from tqdm import tqdm
from itertools import islice
import numpy as np

def load_vectors(fname, limit):
  """Read up to `limit` word vectors from a text embedding file.

  The first line must hold exactly two integers; the first is used as the
  number of entries for the progress bar (presumably the fastText `.vec`
  header "count dimension" -- confirm against the file format). Every
  following line is parsed as `word v1 v2 ...`, separated by single spaces.

  Args:
    fname: path of the vector file, decoded as UTF-8 (undecodable bytes are
      dropped).
    limit: maximum number of vector lines to read; `None` reads all of them.

  Returns:
    dict mapping each word to a 1-D float numpy array. If a word occurs more
    than once, the last occurrence wins.

  Raises:
    OSError: if the file cannot be opened.
    ValueError: if the header is not two integers or a component is not a
      valid float.
  """
  data = {}
  # The context manager closes the handle even when parsing raises.
  with io.open(fname, 'r', encoding = 'utf-8', newline = '\n', errors = 'ignore') as fin:
    n, _dim = map(int, fin.readline().split())
    # Cap the progress total at the header count so the bar stays accurate
    # when `limit` exceeds the number of entries (or is None).
    total = n if limit is None else min(limit, n)
    for line in tqdm(islice(fin, limit), total = total):
      tokens = line.rstrip().split(' ')
      data[tokens[0]] = np.array(list(map(float, tokens[1:])))
  return data

# Load the first 100,000 vectors of the embedding file into the global lookup table.
vecs = load_vectors(fname = 'crawl-300d-2M.vec', limit = 100000)

100%|██████████| 100000/100000 [00:15<00:00, 6412.90it/s]


In [2]:
def get_k_nearest_neighbors(vec, k):
  """Return the `k` words of the global `vecs` table closest to `vec`.

  Closeness is the Euclidean norm of the difference; ties are broken by
  comparing the words themselves.

  Args:
    vec: query vector, broadcast-compatible with the stored vectors.
    k: number of neighbors wanted (non-negative integer).

  Returns:
    tuple of at most `k` words ordered from nearest to farthest; an empty
    tuple when `vecs` is empty.
  """
  import heapq  # local import so this cell does not depend on another cell's imports

  # Lazily score every word; nsmallest keeps only the k best candidates
  # instead of sorting the entire vocabulary.
  scored = ((np.linalg.norm(vec - other), word) for word, other in vecs.items())
  nearest = heapq.nsmallest(k, scored)
  return tuple(word for _, word in nearest)

# Show the 20 closest words to two sample queries (each query word is its own nearest hit).
print(get_k_nearest_neighbors(vec = vecs['Paris'], k = 20))
print(get_k_nearest_neighbors(vec = vecs['brother'], k = 20))

('Paris', 'France', 'Parisian', 'paris', 'Lyon', 'London', 'PARIS', 'French', 'Lille', 'Marseille', 'Toulouse', 'Bordeaux', 'Marseilles', 'Strasbourg', 'Berlin', 'Le', 'Versailles', 'Nantes', 'Brussels', 'Grenoble')
('brother', 'sister', 'cousin', 'brothers', 'brother-in-law', 'uncle', 'nephew', 'father', 'son', 'sister-in-law', 'aunt', 'sisters', 'daughter', 'niece', 'dad', 'cousins', 'Brother', 'mother', 'siblings', 'grandfather')


In [None]:
# Vector analogy: Paris - France + Germany; display the single nearest word.
get_k_nearest_neighbors(vec = vecs['Paris'] - vecs['France'] + vecs['Germany'], k = 1)


In [None]:
# Vector analogy: brother - man + woman; display the single nearest word.
get_k_nearest_neighbors(vec = vecs['brother'] - vecs['man'] + vecs['woman'], k = 1)

In [None]:
# Vector analogy: king - man + woman; display the five nearest words.
get_k_nearest_neighbors(vec = vecs['king'] - vecs['man'] + vecs['woman'], k = 5)


In [None]:
from sklearn.datasets import fetch_20newsgroups
# Restrict the 20 Newsgroups corpus to four categories; both splits are
# shuffled with the same fixed random_state.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(
  subset = 'train', categories = categories, shuffle = True, random_state = 42
)
twenty_test = fetch_20newsgroups(
  subset = 'test', categories = categories, shuffle = True, random_state = 42
)
# Documents (`.data`) and labels (`.target`) of each split.
X_train, y_train = twenty_train.data, twenty_train.target
X_test, y_test = twenty_test.data, twenty_test.target


In [None]:
# Reload with a 2,000,000-entry limit; this replaces the smaller `vecs` table loaded earlier.
vecs = load_vectors(fname = 'crawl-300d-2M.vec', limit = 2000000)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: TF-IDF features (vocabulary fitted on the training split only)
# fed into a one-vs-rest logistic regression.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

logreg = LogisticRegression(solver = 'liblinear', multi_class = 'ovr', random_state = 1)

# fit() returns the estimator itself, so fitting and predicting can be chained.
y_pred = logreg.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))


In [None]:
# Print the shape of the TF-IDF training matrix produced by tfidf.fit_transform(X_train).
print(X_train_tfidf.shape)


In [None]:
from sklearn.decomposition import PCA

# Reduce the TF-IDF features to 300 principal components, then refit the
# same `logreg` estimator on the reduced features.
pca = PCA(n_components = 300, random_state = 1)
# .toarray() yields a plain ndarray; .todense() would return np.matrix,
# which current scikit-learn versions refuse as estimator input.
X_train_tfidf_pca = pca.fit_transform(X_train_tfidf.toarray())
X_test_tfidf_pca = pca.transform(X_test_tfidf.toarray())

logreg.fit(X_train_tfidf_pca, y_train)
y_pred_pca = logreg.predict(X_test_tfidf_pca)
print(accuracy_score(y_test, y_pred_pca))


In [None]:
import numpy as np

# Despite its name, `zero` is the element-wise mean of every loaded vector,
# not an all-zeros vector; it is the fallback for words missing from `vecs`.
zero = sum(vecs.values()) / len(vecs)
def text2vec(text):
  """Embed `text` as the average of its whitespace-separated word vectors.

  Words absent from the global `vecs` table contribute the global fallback
  vector `zero` instead.

  Args:
    text: string to embed.

  Returns:
    numpy array holding the mean word vector; a copy of `zero` when the text
    contains no words.
  """
  words = text.split()
  if not words:
    # Empty or whitespace-only text used to divide by zero here; fall back
    # to the same vector used for unknown words.
    return np.array(zero)
  # sum() starts from the integer 0, so the first addition already creates a
  # new array and the vectors stored in `vecs` are never modified.
  return sum(np.asarray(vecs.get(w, zero)) for w in words) / len(words)


In [None]:
# Embed every document of both splits with text2vec.
X_train_vec = [text2vec(doc) for doc in X_train]
X_test_vec = [text2vec(doc) for doc in X_test]


In [None]:
# Refit the shared `logreg` estimator on the averaged word vectors and score it on the test split.
y_pred_vec = logreg.fit(X_train_vec, y_train).predict(X_test_vec)
print(accuracy_score(y_test, y_pred_vec))



In [None]:
# Build a (vocabulary size x dim) matrix whose row i holds the embedding of
# the TF-IDF term with column index i; terms missing from `vecs` get `zero`.
dim = 300
vocab = np.zeros((len(tfidf.vocabulary_), dim))
for term, row in tqdm(tfidf.vocabulary_.items()):
  vocab[row] = vecs.get(term, zero)

In [None]:
# Multiply the TF-IDF matrices by the vocabulary embedding matrix, giving
# each document a TF-IDF-weighted sum of its term vectors.
X_train_weighted = X_train_tfidf @ vocab
X_test_weighted = X_test_tfidf @ vocab


In [None]:
# Refit the shared `logreg` estimator on the TF-IDF-weighted embeddings and score it on the test split.
y_pred_weighted = logreg.fit(X_train_weighted, y_train).predict(X_test_weighted)
print(accuracy_score(y_test, y_pred_weighted))


In [None]:
import pandas as pd

# Keep only the 'review' column of the CSV (read as latin-1).
reviews = pd.read_csv(
  'imdb_master.csv',
  encoding = 'latin-1',
)['review']


In [None]:
from gensim.models import Word2Vec

# Tokenize each review on whitespace and train Word2Vec on the token lists.
# min_count = 1 keeps every token; seed and workers are both pinned to 1.
sentences = [review.split() for review in reviews]
model = Word2Vec(sentences, min_count = 1, seed = 1, workers = 1)


In [None]:
# Short alias for the trained model's `wv` attribute, used by the queries below.
wv = model.wv


In [None]:

# Repeat the earlier neighbor and analogy queries with the Word2Vec model
# trained on the reviews; results are separated by a blank line.
print(wv.similar_by_word('Paris'), end = '\n\n')
print(wv.similar_by_word('brother'), end = '\n\n')
print(wv.similar_by_vector(wv['king'] - wv['man'] + wv['woman'], 1))