## Starting my own model

In [1]:
import pandas as pd
import pickle
from gensim.models.phrases import Phrases, Phraser
from collections import defaultdict  # For word frequency

In [2]:
# Restore the cleaned DataFrame from disk. Despite the ".csv" extension the
# file is opened in binary mode and decoded with pickle, not parsed as CSV.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files that come from a trusted source.
with open("df_clean.csv", mode="rb") as pickle_file:
    df_clean = pickle.load(pickle_file)

In [3]:
# Keep only the rows whose cleaned text does NOT contain the substring "gt".
# NOTE(review): this is a substring match, so rows containing words such as
# "length" or "strength" are dropped as well — confirm that this is intended
# (presumably the target is leftover "&gt;" entity residue; verify).
contains_gt = df_clean['clean'].str.contains("gt")
df_clean = df_clean[~contains_gt]

In [4]:
# Tokenise every cleaned document on whitespace: one list of tokens per row.
sent = [document.split() for document in df_clean['clean']]

In [6]:
# Fit the phrase detector on the tokenised corpus. Per the gensim docs,
# min_count=30 ignores words and bigrams whose total collected count is
# lower than 30.
phrases = Phrases(sentences=sent, min_count=30)

In [8]:
# Wrap the fitted Phrases model in a Phraser, then apply it to the whole
# tokenised corpus by indexing with the list of token lists. Later cells
# slice `sentences` to pick the subset used for vocabulary building and training.
bigram = Phraser(phrases)
sentences = bigram[sent]

In [9]:
import multiprocessing

from gensim.models import Word2Vec

In [10]:
# Number of CPU cores reported for this machine; used below to size the
# Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [11]:
# Create an untrained Word2Vec model; the vocabulary is built and the model
# trained in later cells. Parameter meanings follow the gensim Word2Vec docs.
# NOTE(review): `size` is the gensim 3.x keyword for the vector dimensionality
# (renamed `vector_size` in gensim 4.x) — confirm the installed version.
w2v_model = Word2Vec(min_count=20,       # ignore words with total frequency below 20
                     window=2,           # max distance between current and predicted word
                     size=300,           # dimensionality; same 300 as the GoogleNews vectors merged in later
                     sample=6e-5,        # threshold for down-sampling high-frequency words
                     alpha=0.03,         # initial learning rate
                     min_alpha=0.0007,   # learning rate decays linearly down to this value
                     negative=20,        # number of negative-sampling noise words
                     # Leave one core free, but never request zero workers:
                     # on a single-core machine `cores - 1` would be 0 and
                     # no training thread would be started.
                     workers=max(1, cores - 1))

In [12]:
# Build the model vocabulary from the first 100,000 phrase-merged sentences
# only (the same slice that is used for training further down).
vocab_corpus = sentences[:100000]
w2v_model.build_vocab(vocab_corpus)

### Google Model

In [11]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews vectors from a binary word2vec-format file.
# The path is relative, so the file must be in the working directory.
filename = 'GoogleNews-vectors-negative300.bin'
google_model = KeyedVectors.load_word2vec_format(fname=filename, binary=True)

In [16]:
# Sanity check on the pretrained vectors: the ten nearest neighbours of "man"
# (topn=10 is the library default, spelled out here for clarity).
google_model.most_similar(positive=["man"], topn=10)

[('woman', 0.7664012312889099),
 ('boy', 0.6824870705604553),
 ('teenager', 0.6586930155754089),
 ('teenage_girl', 0.6147903800010681),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119724273682),
 ('Robbery_suspect', 0.5584410429000854),
 ('teen_ager', 0.5549196600914001),
 ('men', 0.5489762425422668)]

### Fine-Tuning

In [14]:
# Seed the model with the pretrained GoogleNews weights. Per the gensim docs,
# only words already in the model's vocabulary adopt the file's vectors and no
# new words are added. lockf=1.0 lets the merged vectors keep being updated
# during the training that follows.
w2v_model.intersect_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
    lockf=1.0,
)

In [15]:
# Fine-tune for 30 epochs on the same 100,000-sentence slice the vocabulary
# was built from; corpus_count was recorded by build_vocab, so total_examples
# matches the corpus passed in. The call is left as the cell's last
# expression so its return value is displayed.
train_corpus = sentences[:100000]
w2v_model.train(train_corpus, total_examples=w2v_model.corpus_count, epochs=30)

(26590687, 58773780)

In [26]:
# Inspect the fine-tuned embedding: the ten nearest neighbours of "incel"
# (topn=10 is the library default, spelled out here for clarity).
w2v_model.wv.most_similar(positive=["incel"], topn=10)

[('virgin', 0.3080313801765442),
 ('low_smv', 0.27676504850387573),
 ('celibate', 0.23913732171058655),
 ('experienced', 0.2344883382320404),
 ('dork', 0.23057860136032104),
 ('medium_cache', 0.23048681020736694),
 ('loser', 0.2278689593076706),
 ('lurker', 0.2260952889919281),
 ('partner_count', 0.2256605625152588),
 ('pump_dump', 0.2186303734779358)]