# Word2Vec model

## Cleaning data

In [7]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

In [11]:
# Load the small English pipeline by its full package name: the 'en' shortcut
# link only exists on spaCy 2.x installs and is not accepted by spaCy 3.
# The parser and NER components are disabled because the cleaning step only
# reads token lemmas and stop-word flags, so running them is wasted time.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [2]:
df = pd.read_csv("./../data/reddit/cm/Masculism_comments.csv")

In [3]:
df.head()

Unnamed: 0,ID,Comment,Author,Score,Parent id,Publish Date
0,c2lqouq,It is important to realize *why* so much atten...,AHumanThatListens,5,t3_knu87,2011-09-22 09:57:33
1,c2lqp9j,There is also something else that's rarely bro...,[deleted],6,t3_knuah,2011-09-22 10:00:01
2,c2lqpvn,I wish this subreddit good luck. I hope this ...,junkeee999,6,t3_kntzz,2011-09-22 10:03:35
3,c2lqpwb,"Good point. Men are not ""sexualizable,"" right?...",AHumanThatListens,4,t1_c2lqp9j,2011-09-22 10:03:42
4,c2lqq9l,I think the more important point is that when ...,godlessaltruist,11,t1_c2lqouq,2011-09-22 10:05:36


In [6]:
df.isnull().sum()

ID              0
Comment         0
Author          0
Score           0
Parent id       0
Publish Date    0
dtype: int64

In [16]:
df.shape

(14249, 6)

In [8]:
def cleaning(doc):
    """Lemmatize a document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``is_stop`` (e.g. a spaCy Doc).

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left (such short comments are not useful for training).
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Guard clause: too little content left to be worth keeping
    if len(lemmas) <= 2:
        return None

    return ' '.join(lemmas)

In [9]:
# Keep only letters and apostrophes (every other run of characters becomes a
# single space) and lowercase the result.
# Built as a list rather than a generator: a generator is exhausted after the
# first pass, so re-running the cell that consumes it would silently produce
# an empty corpus.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['Comment']]

In [12]:
%%time
# Run the spaCy pipeline in batches and clean each resulting Doc.
# `n_threads` is no longer passed: it has been deprecated since spaCy 2.1 and
# is not part of the spaCy 3 `Language.pipe` signature.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

Time to clean up everything: 6.42 mins
CPU times: user 10min 51s, sys: 1min 41s, total: 12min 33s
Wall time: 6min 25s


In [13]:
df_clean = pd.DataFrame({'clean': txt})
df_clean.shape

(14249, 1)

In [14]:
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(12734, 1)

## Creating the vocabulary

In [20]:
from gensim.models.phrases import Phrases, Phraser

In [52]:
sent = [row.split() for row in df_clean['clean']]

In [54]:
phrases = Phrases(sent, min_count=30, progress_per=10000)

In [55]:
bigram = Phraser(phrases)

In [56]:
sentences = bigram[sent]

In [57]:
# Count how often each token occurs across all sentences.
# The loop variables deliberately avoid the name `sent`: rebinding it here would
# overwrite the notebook-level list used by `sentences = bigram[sent]`, so
# re-running that cell afterwards would operate on a single sentence.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

21938

In [58]:
sorted(word_freq, key=word_freq.get, reverse=True)[:11]

['man',
 'woman',
 'think',
 'feminist',
 'people',
 'like',
 'rape',
 "'",
 'say',
 'thing',
 'feminism']

## Starting the Model

In [90]:
import multiprocessing

from gensim.models import Word2Vec

In [91]:
cores = multiprocessing.cpu_count()
cores

16

In [92]:
# Configure the Word2Vec model (no training happens here; no corpus is passed).
# Parameter meanings below follow the gensim Word2Vec documentation:
#   min_count - words with a total frequency below this are ignored
#   window    - maximum distance between the current and the predicted word
#   size      - dimensionality of the word vectors
#   sample    - threshold for down-sampling high-frequency words
#   alpha     - initial learning rate
#   min_alpha - value the learning rate decays to during training
#   negative  - number of "noise" words drawn for negative sampling
#   workers   - number of worker threads (one core is left free here)
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 names it
# `vector_size` - confirm the installed version before upgrading.
# NOTE(review): no `seed` is given and several workers are used, so the
# similarity scores are presumably not reproducible run to run - confirm.
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)

In [93]:
%%time
w2v_model.build_vocab(sentences, progress_per=10000)

CPU times: user 3.34 s, sys: 28 ms, total: 3.36 s
Wall time: 3.37 s


In [94]:
%%time
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

CPU times: user 3min 18s, sys: 1.11 s, total: 3min 19s
Wall time: 1min 37s


(8311380, 15990960)

In [95]:
w2v_model.init_sims(replace=True)

In [96]:
w2v_model.wv.most_similar(positive=["man"])

[('woman', 0.8259655833244324),
 ('structurally', 0.7188979983329773),
 ('victimize', 0.7095577716827393),
 ('gtman', 0.6805514097213745),
 ('gtwoman', 0.6772775650024414),
 ('supposedly', 0.6742631196975708),
 ('minimal', 0.6638935804367065),
 ('needy', 0.6635546088218689),
 ('lgbtq', 0.6602290868759155),
 ('receiver', 0.6591845750808716)]

In [117]:
w2v_model.wv.most_similar(positive=["mgtow"])

[('pua', 0.9059526920318604),
 ('incel', 0.8711636066436768),
 ('bitter', 0.8563239574432373),
 ('redpill', 0.8489885330200195),
 ('faction', 0.8479870557785034),
 ('homophobic', 0.8384590744972229),
 ('wary', 0.8337081670761108),
 ('vent', 0.8286869525909424),
 ('bash', 0.8209365606307983),
 ('vitriolic', 0.8129596710205078)]

In [98]:
w2v_model.wv.most_similar(positive=["incel"])

[('fling', 0.9197372794151306),
 ('moan', 0.9174047112464905),
 ('witty', 0.9127354621887207),
 ('flame', 0.9085283279418945),
 ('neckbeard', 0.9030858278274536),
 ('disgust', 0.900199830532074),
 ('excited', 0.8980779647827148),
 ('chill', 0.897413969039917),
 ('vitriolic', 0.8946410417556763),
 ('douchebag', 0.8921919465065002)]

In [122]:
w2v_model.wv.most_similar(positive=["feminism"])

[('movement', 0.8342776894569397),
 ('egalitarian', 0.8301292061805725),
 ('brand', 0.8225247859954834),
 ('anti_feminism', 0.8224219083786011),
 ('masculism', 0.8160293102264404),
 ('oppose', 0.796680748462677),
 ('pertain', 0.795570969581604),
 ('mrm', 0.7916226983070374),
 ('gender_equality', 0.7915030717849731),
 ('monolithic', 0.7891660928726196)]

In [100]:
w2v_model.wv.similarity("mra", "masculism")

0.6883935

In [101]:
w2v_model.wv.similarity("mra", "mgtow")

0.71136

In [102]:
w2v_model.wv.similarity("mra", "incel")

0.5894016

In [103]:
w2v_model.wv.similarity("incel", "mgtow")

0.8711634

In [104]:
w2v_model.wv.similarity("redpill", "mgtow")

0.8489885

In [105]:
w2v_model.wv.similarity("redpill", "mra")

0.71640015

In [106]:
w2v_model.wv.similarity("mra", "house")

-0.018870939

In [107]:
w2v_model.wv.similarity("man", "woman")

0.8259655

In [109]:
w2v_model.wv.doesnt_match(['incel', 'mgtow', 'mra', 'mensright'])

'mra'

In [110]:
def tsnescatterplot(model, word, list_names):
    """Plot a 2-D t-SNE projection of a query word and related words.

    Three groups of word vectors are projected together and drawn with seaborn:
    the query word (red), the words returned by ``model.wv.most_similar`` for
    it (blue), and the words given in ``list_names`` (green).

    Parameters
    ----------
    model : object exposing ``wv``
        ``model.wv`` must support ``most_similar([...])`` and lookup of a list
        of words (e.g. a trained gensim Word2Vec model).
    word : str
        The query word.
    list_names : iterable of str
        Additional words to plot alongside the query word and its neighbours.

    Returns
    -------
    None
        The figure is drawn on a new matplotlib figure.

    Notes
    -----
    Every word must be known to ``model.wv``; the lookup error for an unknown
    word is propagated unchanged.
    Uses ``np``, ``pd``, ``PCA``, ``TSNE``, ``plt`` and ``sns`` from the
    notebook namespace. NOTE(review): their imports are not in the cells
    visible here - confirm they run before this function is called.
    """
    # Collect labels and colours up front so all vectors come from one lookup
    neighbours = [similar_word for similar_word, _ in model.wv.most_similar([word])]
    extra_words = list(list_names)
    word_labels = [word] + neighbours + extra_words
    color_list = (['red']
                  + ['blue'] * len(neighbours)
                  + ['green'] * len(extra_words))

    # One (n_words, vector_size) array; no hard-coded dimensionality and no
    # repeated np.append copies.
    arrays = model.wv[word_labels]
    n_samples, n_features = arrays.shape

    # PCA pre-reduction to at most 19 components. PCA cannot return more
    # components than there are samples or features, so cap the value instead
    # of failing when only a few words are plotted.
    n_components = min(19, n_samples, n_features)
    reduc = PCA(n_components=n_components).fit_transform(arrays)

    # Global numpy print setting kept from the original implementation
    np.set_printoptions(suppress=True)

    # t-SNE down to 2 dimensions; perplexity has to stay below the number of
    # samples, so it is capped for small word lists (15 otherwise).
    perplexity = min(15, n_samples - 1)
    Y = TSNE(n_components=2, random_state=0, perplexity=perplexity).fit_transform(reduc)

    # Named plot_df so the notebook-level `df` is not shadowed
    plot_df = pd.DataFrame({'x': Y[:, 0],
                            'y': Y[:, 1],
                            'words': word_labels,
                            'color': color_list})

    fig, ax = plt.subplots()
    fig.set_size_inches(9, 9)

    # Scatter only (no regression line), drawn explicitly on the new axes
    sns.regplot(data=plot_df,
                x="x",
                y="y",
                fit_reg=False,
                marker="o",
                scatter_kws={'s': 40,
                             'facecolors': plot_df['color']},
                ax=ax)

    # Label every point with its word, coloured like its marker
    for row in plot_df.itertuples(index=False):
        ax.text(row.x,
                row.y,
                '  ' + row.words.title(),
                horizontalalignment='left',
                verticalalignment='bottom',
                size=15,
                color=row.color,
                weight='normal')

    # Margin around the point cloud so the labels are not clipped
    ax.set_xlim(Y[:, 0].min()-50, Y[:, 0].max()+50)
    ax.set_ylim(Y[:, 1].min()-50, Y[:, 1].max()+50)

    ax.set_title('t-SNE visualization for {}'.format(word.title()))