# Models of Word2Vec 

# Learning how to use:

## Gensim Tutorial with Wikipedia article

### Creating Corpus

In [None]:
import bs4 as bs  
import urllib.request  
import re  
import nltk

scrapped_data = urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence')  
article = scrapped_data .read()

parsed_article = bs.BeautifulSoup(article,'lxml')

paragraphs = parsed_article.find_all('p')

article_text = ""

for p in paragraphs:  
    article_text += p.text

In [2]:
print(type(article), "\n", 
      article[:100])

<class 'bytes'> 
 b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'


In [3]:
print(type(parsed_article), "\n",
      parsed_article.find("title")) #html page


<class 'bs4.BeautifulSoup'> 
 <title>Artificial intelligence - Wikipedia</title>


In [4]:
print(type(article_text), "\n",
      article_text[:100])

<class 'str'> 
 
In computer science,  artificial intelligence (AI), sometimes called machine intelligence, is intel


### Preprocessing

In [5]:
# Cleaing the text
processed_article = article_text.lower()  
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )  
processed_article = re.sub(r'\s+', ' ', processed_article)

In [6]:
processed_article[:100]

' in computer science artificial intelligence ai sometimes called machine intelligence is intelligenc'

In [7]:
import spacy

In [8]:
nlp = spacy.load('en')

In [9]:
doc = nlp(processed_article)
for i, token in enumerate(doc.sents):
    print('-->Sentence %d: %s' % (i, token.text))

-->Sentence 0:  in computer science artificial intelligence ai sometimes called machine intelligence is intelligence demonstrated by machines in contrast to the natural intelligence displayed by humans and animals colloquially the term artificial intelligence is used to describe machines that mimic cognitive functions that humans associate with other human minds such as learning and problem solving as machines become increasingly capable tasks considered to require intelligence are often removed from the definition of ai a phenomenon known as the ai effect a quip in tesler s theorem says ai is whatever hasn t been done yet for instance optical character recognition is frequently excluded from things considered to be ai having become a routine technology modern machine capabilities generally classified as ai include successfully understanding human speech competing at the highest level in strategic game systems such as chess and go autonomously operating cars intelligent routing in cont

In [10]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [11]:
spacy_stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [125]:
all_words = [token for token in doc.sents]

In [126]:
all_words[0][2].is_stop

False

In [127]:
for i in range(len(all_words)):
    # if it's not a stop word or punctuation mark, add it to our article!
    all_words[i] = [w.lemma_ for w in all_words[i] if not w.is_stop]

In [128]:
all_words #list of lists

[['computer',
  'science',
  'artificial',
  'intelligence',
  'ai',
  'call',
  'machine',
  'intelligence',
  'intelligence',
  'demonstrate',
  'machine',
  'contrast',
  'natural',
  'intelligence',
  'display',
  'human',
  'animal',
  'colloquially',
  'term',
  'artificial',
  'intelligence',
  'describe',
  'machine',
  'mimic',
  'cognitive',
  'function',
  'human',
  'associate',
  'human',
  'mind',
  'learning',
  'problem',
  'solve',
  'machine',
  'increasingly',
  'capable',
  'task',
  'consider',
  'require',
  'intelligence',
  'remove',
  'definition',
  'ai',
  'phenomenon',
  'know',
  'ai',
  'effect',
  'quip',
  'tesler',
  'theorem',
  'ai',
  'hasn',
  't',
  'instance',
  'optical',
  'character',
  'recognition',
  'frequently',
  'exclude',
  'thing',
  'consider',
  'ai',
  'have',
  'routine',
  'technology',
  'modern',
  'machine',
  'capability',
  'generally',
  'classify',
  'ai',
  'include',
  'successfully',
  'understand',
  'human',
  'speech'

### Creating Word2Vec Model

In [129]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=2) 

In [130]:
vocabulary = word2vec.wv.vocab  
print(len(vocabulary))
print(vocabulary)  

1070
{'computer': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a90b8>, 'science': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9080>, 'artificial': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a90f0>, 'intelligence': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9128>, 'ai': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a91d0>, 'call': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9160>, 'machine': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9240>, 'demonstrate': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9198>, 'contrast': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a92b0>, 'natural': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9208>, 'display': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9278>, 'human': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a92e8>, 'animal': <gensim.models.keyedvectors.Vocab object at 0x7fc0397a9390>, 'term': <gensim.models.keyedvectors.Vocab object at 0x7

### Model Analysis

In [131]:
v1 = word2vec.wv['artificial']   #vector for the word

In [132]:
v1

array([-1.0011374e-03, -1.1876627e-03,  3.5678297e-03,  4.7016554e-03,
        1.2425583e-02, -3.1497229e-03,  7.8962073e-03, -6.7504519e-03,
        7.2532906e-03,  4.4042361e-03,  2.6449893e-04, -1.9917439e-03,
        1.1857611e-03,  2.7442928e-03, -1.1719433e-02,  4.0582476e-05,
        5.6022271e-03, -4.0285396e-03, -2.1333680e-03, -1.2278679e-03,
        6.5676826e-03, -4.0238095e-03,  2.1322779e-03,  6.7554684e-03,
       -8.3139550e-04, -8.8217184e-03,  2.1870395e-04, -6.4608511e-03,
       -8.2796132e-03, -4.1018575e-04, -7.9176249e-03,  2.8374000e-03,
        2.2966918e-03,  1.0042547e-02, -6.8599260e-03, -4.5062797e-03,
       -4.1345318e-04, -3.1231092e-03,  2.5921324e-03, -5.0421548e-03,
       -4.6865758e-03, -8.5072210e-03, -1.5624474e-03, -2.5031993e-03,
       -4.8998734e-03,  4.1424078e-03,  7.4683055e-03, -3.8514652e-03,
        3.2686093e-03,  5.9349136e-04,  4.8933376e-04, -8.2950486e-04,
       -4.0827552e-03, -3.3352512e-04,  6.0979971e-03,  2.2259193e-04,
      

In [133]:
sim_words = word2vec.wv.most_similar('intelligence', topn=20)  

In [134]:
sim_words

[('ai', 0.7749153971672058),
 ('human', 0.7547346353530884),
 ('learn', 0.7315230369567871),
 ('system', 0.7218893766403198),
 ('machine', 0.7061141729354858),
 ('application', 0.6937988996505737),
 ('artificial', 0.692389965057373),
 ('include', 0.679070234298706),
 ('use', 0.6708467602729797),
 ('know', 0.6683763861656189),
 ('problem', 0.6606654524803162),
 ('model', 0.6571164727210999),
 ('network', 0.652495265007019),
 ('game', 0.6483724117279053),
 ('theory', 0.6467932462692261),
 ('algorithm', 0.6414783000946045),
 ('car', 0.6391733884811401),
 ('logic', 0.6334788799285889),
 ('decision', 0.6291033029556274),
 ('possible', 0.6261762380599976)]

# Trying to use the previous model with reddit data

In [35]:
import pandas as pd
import re
from gensim.models import Word2Vec
import spacy
nlp = spacy.load('en')

In [36]:
df = pd.read_csv("./data/reddit/cm/MRActivism_comments.csv")

In [37]:
serie = df.Comment

In [38]:
serie.head()

0    The Office of Civil Rights is an organization ...
1    I suppose I'm most concerned as to how an offi...
2    How do we do it?  What can I do?  I want this ...
3    Sorry, I'll try to get to this later. This wee...
4    So how about some cites? Where is this new sta...
Name: Comment, dtype: object

In [39]:
serie.shape

(348,)

In [40]:
serie = serie[serie != '[deleted]']
serie.shape

(330,)

In [80]:
my_stop_words = [u'say', u'\s', u'Mr', u'be', u'said', u'says', u'saying', u's', u'’s', u'\n\n',
                 u' ', u's', u'\n', u't']
for stopword in my_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [81]:
stl= pd.Series.tolist(serie)

In [94]:
texts, article, skl_texts = [], [], []
for pre_comment in stl:
    if "www" in pre_comment: 
        continue
    comment = pre_comment.lower()  
    comment = re.sub('[^a-zA-Z]', ' ', comment)  
    comment = re.sub(r'\s+', ' ', comment)
    try:
        doc = nlp(comment)
    except:
        continue
    for w in doc:
        # if it's not a stop word or punctuation mark, add it to our article!
        if not w.is_stop:
            # we add the lematized version of the word
            article.append(w.lemma_)
        # assume each comment as a document
        if w.text == doc[-1].text:
            skl_texts.append(' '.join(article))
            texts.append(article)
            article = []

In [95]:
texts

[['office', 'civil', 'right', 'organization', 'support', 'woman'],
 ['despite',
  'gender',
  'neutral',
  'mandate',
  'consistently',
  'work',
  'support',
  'woman'],
 ['new',
  'preponderance',
  'evidence',
  'standard',
  'outrageous',
  'purely',
  'effort',
  'place',
  'responsibility',
  'sexual',
  'activity',
  'young',
  'man',
  'away',
  'woman'],
 ['suppose',
  'm',
  'concerned',
  'office',
  'government',
  'get',
  'decision',
  'like',
  'face',
  'point',
  'legal',
  'criticism',
  'party',
  'stick',
  'finger',
  'ear',
  'pretend',
  'aren',
  'question',
  'answer',
  'm',
  'surprised',
  'office',
  'decision',
  'm',
  'perplex',
  'away',
  'defend',
  'know',
  'maybe',
  'valid',
  'argument',
  'haven',
  'hear',
  'know',
  'give',
  'win',
  'try'],
 [],
 ['want', 'vermin', 'go'],
 ['sorry',
  'will',
  'try',
  'later',
  'week',
  'turn',
  'busy',
  'little',
  'time',
  'reddit',
  'reading'],
 ['cite', 'new', 'standard'],
 ['wow', 'standard'],


In [96]:
word2vec = Word2Vec(texts, min_count=2) 

In [97]:
vocabulary = word2vec.wv.vocab  
print(len(vocabulary))
print(vocabulary)  

848
{'office': <gensim.models.keyedvectors.Vocab object at 0x7f60ac5e19b0>, 'civil': <gensim.models.keyedvectors.Vocab object at 0x7f60ac5e15f8>, 'right': <gensim.models.keyedvectors.Vocab object at 0x7f60ac5e1898>, 'organization': <gensim.models.keyedvectors.Vocab object at 0x7f60ac5e17f0>, 'support': <gensim.models.keyedvectors.Vocab object at 0x7f60af6c6f28>, 'woman': <gensim.models.keyedvectors.Vocab object at 0x7f60af3ddcf8>, 'despite': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab0f0>, 'gender': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab2e8>, 'neutral': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab080>, 'work': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab128>, 'new': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab1d0>, 'preponderance': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab160>, 'evidence': <gensim.models.keyedvectors.Vocab object at 0x7f60af9ab668>, 'standard': <gensim.models.keyedvectors.Vocab object at 0x7f60

In [98]:
v1 = word2vec.wv['men']   #vector for the word
v1

array([ 5.4545337e-03,  4.3074433e-03, -2.7030143e-03, -2.1562264e-03,
        3.7910382e-04, -4.5982129e-03,  1.3795113e-03, -2.2752469e-03,
       -1.2752709e-03, -1.0174123e-03,  4.2846384e-03, -2.7380444e-03,
       -5.5503887e-03,  2.9758092e-05, -2.2586039e-03, -4.4150888e-03,
        4.5763780e-03, -2.6292074e-03,  4.4808030e-04,  2.6236821e-04,
        4.4415216e-03,  8.0928142e-04, -4.0190034e-03,  2.3514538e-03,
       -3.1463818e-03, -4.0261215e-03, -2.9024121e-03,  7.1974960e-04,
       -2.4463022e-03, -1.1734637e-03, -2.8011713e-03,  3.8653687e-03,
       -2.2390855e-03,  1.8478973e-05, -1.0443471e-03, -2.2866947e-03,
       -4.8838104e-03, -2.0998067e-03, -8.2515483e-04, -2.1683152e-03,
        3.8271653e-03,  8.3259575e-04,  2.2258027e-03,  2.3770430e-03,
        3.6067336e-03,  4.6805190e-03, -9.3509658e-04, -1.7431705e-03,
        3.6098603e-03, -2.0570266e-04, -2.7502950e-03, -3.3506149e-04,
       -3.5616157e-03, -4.0100231e-03, -3.7848449e-03, -2.5597177e-03,
      

In [99]:
sim_words = word2vec.wv.most_similar('men', topn=20)  
sim_words

[('likely', 0.31682848930358887),
 ('o', 0.31154245138168335),
 ('small', 0.29945477843284607),
 ('not', 0.27216869592666626),
 ('reason', 0.26910656690597534),
 ('notion', 0.25408294796943665),
 ('lobby', 0.25321266055107117),
 ('paragraph', 0.24737334251403809),
 ('female', 0.24525147676467896),
 ('provide', 0.24242256581783295),
 ('assumption', 0.2373969405889511),
 ('imply', 0.22561395168304443),
 ('turn', 0.22260120511054993),
 ('situation', 0.21924252808094025),
 ('possibility', 0.21677349507808685),
 ('till', 0.2121698260307312),
 ('original', 0.21079513430595398),
 ('join', 0.21007461845874786),
 ('shirt', 0.20914912223815918),
 ('end', 0.20755378901958466)]

In [100]:
sim_words = word2vec.wv.most_similar('woman', topn=20)  
sim_words

[('right', 0.4868926405906677),
 ('help', 0.42004677653312683),
 ('boy', 0.4143672287464142),
 ('oo', 0.41062772274017334),
 ('feminist', 0.39699387550354004),
 ('point', 0.3948400616645813),
 ('need', 0.3774545192718506),
 ('mod', 0.37719085812568665),
 ('know', 0.3771521747112274),
 ('r', 0.36778849363327026),
 ('gt', 0.36160486936569214),
 ('thing', 0.36093980073928833),
 ('assault', 0.3592444658279419),
 ('contact', 0.3554409444332123),
 ('mrm', 0.3456125557422638),
 ('anti', 0.3427159786224365),
 ('erection', 0.34067127108573914),
 ('subreddit', 0.33871686458587646),
 ('people', 0.3383009433746338),
 ('exactly', 0.3382965326309204)]