<a href="https://colab.research.google.com/github/Avinash9k5r/python-model-on-words/blob/master/WordModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting Started

Setting up the Environment


In [0]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time # To time our operations
from collections import defaultdict #For word frequency
import spacy  #For preprocessing
import logging # Setting up the loggings to monitor system
# Emit INFO-level log records as "LEVEL - HH:MM:SS: message" so library progress is visible in cell output.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt ='%H:%M:%S',level=logging.INFO)


# Preprocessing

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate the current Colab user, then give PyDrive the application-default
# credentials so that `drive` can be used to fetch files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Google Drive file ID of the Simpsons dialogue CSV, i.e. the `id=` query
# parameter of the sharing link https://drive.google.com/open?id=<FILE_ID>.
# CreateFile expects this bare ID, not the full URL.
file_id = '1EKfQxxUI-WbHWN9yA-Y3z9Da5URA7gHY'  # replace the id with id of file you want to access

# Download the file into the notebook's working directory.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('simpsons_dataset.csv')

# Read file as pandas dataframe (pandas is already imported as `pd` in the setup cell)
df = pd.read_csv("simpsons_dataset.csv")

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.locked_file'

During handling of the above exception, another exceptio

In [0]:
df.shape  # (rows, columns) of the raw dataset

(158314, 2)

In [0]:
df.head()  # preview the first rows: speaker name and spoken line


Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [0]:
# Count the missing values in each column
df.isna().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

Removing the missing values:

In [0]:
# Drop every row that has a missing value in any column, then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)

# Sanity check: every column should now report zero missing values.
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

## Cleaning:

We lemmatize each line of dialogue and remove its stop words and non-alphabetic characters.

In [0]:
# Load the English pipeline with both the NER and the parser components disabled
# for speed; only lemmas and stop-word flags are used by `cleaning` below.
# NOTE(review): the 'en' shortcut name is a spaCy v2 feature - on spaCy v3 the
# model has to be loaded by package name (e.g. 'en_core_web_sm'); confirm the installed version.
nlp = spacy.load('en',disable=['ner','parser'])

def cleaning(doc):
    """Lemmatize a document and remove its stop words.

    Args:
        doc: an iterable of tokens exposing ``lemma_`` and ``is_stop``
            (in this notebook, a spaCy ``Doc``).

    Returns:
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left. Word2Vec learns a word's vector from its
        context words, so one- or two-word sentences add very little to
        training and are discarded.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to provide useful context: signal "drop this line" with None.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

Remove non-alphabetic characters (apostrophes are kept) and lowercase each line:

In [0]:
# Replace every run of characters other than letters and apostrophes with a
# single space, then lowercase the line.
# Built as a list rather than a generator expression: a generator is exhausted
# after one pass, so re-running the cell that consumes `brief_cleaning` would
# silently produce an empty result.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words']]

Take advantage of spaCy's `nlp.pipe()` method to speed up the cleaning process:

In [0]:
t = time()

# Stream the pre-cleaned lines through the spaCy pipeline in batches of 5000 and
# lemmatize each resulting Doc. `cleaning` returns None for lines that end up
# with two or fewer tokens, so `txt` can contain None entries.
# The former `n_threads=-1` argument is omitted: it is deprecated and ignored by
# spaCy >= 2.1 and is no longer accepted by spaCy v3.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60,2)))

Time to clean up everything: 1.05 mins


Put the results in a DataFrame to remove missing values and duplicates:

In [0]:
# One row per cleaned line; drop the None entries (lines that were too short)
# and then any repeated lines.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(85964, 1)

In [0]:
df_clean.head()  # index has gaps because the rows dropped above keep their original labels

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


### Bigrams:

In [0]:
from gensim.models.phrases import Phrases, Phraser

INFO - 07:06:38: 'pattern' package not found; tag filters are not available for English


`Phrases()` takes a list of lists of words as input:

In [0]:
# Tokenise every cleaned line on whitespace, giving one list of words per line
sent = list(map(str.split, df_clean['clean']))

Creates the relevant phrases from the list of sentences:

In [0]:
# min_count=30: ignore words and bigrams collected fewer than 30 times in total.
# progress_per=10000: log a progress line every 10,000 sentences.
phrases = Phrases(sent, min_count = 30, progress_per=10000)

INFO - 07:06:47: collecting all words and their counts
INFO - 07:06:47: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 07:06:47: PROGRESS: at sentence #10000, processed 63561 words and 52816 word types
INFO - 07:06:47: PROGRESS: at sentence #20000, processed 130943 words and 99866 word types
INFO - 07:06:47: PROGRESS: at sentence #30000, processed 192972 words and 138532 word types
INFO - 07:06:47: PROGRESS: at sentence #40000, processed 249842 words and 172659 word types
INFO - 07:06:47: PROGRESS: at sentence #50000, processed 311265 words and 208566 word types
INFO - 07:06:47: PROGRESS: at sentence #60000, processed 373588 words and 243702 word types
INFO - 07:06:47: PROGRESS: at sentence #70000, processed 436441 words and 278740 word types
INFO - 07:06:48: PROGRESS: at sentence #80000, processed 497829 words and 311886 word types
INFO - 07:06:48: collected 330804 word types from a corpus of 537160 words (unigram + bigrams) and 85964 sentences
INFO - 07:06:48: us

The goal of Phraser() is to cut down memory consumption of Phrases(), by discarding model state not strictly needed for the bigram detection task:

In [0]:
# Lighter-weight object built from `phrases`, used below to apply the detected bigrams
bigram = Phraser(phrases)

INFO - 07:07:11: source_vocab length 330804
INFO - 07:07:14: Phraser built with 126 phrasegrams


Transforms the corpus based on the bigrams detected:

In [0]:
# Detected bigrams become single tokens joined by '_' (e.g. 'homer_simpson')
sentences = bigram[sent]

Most Frequent Words:

In [0]:
# Count how often each token (single word or detected bigram) occurs in the corpus.
# The loop variables are deliberately not named `sent`: that name holds the full
# tokenised corpus passed to `bigram[...]`, and rebinding it to the last sentence
# here would corrupt any later re-run of that cell.
word_freq = defaultdict(int)
for sentence in sentences:
    for word in sentence:
        word_freq[word] += 1

len(word_freq)  # number of distinct tokens

30178

In [0]:
# Ten most frequent tokens, most frequent first
[word for word, _count in sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:10]]

['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

# Training the Model

In [0]:
import multiprocessing
from gensim.models import Word2Vec

In [0]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer

In [0]:
# Create the (still untrained) Word2Vec model; the vocabulary is built and the
# model trained in the following cells.
w2v_model = Word2Vec(
    min_count=20,      # ignore words whose total frequency is below 20
    window=2,          # maximum distance between the target word and a context word
    size=300,          # dimensionality of the word vectors
    sample=6e-5,       # threshold for down-sampling high-frequency words
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # learning rate decays linearly down to this value
    negative=20,       # number of "noise" words drawn for negative sampling
    # Leave one core free for the notebook, but never request 0 workers
    # (which `cores - 1` would do on a single-core machine).
    workers=max(1, cores - 1),
)

Building the Vocabulary Table:

In [0]:
t = time()

# Scan the corpus once to collect the vocabulary, logging progress every 1,000 sentences.
w2v_model.build_vocab(sentences, progress_per=1000)

print('Time to build vocab: {} mins'.format(round((time()-t)/60,2)))

INFO - 07:09:01: collecting all words and their counts
INFO - 07:09:01: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 07:09:01: PROGRESS: at sentence #1000, processed 6628 words, keeping 2288 word types
INFO - 07:09:01: PROGRESS: at sentence #2000, processed 12910 words, keeping 3593 word types
INFO - 07:09:01: PROGRESS: at sentence #3000, processed 19216 words, keeping 4611 word types
INFO - 07:09:01: PROGRESS: at sentence #4000, processed 25523 words, keeping 5464 word types
INFO - 07:09:01: PROGRESS: at sentence #5000, processed 32001 words, keeping 6389 word types
INFO - 07:09:01: PROGRESS: at sentence #6000, processed 37928 words, keeping 7128 word types
INFO - 07:09:01: PROGRESS: at sentence #7000, processed 43561 words, keeping 7747 word types
INFO - 07:09:01: PROGRESS: at sentence #8000, processed 49302 words, keeping 8371 word types
INFO - 07:09:01: PROGRESS: at sentence #9000, processed 55282 words, keeping 8966 word types
INFO - 07:09:01: PROGRESS:

Time to build vocab: 0.05 mins


## Training of the Model

In [0]:
t = time()

# Train for 30 passes over the corpus. total_examples reuses the sentence count
# stored on the model (presumably set by build_vocab - confirm against gensim docs);
# report_delay=1 logs progress roughly once per second.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time()-t)/60,2)))

INFO - 07:10:01: training model with 1 workers on 3319 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 07:10:02: EPOCH 1 - PROGRESS: at 33.37% examples, 65355 words/s, in_qsize 0, out_qsize 0
INFO - 07:10:03: EPOCH 1 - PROGRESS: at 69.05% examples, 66233 words/s, in_qsize 0, out_qsize 0
INFO - 07:10:04: worker thread finished; awaiting finish of 0 more threads
INFO - 07:10:04: EPOCH - 1 : training on 523700 raw words (199218 effective words) took 2.9s, 67847 effective words/s
INFO - 07:10:05: EPOCH 2 - PROGRESS: at 33.37% examples, 67333 words/s, in_qsize 0, out_qsize 0
INFO - 07:10:06: EPOCH 2 - PROGRESS: at 70.95% examples, 67781 words/s, in_qsize 0, out_qsize 0
INFO - 07:10:07: worker thread finished; awaiting finish of 0 more threads
INFO - 07:10:07: EPOCH - 2 : training on 523700 raw words (199222 effective words) took 2.9s, 68703 effective words/s
INFO - 07:10:08: EPOCH 3 - PROGRESS: at 33.37% examples, 66263 words/s, in_qsize 0, out_qsize 0


Time to train the model: 1.47 mins


# Exploring the Model

Most Similar to:

In [0]:
# Ten words whose vectors are closest (cosine similarity) to "homer"
w2v_model.wv.most_similar(positive=["homer"])

INFO - 07:12:11: precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('bongo', 0.7438229322433472),
 ('marge', 0.7383730411529541),
 ('wife', 0.729287326335907),
 ('rude', 0.728277862071991),
 ('snuggle', 0.7205110788345337),
 ('sorry', 0.7020863890647888),
 ('listen', 0.7008452415466309),
 ('crummy', 0.6998229026794434),
 ('gee', 0.6980044841766357),
 ('attract', 0.696446418762207)]

In [0]:
# "homer_simpson" is a bigram token created by the Phraser step, so it has its own vector
w2v_model.wv.most_similar(positive=["homer_simpson"])

  if np.issubdtype(vec.dtype, np.int):


[('congratulation', 0.7384845018386841),
 ('easily', 0.7243740558624268),
 ('council', 0.7230184674263),
 ('recent', 0.7135732769966125),
 ('waylon', 0.7095463275909424),
 ('governor', 0.7063230276107788),
 ('pleased', 0.6944870948791504),
 ('kennedy', 0.6925977468490601),
 ('committee', 0.6886483430862427),
 ('defeat', 0.6862305998802185)]

In [0]:
# Nearest neighbours of "marge"
w2v_model.wv.most_similar(positive=["marge"])

  if np.issubdtype(vec.dtype, np.int):


[('convince', 0.753345251083374),
 ('sorry', 0.7442652583122253),
 ('hammock', 0.738720178604126),
 ('homer', 0.7383730411529541),
 ('rude', 0.731797456741333),
 ('grownup', 0.7308133840560913),
 ('becky', 0.723953127861023),
 ('arrange', 0.7207881212234497),
 ('raccoon', 0.7200326919555664),
 ('loving', 0.7158425450325012)]

In [0]:
# Nearest neighbours of "bart"
w2v_model.wv.most_similar(positive=["bart"])

  if np.issubdtype(vec.dtype, np.int):


[('lisa', 0.8213196992874146),
 ('homework', 0.7795455455780029),
 ('mom', 0.7676389813423157),
 ('substitute', 0.763089656829834),
 ('convince', 0.7581101655960083),
 ('surprised', 0.7539457082748413),
 ('hearing', 0.7440679669380188),
 ('strangle', 0.7392697334289551),
 ('upset', 0.737227201461792),
 ('impressive', 0.7352877855300903)]

Similarities

In [0]:
# Cosine similarity between two word vectors
w2v_model.wv.similarity('maggie', 'tavern')

  if np.issubdtype(vec.dtype, np.int):


0.23141363

In [0]:
# Same measure for the pair "maggie" / "baby"
w2v_model.wv.similarity('maggie','baby')

  if np.issubdtype(vec.dtype, np.int):


0.69330287

In [0]:
# Same measure for the pair "bart" / "nelson"
w2v_model.wv.similarity('bart','nelson')

  if np.issubdtype(vec.dtype, np.int):


0.6432154

Odd-One-Out

In [0]:
# Return the word from the list that fits least well with the others
w2v_model.wv.doesnt_match(['jimbo','milhouse','kearney'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'jimbo'

In [0]:
# Odd one out among "nelson", "bart" and "milhouse"
w2v_model.wv.doesnt_match(["nelson","bart","milhouse"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'nelson'

In [0]:
# Odd one out among "homer", "patty" and "selma"
w2v_model.wv.doesnt_match(['homer','patty','selma'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)
  if np.issubdtype(vec.dtype, np.int):


'homer'

Analogy Difference

In [0]:
# Vector arithmetic "woman + homer - marge": the three words closest to the result
w2v_model.wv.most_similar(positive=["woman","homer"],negative=["marge"],topn=3)

  if np.issubdtype(vec.dtype, np.int):


[('admire', 0.6310784816741943),
 ('obvious', 0.5805646181106567),
 ('carefully', 0.5758380889892578)]

In [0]:
# Vector arithmetic "woman + bart - man": the three words closest to the result
w2v_model.wv.most_similar(positive=["woman","bart"],negative=["man"],topn=3)

  if np.issubdtype(vec.dtype, np.int):


[('lisa', 0.7217448949813843),
 ('parent', 0.7040499448776245),
 ('upset', 0.6835949420928955)]