<a href="https://colab.research.google.com/github/Avinash9k5r/python-model-on-words/blob/master/FirstModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Getting Started
Setting up the Environment

In [0]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time # To time our operations
from collections import defaultdict #For word frequency
import spacy  #For preprocessing
import logging # Setting up the loggings to monitor system
# Show INFO-level log records in the cell output, prefixed with the level
# name and a wall-clock HH:MM:SS timestamp.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)


# Preprocessing

In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Sign the current Colab user in; the credentials obtained here are what
# get_application_default() picks up below.
auth.authenticate_user()
# Build a PyDrive client (`drive`) on top of those application-default credentials.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [9]:
# Google Drive ID of the tab-separated reviews dump. This is the value of the
# `id=` query parameter of the share link
# https://drive.google.com/open?id=1Qy6q6sGmCuB2ZgSoD0j15GzcXSV8lB21 --
# CreateFile is keyed by the bare ID, not by the full URL.
file_id = '1Qy6q6sGmCuB2ZgSoD0j15GzcXSV8lB21'  # replace with the id of the file you want to access
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('reviews_data.txt')  # saves a local copy next to the notebook

# Read the file as a pandas DataFrame (pandas is imported as `pd` in the first cell).
# header=None: the file has no header row, so columns are numbered from 0.
# error_bad_lines=False: malformed rows are skipped (and reported) instead of aborting.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in favour of
# `on_bad_lines='skip'` -- confirm the installed version before switching.
df = pd.read_csv(
    'reviews_data.txt',
    delimiter='\t',
    encoding='latin-1',
    error_bad_lines=False,
    header=None,
    engine='python',
)


INFO - 16:52:06: URL being requested: GET https://www.googleapis.com/drive/v2/files/1Qy6q6sGmCuB2ZgSoD0j15GzcXSV8lB21?alt=json
Skipping line 5253: '	' expected after '"'
Skipping line 6976: '	' expected after '"'
Skipping line 7648: '	' expected after '"'
Skipping line 7884: '	' expected after '"'
Skipping line 7954: '	' expected after '"'
Skipping line 12600: '	' expected after '"'
Skipping line 14495: '	' expected after '"'
Skipping line 16098: '	' expected after '"'
Skipping line 16381: '	' expected after '"'
Skipping line 17192: '	' expected after '"'
Skipping line 19793: '	' expected after '"'
Skipping line 20401: '	' expected after '"'
Skipping line 21810: '	' expected after '"'
Skipping line 22540: '	' expected after '"'
Skipping line 23078: '	' expected after '"'
Skipping line 23084: '	' expected after '"'
Skipping line 27273: '	' expected after '"'
Skipping line 29333: '	' expected after '"'
Skipping line 32039: '	' expected after '"'
Skipping line 34495: '	' expected after '"

In [10]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(254306, 4)

In [11]:
# Peek at the first rows; the file was read with header=None, so columns are numbered.
df.head()

Unnamed: 0,0,1,2,3
0,Oct 12 2009,Nice trendy hotel location not too bad.,I stayed in this hotel for one night. As this ...,
1,Sep 25 2009,Great Budget Hotel!,Stayed two nights at Aloft on the most recent ...,
2,Aug 4 2009,Excellent value - location not a big problem.,We stayed at the Aloft Beijing Haidian for 5 n...,
3,Jul 17 2009,Stylish clean reasonable value poor location,I am glad to be the first person to post photo...,
4,May 30 2009,Remote but excellent value for money,Stayed there for one night. The hotel is locat...,


In [12]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

0    object
1    object
2    object
3    object
dtype: object

In [13]:
# Inspect the last rows as well.
# NOTE(review): the tail shows runs of '?' and empty review bodies -- presumably
# non-Latin text that did not survive the encoding; confirm against the raw file.
df.tail()

Unnamed: 0,0,1,2,3
254301,Mar 27 2009,?????????????,,
254302,Jan 24 2009,???????????????,,
254303,Jan 24 2009,?????????????,,
254304,Jan 24 2009,??,,
254305,Oct 28 2008,?????????????,,


# Cleaning

In [0]:
# Load the English pipeline with both the named-entity recognizer and the
# dependency parser disabled for speed; neither is needed for lemmatization
# and stop-word filtering.
# NOTE(review): the 'en' shortcut name is presumably tied to the spaCy version
# this notebook was written for -- confirm before upgrading spaCy.
nlp = spacy.load('en',disable=['ner','parser'])

def cleaning(doc):
    """Lemmatize a document and remove its stop words.

    Args:
        doc: An iterable of tokens exposing ``lemma_`` and ``is_stop``
            (a spaCy ``Doc`` object in this notebook).

    Returns:
        The surviving lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a target word's vector from its context words, so a
    # sentence of only one or two words contributes very little to training.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

Removes non-alphabetic characters (apostrophes are kept) and lowercases the text:

In [0]:
# Lazily normalise every review body (column 2): each run of characters other
# than letters and apostrophes becomes a single space, then the text is lowercased.
# This is a generator, so it can be iterated only once.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(review)).lower()
    for review in df[2]
)

In [16]:
# Wall-clock start for the timing report printed at the end of the cell.
t = time()

# Stream the normalised reviews through spaCy in batches of 5000 and lemmatize
# each one; `cleaning` returns None for reviews with too few remaining tokens.
# NOTE(review): `brief_cleaning` is a generator and is exhausted here -- re-run
# the cell that defines it before re-running this one.
# NOTE(review): `n_threads` is reportedly deprecated/ignored in newer spaCy
# releases -- confirm against the installed version.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning,batch_size=5000, n_threads=-1)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60,2)))

Time to clean up everything: 20.28 mins


Putting the results in a data frame to remove missing values and duplicates

In [17]:
# One cleaned review per row; drop the None entries produced for too-short
# reviews, then drop exact duplicates.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(219344, 1)

In [18]:
# Peek at the cleaned, lemmatized reviews.
df_clean.head()

Unnamed: 0,clean
0,stay hotel night fairly new place taxi driver ...
1,stay night aloft recent trip china hotel moder...
2,stay aloft beijing haidian night july nd lot r...
3,glad person post photo hotel key point propert...
4,stay night hotel locate haidian little bit rem...


# Bigrams

In [19]:
from gensim.models.phrases import Phrases, Phraser

INFO - 17:21:20: 'pattern' package not found; tag filters are not available for English


Phrases() takes a list of lists of words as input, so each cleaned review is split into tokens:

In [0]:
# Tokenize on whitespace: one list of words per cleaned review.
sent = [
    cleaned_review.split()
    for cleaned_review in df_clean['clean']
]

Creates the relevant phrases from the list of sentences:

In [21]:
# Collect word and word-pair counts over the tokenized reviews to detect common phrases.
# progress_per=10000 logs a progress line every 10,000 sentences.
# NOTE(review): per the gensim docs, min_count=30 ignores words and bigrams seen
# fewer than 30 times -- confirm for the installed version.
phrases = Phrases(sent, min_count = 30, progress_per=10000)

INFO - 17:21:41: collecting all words and their counts
INFO - 17:21:41: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 17:21:42: PROGRESS: at sentence #10000, processed 827919 words and 384468 word types
INFO - 17:21:44: PROGRESS: at sentence #20000, processed 1617848 words and 624500 word types
INFO - 17:21:45: PROGRESS: at sentence #30000, processed 2594326 words and 927463 word types
INFO - 17:21:47: PROGRESS: at sentence #40000, processed 3458408 words and 1158235 word types
INFO - 17:21:49: PROGRESS: at sentence #50000, processed 4390306 words and 1374708 word types
INFO - 17:21:51: PROGRESS: at sentence #60000, processed 5284426 words and 1571232 word types
INFO - 17:21:52: PROGRESS: at sentence #70000, processed 6060032 words and 1735367 word types
INFO - 17:21:54: PROGRESS: at sentence #80000, processed 6858180 words and 1886267 word types
INFO - 17:21:55: PROGRESS: at sentence #90000, processed 7648123 words and 2027027 word types
INFO - 17:21:57: PROGRESS

In [22]:
# Build a Phraser from the fitted Phrases model; it is what gets applied to the corpus next.
bigram = Phraser(phrases)

INFO - 17:22:27: source_vocab length 3537109
INFO - 17:23:01: Phraser built with 4857 phrasegrams


In [0]:
# Apply the bigram model to the tokenized corpus; detected pairs show up later
# as single underscore-joined tokens (e.g. 'double_bed').
sentences = bigram[sent]

# Most Frequent Words:

In [24]:
# Count how often every token (including merged bigrams) occurs in the corpus.
# The loop variables are deliberately NOT called `sent`: that name holds the
# tokenized corpus used to build `sentences`, and rebinding it here would make
# a later re-run of `bigram[sent]` operate on a single sentence.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1

# Vocabulary size before any min_count filtering.
len(word_freq)

128134

In [25]:
# The ten most frequent tokens, most common first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['room',
 'hotel',
 'stay',
 'good',
 'night',
 'great',
 'staff',
 'location',
 'clean',
 'nice']

# Training the Model

In [0]:
import multiprocessing
from gensim.models import Word2Vec

In [0]:
# Number of CPU cores on this machine; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [0]:
# Configure the model without a corpus; the vocabulary is built and the model
# trained in later cells. Parameter meanings follow the gensim Word2Vec docs.
w2v_model = Word2Vec(
    min_count=20,      # ignore tokens with a total frequency below 20
    window=2,          # maximum distance between the current and the predicted word
    size=300,          # dimensionality of the word vectors
    sample=6e-5,       # down-sampling threshold for very frequent words
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # learning rate decays to this value during training
    negative=20,       # number of negative samples drawn per positive example
    # Leave one core free for the notebook, but never ask for fewer than one
    # worker: `cores - 1` is 0 on a single-core machine.
    workers=max(1, cores - 1),
)

Building the Vocabulary Table:

In [29]:
# Wall-clock start for the timing report printed at the end of the cell.
t = time()

# Scan the bigram-transformed corpus once to build the vocabulary table;
# progress_per=1000 logs a progress line every 1,000 sentences.
w2v_model.build_vocab(sentences, progress_per=1000)

print('Time to build vocab: {} mins'.format(round((time()-t)/60,2)))

INFO - 17:27:47: collecting all words and their counts
INFO - 17:27:47: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 17:27:47: PROGRESS: at sentence #1000, processed 84988 words, keeping 8175 word types
INFO - 17:27:48: PROGRESS: at sentence #2000, processed 173144 words, keeping 11747 word types
INFO - 17:27:48: PROGRESS: at sentence #3000, processed 253220 words, keeping 14093 word types
INFO - 17:27:48: PROGRESS: at sentence #4000, processed 334403 words, keeping 16074 word types
INFO - 17:27:48: PROGRESS: at sentence #5000, processed 414729 words, keeping 18161 word types
INFO - 17:27:49: PROGRESS: at sentence #6000, processed 486835 words, keeping 19762 word types
INFO - 17:27:49: PROGRESS: at sentence #7000, processed 553152 words, keeping 21102 word types
INFO - 17:27:49: PROGRESS: at sentence #8000, processed 628136 words, keeping 22489 word types
INFO - 17:27:49: PROGRESS: at sentence #9000, processed 702423 words, keeping 23664 word types
INFO - 17

Time to build vocab: 0.88 mins


Training of the Model:

In [31]:
# Wall-clock start for the timing report printed at the end of the cell.
t = time()

# Train for 30 epochs over the corpus. total_examples reuses the sentence count
# recorded by build_vocab; report_delay=1 logs progress about once per second.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 17:30:01: training model with 1 workers on 18921 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 17:30:02: EPOCH 1 - PROGRESS: at 0.78% examples, 70307 words/s, in_qsize 1, out_qsize 0
INFO - 17:30:03: EPOCH 1 - PROGRESS: at 1.68% examples, 71592 words/s, in_qsize 2, out_qsize 0
INFO - 17:30:04: EPOCH 1 - PROGRESS: at 2.67% examples, 73041 words/s, in_qsize 1, out_qsize 0
INFO - 17:30:05: EPOCH 1 - PROGRESS: at 3.75% examples, 72966 words/s, in_qsize 2, out_qsize 0
INFO - 17:30:06: EPOCH 1 - PROGRESS: at 4.80% examples, 73199 words/s, in_qsize 1, out_qsize 0
INFO - 17:30:07: EPOCH 1 - PROGRESS: at 5.80% examples, 72990 words/s, in_qsize 1, out_qsize 0
INFO - 17:30:08: EPOCH 1 - PROGRESS: at 6.87% examples, 72898 words/s, in_qsize 2, out_qsize 0
INFO - 17:30:09: EPOCH 1 - PROGRESS: at 7.92% examples, 73035 words/s, in_qsize 1, out_qsize 0
INFO - 17:30:10: EPOCH 1 - PROGRESS: at 8.88% examples, 73031 words/s, in_qsize 2, out_qsize 0
INFO - 17:

Time to train the model: 51.54 mins


In [32]:
# Precompute the L2-normalised word vectors used by the similarity queries below.
# NOTE(review): replace=True is documented to overwrite the raw vectors to save
# memory, after which the model should not be trained further -- confirm that
# no additional training is intended.
w2v_model.init_sims(replace=True)


INFO - 18:22:05: precomputing L2-norms of word weight vectors


# Exploring the Model

Most similar to:

In [34]:
# Nearest neighbours of 'room' in the learned vector space, as (token, score) pairs.
w2v_model.wv.most_similar(positive=["room"])

  if np.issubdtype(vec.dtype, np.int):


[('double_bed', 0.6792638301849365),
 ('king_size', 0.651154637336731),
 ('th_floor', 0.6463562250137329),
 ('double', 0.6434370875358582),
 ('twin', 0.641046941280365),
 ('king', 0.6190584897994995),
 ('spacious', 0.5999467372894287),
 ('bedroom', 0.5951843857765198),
 ('bed', 0.594986081123352),
 ('queen', 0.5846426486968994)]

In [35]:
# Nearest neighbours of the plural form 'hotels'.
# NOTE(review): the corpus was lemmatized, so 'hotels' is presumably a residual
# token that escaped lemmatization -- 'hotel' may be the intended query; confirm.
w2v_model.wv.most_similar(positive=["hotels"])

  if np.issubdtype(vec.dtype, np.int):


[('compare', 0.4426605701446533),
 ('properties', 0.4266010522842407),
 ('property', 0.3965243697166443),
 ('star', 0.3942808210849762),
 ('chain', 0.3902702331542969),
 ('par', 0.3837100863456726),
 ('hilton', 0.38347315788269043),
 ('comparison', 0.3749620318412781),
 ('propertie', 0.37458768486976624),
 ('hotel', 0.368985652923584)]

In [36]:
# Nearest neighbours of 'mall'; several results are bigrams merged by the Phraser.
w2v_model.wv.most_similar(positive=["mall"])

  if np.issubdtype(vec.dtype, np.int):


[('shopping_mall', 0.7714414596557617),
 ('shopping_centre', 0.6581958532333374),
 ('department_store', 0.5437674522399902),
 ('shop', 0.5387667417526245),
 ('mall_emirates', 0.5247905850410461),
 ('oriental_plaza', 0.49925124645233154),
 ('shopping', 0.4991026520729065),
 ('emirates_mall', 0.48372238874435425),
 ('westfield_mall', 0.4820064902305603),
 ('dubai_mall', 0.4808569848537445)]

In [39]:
# Nearest neighbours of a sentiment word, 'delighted'.
w2v_model.wv.most_similar(positive=["delighted"])

  if np.issubdtype(vec.dtype, np.int):


[('pleased', 0.5857943296432495),
 ('pleasantly_surprised', 0.4764917492866516),
 ('thrill', 0.4455282390117645),
 ('happy', 0.4399331212043762),
 ('impressed', 0.43697068095207214),
 ('delight', 0.39311179518699646),
 ('fortunate', 0.3824749290943146),
 ('pleasantly_surprise', 0.3809754550457001),
 ('exceed_expectation', 0.365864634513855),
 ('pleasant_surprise', 0.3606758415699005)]

In [40]:
# Nearest neighbours of 'machine'.
w2v_model.wv.most_similar(positive=["machine"])

  if np.issubdtype(vec.dtype, np.int):


[('vend_machine', 0.4965496063232422),
 ('stairmaster', 0.46742600202560425),
 ('machine_weight', 0.4670303463935852),
 ('bike_treadmill', 0.4606589674949646),
 ('treadmill_elliptical', 0.4452149271965027),
 ('soda_machine', 0.44069036841392517),
 ('cross_trainer', 0.4389886260032654),
 ('stepper', 0.42871326208114624),
 ('bike_weight', 0.42328232526779175),
 ('cardio_weight', 0.42006033658981323)]

Similarities:

In [37]:
# Similarity score between the vectors of 'hotel' and 'mall'.
w2v_model.wv.similarity('hotel', 'mall')

  if np.issubdtype(vec.dtype, np.int):


0.1935021

In [38]:
# Similarity score between 'bed' and 'room'.
w2v_model.wv.similarity('bed', 'room')

  if np.issubdtype(vec.dtype, np.int):


0.5949861

In [41]:
# Similarity score between 'garden' and 'city'.
w2v_model.wv.similarity('garden', 'city')

  if np.issubdtype(vec.dtype, np.int):


0.12833506

In [45]:
# Similarity score between 'internet' and 'computer'.
w2v_model.wv.similarity('internet', 'computer')

  if np.issubdtype(vec.dtype, np.int):


0.59984493

In [47]:
# Similarity score between the near-synonyms 'tourist' and 'traveler'.
w2v_model.wv.similarity('tourist', 'traveler')

  if np.issubdtype(vec.dtype, np.int):


0.3786593

In [50]:
# Similarity score between the near-synonyms 'neat' and 'clean'.
w2v_model.wv.similarity('neat', 'clean')

  if np.issubdtype(vec.dtype, np.int):


0.5322975