In [52]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk



In [32]:
# define training data
sentences = [['drink','not','good'],
			['felt','superb'],
			['just','good','ambience'],
			['bad','taste'],
			['parking','problem'],
            ['fantastic','food'],
			['bad', 'food'],
			['nice','place']]

y = np.array([0,1,1,0,0,1,0,1])



In [33]:
"""Train the model"""

model = Word2Vec(sentences, sg=0, min_count=1,vector_size=100)
print(model)



Word2Vec<vocab=15, vector_size=100, alpha=0.025>


In [34]:
"""Vocabulary:"""
len(model.wv)


15

In [35]:
"""Accessing vectors"""

print(model.wv.get_vector("felt") )
print(model.wv.get_vector('fantastic'))
print(len(model.wv.get_vector("drink")))

[-1.9442164e-03 -5.2675214e-03  9.4471136e-03 -9.2987325e-03
  4.5039477e-03  5.4041781e-03 -1.4092624e-03  9.0070926e-03
  9.8853596e-03 -5.4750429e-03 -6.0210000e-03 -6.7469729e-03
 -7.8948820e-03 -3.0479168e-03 -5.5940272e-03 -8.3446801e-03
  7.8290224e-04  2.9946566e-03  6.4147436e-03 -2.6289499e-03
 -4.4534765e-03  1.2495709e-03  3.9146186e-04  8.1169987e-03
  1.8280029e-04  7.2315861e-03 -8.2645155e-03  8.4335366e-03
 -1.8889094e-03  8.7011540e-03 -7.6168370e-03  1.7963862e-03
  1.0564864e-03  4.6005251e-05 -5.1032533e-03 -9.2476979e-03
 -7.2642174e-03 -7.9511739e-03  1.9137275e-03  4.7846674e-04
 -1.8131376e-03  7.1201660e-03 -2.4756920e-03 -1.3473093e-03
 -8.9005642e-03 -9.9254129e-03  8.9493981e-03 -5.7539381e-03
 -6.3729975e-03  5.1994072e-03  6.6699935e-03 -6.8316413e-03
  9.5975993e-04 -6.0084737e-03  1.6473436e-03 -4.2892788e-03
 -3.4407973e-03  2.1856665e-03  8.6615775e-03  6.7281104e-03
 -9.6770572e-03 -5.6221043e-03  7.8803329e-03  1.9893574e-03
 -4.2560520e-03  5.98812

In [36]:
"""Mean of Embeddings
First Comment
['drink','not','good']
"""

sentences[0]

first = np.array([model.wv.get_vector(word) for word in sentences[0] ])
first.shape

first.mean(axis=0).shape



(100,)

In [37]:
"""All Comments"""

means = []
for sentence in sentences :
    sent = np.array([model.wv.get_vector(word) for word in sentence ])
    row_means = sent.mean(axis=0)
    means.append(row_means)
means = np.array(means)

X = means
X.shape



(8, 100)

In [38]:
"""Model Building"""

from sklearn.ensemble import RandomForestClassifier
model_rf = RandomForestClassifier(random_state=23)
model_rf.fit( X , y )



In [39]:
"""Model Testing:"""
test_sentences = [['bad','food'],['good','place']]
test_means = []
for sentence in test_sentences :
    sent = np.array([model.wv.get_vector(word) for word in sentence ])
    row_means = sent.sum(axis=0)
    test_means.append(row_means)
num_test_means = np.array(test_means)
X_test = num_test_means

y_pred = model_rf.predict(X_test)
y_pred

array([0, 1])

In [40]:
dataset = pd.read_csv("/home/darkstar/Documents/pg-dbda/module7_statistics/Daywise Study Material/datasets/Restaurant_Reviews.tsv", sep='\t')
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [41]:
stops = stopwords.words('english')
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [42]:
def preprocess(text_col, size):
    corpus = []
    for i in range(0, size):
        review = re.sub('[^a-zA-Z]', ' ', text_col[i])
        review = review.lower()
        review = review.split()
        ps = PorterStemmer()
        review = [ps.stem(word) for word in review if not word in set(stops)]
        review = ' '.join(review)
        review = review.split()
        if review == []:
          review = ["anything"]
        corpus.append(review)
    return corpus

In [43]:
corpus = preprocess(text_col=dataset['Review'], size=dataset.shape[0])
print(len(corpus))


1000


In [44]:
corpus[:3]
"""CBOW: sg = 0
Skip-Gram: sg = 1
"""

model_r =  Word2Vec(corpus, min_count=1, vector_size=100, sg=0)

means = []
for sentence in corpus :
    sent = np.array([model_r.wv.get_vector(word) for word in sentence ])
    row_means = sent.mean(axis=0)
    means.append(row_means)


In [45]:
means = np.array(means)
print(means)

[[-0.00248936  0.00482832  0.00606749 ... -0.0039599  -0.00132851
  -0.00285674]
 [ 0.00410618 -0.00112031 -0.00397767 ...  0.00081174  0.00304167
  -0.00740726]
 [-0.00521496  0.00181101 -0.00672057 ... -0.0001369   0.00624619
  -0.00342533]
 ...
 [-0.00335364  0.00013293 -0.00160126 ... -0.00363972  0.00139065
   0.00052057]
 [-0.00117236 -0.00082415  0.00021906 ... -0.0021179   0.00520433
  -0.00169304]
 [-0.00114351  0.00053284  0.00145375 ... -0.00363293 -0.002106
  -0.0034314 ]]


In [46]:

X = means
y = dataset.iloc[:, 1]

X.shape

y.shape

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.20,
                                                    random_state = 23,
                                                    stratify=y)
classifier = RandomForestClassifier(random_state=23)
classifier.fit(X_train, y_train)

In [54]:
"""**Model Evaluation**"""

y_pred_prob = classifier.predict_proba(X_test)[:,1]
print(roc_auc_score(y_test, y_pred_prob))

y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))


0.7046000000000001
0.64


In [55]:
"""#### Grid Search CV"""

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=23)
print(classifier.get_params())

params = {'max_features':[2,5,10,20,50]}
gcv = GridSearchCV(classifier, param_grid=params, cv=kfold, verbose=3)
gcv.fit(X, y)

print(gcv.best_params_)

print(gcv.best_score_)



test_corp = ['bad taste', 'horrible','love']
tst_corpus = preprocess(text_col=test_corp,
                    size=3)

print(tst_corpus)

test_means = []
for sentence in tst_corpus :
    word_vects = [model_r.wv.get_vector(word, norm=True) for word in sentence ]
    row_means = np.mean(word_vects,axis=0)
    test_means.append(row_means)
test_means = np.array( test_means )

y_pred = gcv.predict(test_means)
print(y_pred)


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 23, 'verbose': 0, 'warm_start': False}
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ....................max_features=2;, score=0.650 total time=   0.4s
[CV 2/5] END ....................max_features=2;, score=0.610 total time=   0.3s
[CV 3/5] END ....................max_features=2;, score=0.675 total time=   0.3s
[CV 4/5] END ....................max_features=2;, score=0.685 total time=   0.3s
[CV 5/5] END ....................max_features=2;, score=0.660 total time=   0.3s
[CV 1/5] END ....................max_features=5;, score=0.645 total time=   0.5s
[CV 2/5] END ....................max_features=5;, score=0.650 t