# COURSE:   PGP [AI&ML]

## Learner :  Chaitanya Kumar Battula
## Module  : NLP
## Topic   : Sentiment Analysis_Zomato rating

In [1]:
import pandas as pd, numpy as np
import re

In [2]:
reviews0 = pd.read_csv("Zomato_reviews.csv")

In [3]:
reviews0.head()

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."
1,5.0,really appreciate their quality and timing . I...
2,4.0,"Went there on a Friday night, the place was su..."
3,4.0,A very decent place serving good food.\r\nOrde...
4,5.0,One of the BEST places for steaks in the city....


In [4]:
reviews0.describe(include="all")

Unnamed: 0,rating,review_text
count,27762.0,27748
unique,,10548
top,,good
freq,,278
mean,3.665784,
std,1.284573,
min,1.0,
25%,3.0,
50%,4.0,
75%,5.0,


14 rows are missing the review text - need to get rid of these records

In [5]:
reviews1 = reviews0[~reviews0.review_text.isnull()].copy()
reviews1.reset_index(inplace=True, drop=True)

In [6]:
reviews0.shape, reviews1.shape

((27762, 2), (27748, 2))

#### Converting to list for easy manipulation

In [7]:
reviews_list = reviews1.review_text.values

In [8]:
len(reviews_list)

27748

### Text clean up 
- Normalize the case  
- Remove stop words
   - remove "not", "no" from the stop word list
- Remove punctuations

Normalizing case

In [9]:
reviews_lower = [txt.lower() for txt in reviews_list]

In [10]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food.\r\nordered chilli fish, chicken & pork sizzler.\r\neverything tasted good but pork could have been slightly better cooked.\r\ntried 2 beverages, both were very sweet.']

Remove extra line breaks

In [11]:
reviews_lower = [" ".join(txt.split()) for txt in reviews_lower]

In [12]:
reviews_lower[2:4]

['went there on a friday night, the place was surprisingly empty. interesting menu which is almost fully made of dosas. i had bullseye dosa and cheese masala dosa. the bullseye dosa was really good, with the egg perfectly cooked to a half boiled state. the masala in the cheese masala was good, but the cheese was a bit too chewy for my liking. the chutney was good, the sambar was average. the dishes are reasonably priced.',
 'a very decent place serving good food. ordered chilli fish, chicken & pork sizzler. everything tasted good but pork could have been slightly better cooked. tried 2 beverages, both were very sweet.']

#### Tokenize

In [13]:
from nltk.tokenize import word_tokenize

In [14]:
print(word_tokenize(reviews_lower[0]))

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


In [15]:
reviews_tokens = [word_tokenize(sent) for sent in reviews_lower]
print(reviews_tokens[0])

['their', 'service', 'is', 'worst', ',', 'pricing', 'in', 'menu', 'is', 'different', 'from', 'bill', '.', 'they', 'can', 'give', 'you', 'a', 'bill', 'with', 'increased', 'pricing', '.', 'even', 'for', 'serving', 'water', ',', 'menu', ',', 'order', 'you', 'need', 'to', 'call', 'them', '3-4', 'times', 'even', 'on', 'a', 'non', 'busy', 'day', '.']


### Remove stop words and punctuations

In [16]:
from nltk.corpus import stopwords
from string import punctuation

In [17]:
stop_nltk = stopwords.words("english")
stop_punct = list(punctuation)

In [18]:
print(stop_nltk)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
stop_nltk.remove("no")
stop_nltk.remove("not")
stop_nltk.remove("don")
stop_nltk.remove("won")

In [20]:
"no" in stop_nltk

False

In [21]:
stop_final = stop_nltk + stop_punct + ["...", "``","''", "====", "must"]

In [22]:
def del_stop(sent):
    return [term for term in sent if term not in stop_final]

In [23]:
del_stop(reviews_tokens[1])

['really',
 'appreciate',
 'quality',
 'timing',
 'tried',
 'thattil',
 'kutti',
 'dosa',
 "'ve",
 'addicted',
 'dosa',
 'really',
 'chutney',
 'really',
 'good',
 'money',
 'worth',
 'much',
 'better',
 'thattukada',
 'try']

In [24]:
reviews_clean = [del_stop(sent) for sent in reviews_tokens]

In [25]:
reviews_clean = [" ".join(sent) for sent in reviews_clean]
reviews_clean[:2]

['service worst pricing menu different bill give bill increased pricing even serving water menu order need call 3-4 times even non busy day',
 "really appreciate quality timing tried thattil kutti dosa 've addicted dosa really chutney really good money worth much better thattukada try"]

### Separate X and Y and perform train test split, 70-30

In [26]:
len(reviews_clean)

27748

In [27]:
X = reviews_clean
y = reviews1.rating

Train test split

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=42)

### Document term matrix using TfIdf

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
vectorizer = TfidfVectorizer(max_features = 5000)

In [31]:
len(X_train), len(X_test)

(19423, 8325)

In [32]:
X_train_bow = vectorizer.fit_transform(X_train)

In [33]:
X_test_bow = vectorizer.transform(X_test)

In [34]:
X_train_bow.shape, X_test_bow.shape

((19423, 5000), (8325, 5000))

### Model building

In [35]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [36]:
?RandomForestRegressor

In [37]:
learner_rf = RandomForestRegressor(random_state=42)

In [38]:
%%time

learner_rf.fit(X_train_bow, y_train)

RandomForestRegressor(random_state=42)

In [39]:
y_train_preds = learner_rf.predict(X_train_bow)

In [40]:
from sklearn.metrics import mean_squared_error

In [41]:
mean_squared_error(y_train, y_train_preds)**0.5

0.23720347586757728

#### Increase the number of trees

In [42]:
learner_rf = RandomForestRegressor(random_state=42, n_estimators=20)

In [43]:
%%time

learner_rf.fit(X_train_bow, y_train)

Wall time: 1min 2s


RandomForestRegressor(n_estimators=20, random_state=42)

In [44]:
y_train_preds = learner_rf.predict(X_train_bow)

In [45]:
mean_squared_error(y_train, y_train_preds)**0.5

0.2507584827360229

### Hyper-parameter tuning

"class_weights" was one of the many hyperparameters to tune for the SVM.  

Let's find the best hyper-parameters for the SVM classifier

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
?RandomForestRegressor

Instantiate the learner with a random state

In [48]:
learner_rf = RandomForestRegressor(random_state=42)

In [49]:
# Create the parameter grid based on the results of random search 
param_grid = {
                 'max_features': [500, "sqrt", "log2", "auto"],
                  'max_depth': [10, 15, 20, 25]
             }

In [50]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = learner_rf, 
                           param_grid = param_grid, 
                           cv = 5, 
                           n_jobs = -1, 
                           verbose = 1, 
                           scoring = "neg_mean_squared_error" )



In [51]:
grid_search.fit(X_train_bow, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 15, 20, 25],
                         'max_features': [500, 'sqrt', 'log2', 'auto']},
             scoring='neg_mean_squared_error', verbose=1)

In [None]:
grid_search.grid_scores_

In [53]:
grid_search.best_estimator_

RandomForestRegressor(max_depth=25, max_features=500, random_state=42)

In [62]:
grid_search.best_score_

-0.45644900985829145

In [61]:
grid_search.best_params_

{'max_depth': 25, 'max_features': 500}

In [None]:
means = clf.cv_results_['mean_test_score']

### Using the best estimator to make predictions on the test set

In [54]:
y_train_pred = grid_search.best_estimator_.predict(X_train_bow)

In [55]:
y_test_pred = grid_search.best_estimator_.predict(X_test_bow)

In [56]:
mean_squared_error(y_train, y_train_pred)**0.5

0.5876681060137491

In [57]:
mean_squared_error(y_test, y_test_pred)**0.5

0.6715338514796404

### Identifying mismatch cases

In [58]:
res_df = pd.DataFrame({'review':X_test, 'rating':y_test, 'rating_pred':y_test_pred})

In [59]:
res_df[(res_df.rating - res_df.rating_pred)>=2].shape

(8, 3)

In [60]:
res_df[(res_df.rating - res_df.rating_pred)>=2]

Unnamed: 0,review,rating,rating_pred
7277,life saviours serving excellent food worst tim...,5.0,2.00767
1818,value money ordered second time,5.0,2.970624
4771,not good,5.0,1.998553
16510,may not polished serving packaging etc never b...,5.0,1.894671
14845,oh memories place first drink bangalore almost...,5.0,2.628792
15201,sauce not included,4.0,1.878766
3165,rice quantity less,5.0,2.844196
16515,may not polished serving packaging etc never b...,5.0,1.894671
