In [23]:
import pandas as pd
d=pd.read_csv('/content/Restaurant_Reviews.csv')
df=pd.DataFrame(d)
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [24]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


Data preprocessing

In [26]:
from nltk.corpus import stopwords

In [27]:
import nltk
from nltk.stem import PorterStemmer,SnowballStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')
def preprocessing(text):
  text=text.lower()
  review_words=text.split()
  review=[words for words in review_words if not words in set(stopwords.words('english'))]
  ps=PorterStemmer()
  review=[ps.stem(word) for word in review]
  review=[lemmatizer.lemmatize(word) for word in review]
  return " ".join(review)
preprocessing('Wow... Loved this place.')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


'wow... love place.'

In [None]:
import spacy
nlp=spacy.load('en_core_web_sm')
def preprocess(text):
  doc=nlp(text)
  no_stop_word=[]
  for token in doc:
    if not token.is_stop:
      no_stop_word.append(token.text)
  return " ".join(no_stop_word)
preprocess('Wow... Loved this place.')



'Wow ... Loved place .'

In [28]:
df['filtered_review']=df.Review.apply(lambda x: preprocessing(x))

In [29]:
df['filtered_review']

0                                     wow... love place.
1                                            crust good.
2                                    tasti textur nasty.
3      stop late may bank holiday rick steve recommen...
4                              select menu great prices.
                             ...                        
995                    think food flavor textur lacking.
996                              appetit instantli gone.
997                        overal impress would go back.
998    whole experi underwhelming, think we'll go nin...
999    then, wast enough life there, pour salt wound ...
Name: filtered_review, Length: 1000, dtype: object

In [None]:
df.head(20)

Unnamed: 0,Review,Liked,filtered_review
0,Wow... Loved this place.,1,wow... love place.
1,Crust is not good.,0,crust good.
2,Not tasty and the texture was just nasty.,0,tasti textur nasty.
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great prices.
5,Now I am getting angry and I want my damn pho.,0,get angri want damn pho.
6,Honeslty it didn't taste THAT fresh.),0,honeslti tast fresh.)
7,The potatoes were like rubber and you could te...,0,potato like rubber could tell made ahead time ...
8,The fries were great too.,1,fri great too.
9,A great touch.,1,great touch.


Sentiment analysis

In [30]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time


In [31]:
t=time.time()
nltk.download('vader_lexicon')
analyzer=SentimentIntensityAnalyzer()
nlp_sentiment_score=[]
for word in df['Review']:
  score=analyzer.polarity_scores(word)
  nlp_sentiment_score.append(score)
print(f'semtiment scoring time :{time.time()-t:.2f} seconds')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


semtiment scoring time :0.21 seconds


In [32]:
df['positive_sentiment']=[score['pos'] for score in nlp_sentiment_score]
df['negative_sentiment']=[score['neg'] for score in nlp_sentiment_score]
df['neutral_sentiment']=[score['neu'] for score in nlp_sentiment_score]

In [33]:
from sklearn.preprocessing import MinMaxScaler
min_max_scalar=MinMaxScaler()


In [34]:
import numpy as np
df['positive_sentiment']=min_max_scalar.fit_transform(np.array(df['positive_sentiment']).reshape(-1,1))
df['negative_sentiment']=min_max_scalar.fit_transform(np.array(df['negative_sentiment']).reshape(-1,1))
df['neutral_sentiment']=min_max_scalar.fit_transform(np.array(df['neutral_sentiment']).reshape(-1,1))

In [35]:
df.head()

Unnamed: 0,Review,Liked,filtered_review,positive_sentiment,negative_sentiment,neutral_sentiment
0,Wow... Loved this place.,1,wow... love place.,0.565,0.0,0.435
1,Crust is not good.,0,crust good.,0.0,0.445,0.555
2,Not tasty and the texture was just nasty.,0,tasti textur nasty.,0.0,0.34,0.66
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...,0.322,0.093,0.585
4,The selection on the menu was great and so wer...,1,select menu great prices.,0.272,0.0,0.728


count vectorization

In [None]:
df['word_count']=df['Review'].apply(lambda x: (x.count(" ")+1)/50)

In [None]:
df.head()

Unnamed: 0,Review,Liked,filtered_review,positive_sentiment,negative_sentiment,neutral_sentiment,word_count
0,Wow... Loved this place.,1,wow... love place.,0.565,0.0,0.435,0.08
1,Crust is not good.,0,crust good.,0.0,0.445,0.555,0.08
2,Not tasty and the texture was just nasty.,0,tasti textur nasty.,0.0,0.34,0.66,0.16
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...,0.322,0.093,0.585,0.3
4,The selection on the menu was great and so wer...,1,select menu great prices.,0.272,0.0,0.728,0.24


In [36]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
vectorizer=TfidfVectorizer()
x=vectorizer.fit_transform(df['filtered_review']).toarray()
y=df.iloc[:,1].values

In [49]:
df.iloc[:,1]

0      1
1      0
2      0
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: Liked, Length: 1000, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [40]:
x_train.shape

(800, 1853)

In [None]:
x_test.shape

(200, 1853)

In [41]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score

In [42]:
x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

RandomForest

In [50]:
model1=RandomForestClassifier()
model1.fit(x_train,y_train)
y_pred=model1.predict(x_test)

In [51]:
y_pred

array([0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1])

In [53]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.89      0.63      0.74       135
           1       0.52      0.83      0.64        65

    accuracy                           0.69       200
   macro avg       0.70      0.73      0.69       200
weighted avg       0.77      0.69      0.70       200



In [54]:
model1=GaussianNB()
model1.fit(x_train,y_train)
y_pred=model1.predict(x_test)

In [55]:
y_pred

array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1])

In [56]:
print(accuracy_score(y_pred,y_test))

0.685


In [57]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.49      0.77      0.60        61
           1       0.87      0.65      0.74       139

    accuracy                           0.69       200
   macro avg       0.68      0.71      0.67       200
weighted avg       0.75      0.69      0.70       200



Multinomial Naive Bayes

In [58]:
model2=MultinomialNB()
model2.fit(x_train,y_train)
y_pred2=model2.predict(x_test)
print(accuracy_score(y_pred,y_test))

0.685


In [None]:
x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

Hyperparameter tuning

In [None]:
vectorizer.get_feature_names_out()[1200]

'perfect'

In [59]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [60]:
cross_val_score(SVC(gamma='auto',C=30,kernel='linear'),x,y,cv=5)

array([0.74 , 0.735, 0.74 , 0.765, 0.72 ])

In [61]:
cross_val_score(SVC(gamma='auto',C=20,kernel='rbf'),x,y,cv=5)

array([0.6  , 0.545, 0.575, 0.655, 0.635])

In [62]:
classifier=GridSearchCV(SVC(gamma='auto'),{
                            'C':[10,20,30],'kernel':['rbf','linear'],
},cv=5,return_train_score=False)

In [63]:
classifier.fit(x,y)

In [64]:
pd.DataFrame(classifier.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.498408,0.024763,0.269191,0.007831,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.6,0.545,0.575,0.655,0.635,0.602,0.039699,4
1,0.490479,0.060757,0.08773,0.006168,10,linear,"{'C': 10, 'kernel': 'linear'}",0.74,0.755,0.73,0.805,0.74,0.754,0.026721,1
2,0.537866,0.071501,0.286153,0.049924,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.6,0.545,0.575,0.655,0.635,0.602,0.039699,4
3,0.476734,0.054786,0.078413,0.003831,20,linear,"{'C': 20, 'kernel': 'linear'}",0.74,0.745,0.735,0.805,0.735,0.752,0.026758,2
4,0.491902,0.029142,0.278401,0.006788,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.6,0.545,0.575,0.655,0.635,0.602,0.039699,4
5,0.574033,0.063117,0.088465,0.0108,30,linear,"{'C': 30, 'kernel': 'linear'}",0.74,0.735,0.74,0.765,0.72,0.74,0.014491,3


In [65]:
model_params={
    'svm':{
        'model':SVC(gamma='auto'),
        'params':{
        'kernel':['linear','rbf'],
        'C':[1,10,20],
    }
    },
    'random_forest':{
        'model':
         RandomForestClassifier(),
    'params':{
        'n_estimators':[1,5,10]
    }
    },
    'logistic_regression':
    {
        'model':LogisticRegression(multi_class='auto',solver='liblinear'),
        'params':{
            'C':[1,5,10]
        }
    },

}

In [66]:
scores=[]
for model_name,model_param in model_params.items():
  clf2=GridSearchCV(model_param['model'],model_param['params'],return_train_score=False)
  clf2.fit(x,y)
  scores.append({
      'model': model_name,
      'best_score': clf2.best_score_,
      'best params': clf2.best_params_
  })

In [67]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best params
0,svm,0.761,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.723,{'n_estimators': 10}
2,logistic_regression,0.763,{'C': 5}


In [68]:
x_train_2d=np.stack(x_train)
x_test_2d=np.stack(x_test)

In [69]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scale_train_embed=scaler.fit_transform(x_train_2d)
scale_test_embed=scaler.fit_transform(x_test_2d)

In [None]:
x_train_embed=np.array(scale_train_embed).reshape(-1,1)
x_test_embed=np.array(scale_test_embed).reshape(-1,1)

MultinomialNB after HyperParameterTuning

In [70]:
clf=MultinomialNB()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print("accuracy score is: {}".format(accuracy_score(y_pred,y_test)))

print(classification_report(y_pred,y_test))

accuracy score is: 0.75
              precision    recall  f1-score   support

           0       0.74      0.74      0.74        96
           1       0.76      0.76      0.76       104

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200



In [71]:
clf2=RandomForestClassifier()
clf2.fit(x_train,y_train)
y_pred=clf2.predict(x_test)
print("accuracy score is: {}".format(accuracy_score(y_pred,y_test)))

accuracy score is: 0.68


Logistic Regression

In [72]:
clf2=LogisticRegression()
clf2.fit(x_train_2d,y_train)
y_pred=clf2.predict(x_test_2d)
print("accuracy score is: {}".format(accuracy_score(y_pred,y_test)))
print(classification_report(y_pred,y_test))

accuracy score is: 0.735
              precision    recall  f1-score   support

           0       0.80      0.69      0.74       111
           1       0.67      0.79      0.73        89

    accuracy                           0.73       200
   macro avg       0.74      0.74      0.73       200
weighted avg       0.74      0.73      0.74       200



SVM

In [73]:
clf3=SVC(kernel='linear',C=20,gamma='auto')
clf3.fit(x_train,y_train)
y_pred=clf3.predict(x_test)
print("accuracy score is: {}".format(accuracy_score(y_pred,y_test)))
print(classification_report(y_pred,y_test))

accuracy score is: 0.7
              precision    recall  f1-score   support

           0       0.75      0.67      0.71       108
           1       0.65      0.74      0.69        92

    accuracy                           0.70       200
   macro avg       0.70      0.70      0.70       200
weighted avg       0.71      0.70      0.70       200



Decision Tree

In [78]:
from sklearn.tree import DecisionTreeClassifier
clf4=DecisionTreeClassifier()
clf4.fit(x_train,y_train)
y_pred=clf4.predict(x_test)
print("accuracy score is: {}".format(accuracy_score(y_pred,y_test)))
print(classification_report(y_pred,y_test))

accuracy score is: 0.71
              precision    recall  f1-score   support

           0       0.85      0.65      0.74       126
           1       0.58      0.81      0.67        74

    accuracy                           0.71       200
   macro avg       0.72      0.73      0.71       200
weighted avg       0.75      0.71      0.71       200



In [80]:
def prediction(text):
  review_words=text.lower()
  #review_words=nlp(text)
  review=[words for words in review_words if not words in set(stopwords.words('english'))]
  review=" ".join(str(v) for v in review)
  ps=PorterStemmer()
  review=[ps.stem(word) for word in review]
  review=[lemmatizer.lemmatize(word) for word in review]
  review=" ".join(review)
  #return review
  final_review=vectorizer.transform([review]).toarray()
  #review=np.array(review).reshape(-1,1)
  return model1.predict(final_review)
prediction('lovely food here.')


array([1])

In [83]:
text='food is goot here'
if prediction(text):
  print('It is positive review')
else:
  print('It is negative review')

It is positive review


In [None]:
df['result']=df.Review.apply(lambda x:preprocessing(x))

In [84]:
import os
df.to_csv(os.path.join('/content','restaurant_analysis_output.csv'))