In [31]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer
import time
path="/home/local/AAPL/ashinde/akshada_work/Document Analysis/Text Classification/"
data=pd.read_csv("Data/Restaurant_Reviews.tsv",delimiter="\t")

In [32]:
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [33]:
data.isnull().sum()          #check if any value is null

Review    0
Liked     0
dtype: int64

In [35]:
corpus=[]
lemma=WordNetLemmatizer()
#ps = PorterStemmer()
stopwords=stopwords.words("english")
remv=set(("no","not"))
stopwords=set(stopwords)-remv

In [36]:
def clean_review(review):
    review=review.lower()
    review=[word.strip(string.punctuation) for word in review.split(" ")]
    review=[word for word in review if not any(word.isdigit() for c in word )]
    review=[word for word in review if word not in stopwords]
    review=[word for word in review if len(word)>1]
    lemmas=[lemma.lemmatize(word) for word in review ]
    #stemer=[ps.stem(word) for word in review ]
    corpus.append(lemmas)
    return " ".join(lemmas)

In [37]:
start_time=time.time()
clean_data=data.copy()
clean_data["Review"]=[clean_review(review) for review in clean_data["Review"]]
end_time=time.time()


In [73]:
print("cleaning time is : ",end_time-start_time)
print("\n\n------------------------------------------------------------")
print(clean_data["Review"])


cleaning time is :  0.03133225440979004


------------------------------------------------------------
0                                        wow loved place
1                                         crust not good
2                                not tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                             selection menu great price
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997              overall not impressed would not go back
998    whole experience underwhelming think we'll go ...
999    wasted enough life poured salt wound drawing t...
Name: Review, Length: 1000, dtype: object


# CountVectorization

In [66]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score
cv = CountVectorizer(max_features = 2000)  
X = cv.fit_transform(clean_data["Review"]).toarray()  
y = data.iloc[:,1]



In [67]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

# Naive_Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
model=classifier.fit(x_train, y_train)

In [43]:
y_pred=classifier.predict(x_test)

In [44]:
print("score: ",classifier.score(x_test,y_test))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))


score:  0.71
confusion matrix:
 [[59 46]
 [12 83]]
Precision: 0.6434108527131783
Recall: 0.8736842105263158
Accuracy: 0.71
F1-score 0.7410714285714286


# RandomForest

In [47]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 70, random_state = 42)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [48]:
y_pred=rf.predict(x_test)

In [50]:

print("score: ",classifier.score(x_test,y_test))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))


score:  0.895
confusion matrix:
 [[92 10]
 [29 69]]
Precision: 0.8734177215189873
Recall: 0.7040816326530612
Accuracy: 0.805
F1-score 0.7796610169491525


# SVM

In [68]:
from sklearn.metrics import accuracy_score
from sklearn import svm
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_train,y_train)
y_pred = SVM.predict(x_test)# Use accuracy_score function to get the accuracy


SVM Accuracy Score ->  82.0


In [69]:
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))
print("score: ",classifier.score(x_test,y_test))

confusion matrix:
 [[84 22]
 [14 80]]
Precision: 0.7843137254901961
Recall: 0.851063829787234
Accuracy: 0.82
F1-score 0.8163265306122448
score:  0.885


# TFIDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
tfidf = TfidfVectorizer(min_df = 10)
tfidf_result = tfidf.fit_transform(clean_data["Review"]).toarray()
tfidf_df = pd.DataFrame(tfidf_result, columns = tfidf.get_feature_names())
tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
tfidf_df.index = clean_data.index
data= pd.concat([clean_data, tfidf_df], axis=1)


In [52]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

# Naive Bayes

In [53]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
model=classifier.fit(x_train, y_train)

In [54]:
y_pred=model.predict(x_test)

In [56]:
print("score: ",classifier.score(x_test,y_test))
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))


score:  0.675
confusion matrix:
 [[51 53]
 [12 84]]
Precision: 0.6131386861313869
Recall: 0.875
Accuracy: 0.675
F1-score 0.721030042918455


# RandomForest

In [57]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 70, random_state = 42)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [58]:
y_pred=rf.predict(x_test)

In [60]:
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))
print("score: ",classifier.score(x_test,y_test))

confusion matrix:
 [[83 21]
 [22 74]]
Precision: 0.7789473684210526
Recall: 0.7708333333333334
Accuracy: 0.785
F1-score 0.774869109947644
score:  0.675


# SVM

In [None]:
from sklearn.metrics import accuracy_score
from sklearn import svm
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(x_train,y_train)
y_pred = SVM.predict(x_test)# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(y_pred, y_test)*100)

In [64]:
print("confusion matrix:\n",confusion_matrix(y_test,y_pred))
print("Precision:",precision_score(y_test,y_pred))
print("Recall:",recall_score(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test,y_pred))
print("F1-score",f1_score(y_test,y_pred))
print("score: ",classifier.score(x_test,y_test))

confusion matrix:
 [[80 24]
 [19 77]]
Precision: 0.7623762376237624
Recall: 0.8020833333333334
Accuracy: 0.785
F1-score 0.7817258883248732
score:  0.675
