In [None]:
import re
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab 
import scipy.stats as stats
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import model_selection, naive_bayes, svm
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
matplotlib.style.use('seaborn')
%matplotlib inline

#### Data Processing

In [None]:
# seed NumPy's global random state so the random sampling below is repeatable
np.random.seed(500)

# the data scraped from website, used for prediction
predict_data = pd.read_csv('scraping_data.csv')

# the labelled data, used for learning; only the review text and its label are kept
data = pd.read_csv('merged_data.csv')
data = data[['review', 'label']].copy()  # explicit copy: the column assignment below must not target a view
data['label'] = np.where((data['label'] == 1), 1, 0)  # label 1 stays 1, every other value becomes 0
positive = data[data['label'] == 1]
negative = data[data['label'] == 0]

# since the data is unbalanced, the number of true reviews is around 10 times of the false reviews
# so we subset the volume of the positive data
positive = positive.sample(frac = 0.16)
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same stacked frame
Corpus = pd.concat([positive, negative])
Corpus = Corpus.reset_index(drop = True)

# word tokenize
# Calling dropna(inplace=True) on the selected column does not remove rows from the frame,
# so a missing review would reach .lower() and raise. Drop the rows on the frame itself and
# renumber them so positions and index labels agree.
Corpus = Corpus.dropna(subset = ['review']).reset_index(drop = True)
Corpus['review'] = [word_tokenize(str(entry).lower()) for entry in Corpus['review']]

# first letter of the tag returned by pos_tag -> WordNet part of speech;
# any other first letter falls back to noun
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Built once: rebuilding the stop-word list and scanning it for every token is needlessly slow.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

text_final = []
for entry in Corpus['review']:
    # keep alphabetic, non-stop-word tokens and lemmatize each with its part of speech
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # stored as the string form of the token list; this column is what gets vectorised later
    text_final.append(str(Final_words))
Corpus['text_final'] = text_final

# split datasets (70% train / 30% test)
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'], test_size=0.3)

# encode labels: the mapping is learned from the training labels only and re-used
# unchanged for the test labels (re-fitting on the test set could yield a different mapping)
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# tfidf vectorisation of the review text
# The vocabulary and IDF weights are fit on the training split only, so no information
# from the held-out reviews leaks into the features.
Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(Train_X)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#### Linear Regression

In [None]:
from sklearn import linear_model

# fit an ordinary least-squares model on the TF-IDF features; its output is a continuous score
model = linear_model.LinearRegression()
model.fit(Train_X_Tfidf, Train_Y)
predictions_linear  = model.predict(Test_X_Tfidf)

# prediction using linear regression
# All scraped reviews are vectorised in one call and the scores are assigned as a whole
# column; this creates 'predict_Linear' if it does not exist and avoids chained indexing.
# NOTE(review): these reviews are vectorised as raw text, without the stop-word removal and
# lemmatisation applied to the training text - confirm that this is intended.
predict_data['predict_Linear'] = model.predict(Tfidf_vect.transform(predict_data['review'].astype(str)))
    
# prediction performance
def c_m_analysis(true, pred, threshold):
    """Print precision, recall, F-score and accuracy for thresholded predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth labels, expected to be 0/1.
    pred : array-like
        Predicted scores or labels; a value strictly greater than ``threshold``
        counts as class 1, anything else as class 0.
    threshold : float
        Decision threshold applied to ``pred``.

    Returns
    -------
    None
        The four metrics are printed. A metric whose denominator is zero is
        reported as 0.00.
    """
    # Thresholding is done inline (same rule as get_classification) because this function
    # is first called before that helper is defined further down in the notebook.
    predicted = (np.asarray(pred) > threshold).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 even when a class is absent, so the unpack never fails
    tn, fp, fn, tp = confusion_matrix(true, predicted, labels=[0, 1]).ravel()

    def _ratio(numerator, denominator):
        # guard against empty denominators (e.g. no predicted positives)
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)  # recall is the true-positive rate
    f_score = _ratio(2 * precision * recall, precision + recall)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    print("Precision:\t\t\t%1.2f" % (precision))
    print("Recall/TPR:\t\t\t%1.2f" % (recall))
    print("f-score:\t\t\t%1.2f" % (f_score))
    print("Accuracy:\t\t\t%1.2f" % (accuracy))
c_m_analysis(Test_Y, predictions_linear, 0.5)

# confusion matrix
def get_classification(predictions,threshold):  #take value of prediction -> 0, 1
    """Convert prediction scores into 0/1 class labels.

    Parameters
    ----------
    predictions : array-like
        Predicted scores (or labels).
    threshold : float
        A prediction strictly greater than this value becomes 1, otherwise 0.

    Returns
    -------
    numpy.ndarray
        Array with the same shape and dtype as ``predictions`` holding 0s and 1s.
    """
    # size the result from the argument itself, not from the global predictions_linear
    scores = np.asarray(predictions)
    classes = np.zeros_like(scores)
    classes[scores > threshold] = 1
    return classes
# a bare expression in the middle of a cell is not displayed, so print the matrix explicitly
print(confusion_matrix(Test_Y, get_classification(predictions_linear, 0.5)))

# draw roc curve
(fpr, tpr, thresholds) = roc_curve(Test_Y, predictions_linear)
area = auc(fpr, tpr)
plt.clf()
roc_axes = plt.gca()
roc_axes.plot(fpr, tpr, label = "Out-Sample ROC Curve with area = %1.2f" % area)
roc_axes.plot([0, 1], [0, 1], 'k')  # diagonal reference line
roc_axes.set_xlim([0.0, 1.0])
roc_axes.set_ylim([0.0, 1.0])
roc_axes.set_xlabel('False Positive Rate')
roc_axes.set_ylabel('True Positive Rate')
roc_axes.set_title('ROC')
roc_axes.legend(loc = "lower right")
plt.show()

#### Decision Trees

In [None]:
tree_model = tree.DecisionTreeClassifier(max_depth = 10, criterion = 'entropy')
# The third positional argument of fit() is sample_weight: passing Train_Y there used the
# 0/1 labels as weights, giving every class-0 sample zero weight. Fit on (X, y) only.
tree_model.fit(Train_X_Tfidf, Train_Y)
predictions_tree = tree_model.predict(Test_X_Tfidf) # prediction
# a bare expression in the middle of a cell is not displayed, so print the matrix explicitly
print(confusion_matrix(Test_Y, get_classification(predictions_tree, 0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_tree, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# the curve is built from class probabilities rather than hard 0/1 labels, which would give
# a single operating point; column 1 is the second class in classes_ (label 1 for 0/1 labels)
(fpr, tpr, thresholds1) = roc_curve(Test_Y, tree_model.predict_proba(Test_X_Tfidf)[:, 1])

#### Naive Bayes

In [None]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf) # prediction
# a bare expression in the middle of a cell is not displayed, so print the matrix explicitly
print(confusion_matrix(Test_Y, get_classification(predictions_NB, 0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_NB, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# the curve is built from class probabilities rather than hard 0/1 labels, which would give
# a single operating point; column 1 is the second class in classes_ (label 1 for 0/1 labels)
(fpr, tpr, thresholds) = roc_curve(Test_Y, Naive.predict_proba(Test_X_Tfidf)[:, 1])

#### SVM

In [None]:
# linear-kernel support vector classifier (degree and gamma are not used by the linear kernel)
SVM = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf) # prediction
np.set_printoptions(threshold = np.inf)  # print arrays in full instead of summarising them
# a bare expression in the middle of a cell is not displayed, so print the matrix explicitly
print(confusion_matrix(Test_Y,get_classification(predictions_SVM,0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_SVM, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# the curve is built from the signed distance to the separating hyperplane rather than hard
# 0/1 labels, which would give a single operating point
(fpr, tpr, thresholds) = roc_curve(Test_Y, SVM.decision_function(Test_X_Tfidf))

# New York restaurants prediction using SVM
# All scraped reviews are vectorised in one call and the predictions are assigned as a whole
# column; this creates 'predict_SVM' if it does not exist and avoids chained indexing.
# NOTE(review): these reviews are vectorised as raw text, without the stop-word removal and
# lemmatisation applied to the training text - confirm that this is intended.
predict_data['predict_SVM'] = SVM.predict(Tfidf_vect.transform(predict_data['review'].astype(str)))
predict_data['predict_SVM'] = np.where((predict_data['predict_SVM'] > 0.5), 1, 0)

# share of scraped reviews whose integer label equals the SVM prediction
# (counted without a variable named `sum`, which would shadow the builtin)
n_reviews = len(predict_data['review'])
n_correct = int((predict_data['label'].astype(int) == predict_data['predict_SVM']).sum())
predict_accuracy = n_correct / n_reviews if n_reviews else 0.0 # prediction accuracy