In [None]:
import re
from collections import defaultdict

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
from nltk import pos_tag
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import model_selection, naive_bayes, svm
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelEncoder
matplotlib.style.use('seaborn')
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#### Data Processing

In [None]:
# Seed numpy's global RNG: both the down-sampling and the train/test split below draw from it.
np.random.seed(500)
predict_data = pd.read_csv('scraping_data.csv') # the data scraped from website, used for prediction
data = pd.read_csv('merged_data.csv') # the data we got from professor Shebuti Rayana, used for learning

# Keep only the columns we need and drop rows without review text up front;
# a NaN review would otherwise crash the tokenisation step.
data = data[['review', 'label']].dropna(subset=['review']).copy()
data['label'] = np.where((data['label'] == 1), 1, 0)
positive = data[data['label'] == 1]
negative = data[data['label'] == 0]

# since the data is unbalanced, the number of true reviews is around 10 times of the false reviews
# so we subset the volume of the positive data
positive = positive.sample(frac = 0.16)
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
Corpus = pd.concat([positive, negative]).reset_index(drop = True)

# word tokenize (lower-cased first)
Corpus['review'] = [word_tokenize(str(entry).lower()) for entry in Corpus['review']]

# Map the first letter of the Penn Treebank tag to a WordNet POS; anything else is treated as a noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Build the stop-word set and the lemmatizer once instead of once per word / per review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

text_final = []
for entry in Corpus['review']:
    final_words = [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in stop_words and word.isalpha()
    ]
    # Stored as the string form of the token list; TfidfVectorizer re-tokenises it later.
    text_final.append(str(final_words))
Corpus['text_final'] = text_final

# split datasets
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'], Corpus['label'], test_size=0.3)

# Encode labels: fit on the training labels only, then reuse that mapping for the test labels.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

# TF-IDF features: the vocabulary and IDF weights are learned from the training split only,
# so no information from the test reviews leaks into the features.
Tfidf_vect = TfidfVectorizer(max_features = 5000)
Tfidf_vect.fit(Train_X)
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

#### Linear Regression

In [None]:
from sklearn import linear_model

# Fit ordinary least squares on the TF-IDF features; the outputs are continuous scores,
# which are thresholded into classes further down.
model = linear_model.LinearRegression()
model.fit(Train_X_Tfidf, Train_Y)
predictions_linear = model.predict(Test_X_Tfidf)

# prediction using linear regression
# Score all scraped reviews in one vectorised call and assign the whole column at once.
# The previous element-wise `predict_data['predict_Linear'][i] = ...` was chained assignment
# into a column that may not exist yet, and stored 1-element arrays instead of floats.
# NOTE(review): the scraped text is vectorised raw, without the stop-word removal and
# lemmatisation applied to the training corpus -- confirm this mismatch is acceptable.
scraped_reviews = predict_data['review'].astype(str)
predict_data['predict_Linear'] = model.predict(Tfidf_vect.transform(scraped_reviews))
    
# prediction performance
def c_m_analysis(true, pred, threshold):
    """Print precision, recall, f-score and accuracy for thresholded predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth labels, passed to ``confusion_matrix`` as ``y_true``.
    pred : array-like
        Raw model outputs; converted to 0/1 labels by ``get_classification``
        (defined elsewhere in this notebook) using ``threshold``.
    threshold : float
        Cut-off forwarded to ``get_classification``.

    Returns
    -------
    None
        The four metrics are printed, one per line.
    """
    predicted = get_classification(pred, threshold)
    # labels=[0, 1] forces a 2x2 matrix even when one class is absent from the
    # inputs, so the four-way unpacking below cannot fail.
    tn, fp, fn, tp = confusion_matrix(true, predicted, labels=[0, 1]).ravel()

    def _ratio(numerator, denominator):
        # A metric with an empty denominator is reported as 0.0 instead of NaN.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)  # identical to the true-positive rate
    f_score = _ratio(2 * precision * recall, precision + recall)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    print("Precision:\t\t\t%1.2f" % (precision))
    print("Recall/TPR:\t\t\t%1.2f" % (recall))
    print("f-score:\t\t\t%1.2f" % (f_score))
    print("Accuracy:\t\t\t%1.2f" % (accuracy))
# confusion matrix
def get_classification(predictions, threshold):  # take value of prediction -> 0, 1
    """Turn raw prediction scores into 0/1 class labels.

    Parameters
    ----------
    predictions : array-like
        Model outputs to threshold.
    threshold : float
        Scores strictly greater than this become 1, all others 0.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels with the same shape as ``predictions``.
        The output is sized from the argument itself rather than from the
        global ``predictions_linear``.
    """
    return (np.asarray(predictions) > threshold).astype(int)

# Evaluate the linear model only after get_classification exists: c_m_analysis calls it,
# so running the evaluation first raises NameError on a fresh kernel.
c_m_analysis(Test_Y, predictions_linear, 0.5)
# print() so the matrix is shown even though it is not the last expression of the cell
print(confusion_matrix(Test_Y, get_classification(predictions_linear, 0.5)))

# draw roc curve
# Out-of-sample ROC for the linear-regression scores, with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(Test_Y, predictions_linear)
area = auc(fpr, tpr)
roc_label = "Out-Sample ROC Curve with area = %1.2f" % area

plt.clf()
plt.plot(fpr, tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'k')  # diagonal = random classifier
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

#### Decision Trees

In [None]:
tree_model = tree.DecisionTreeClassifier(max_depth = 10, criterion = 'entropy')
# fit(X, y) only. The third positional parameter of fit() is sample_weight, so passing
# Train_Y there a second time gave every class-0 sample a weight of zero.
tree_model.fit(Train_X_Tfidf, Train_Y)
predictions_tree = tree_model.predict(Test_X_Tfidf) # prediction
# print() so the matrix is shown even though it is not the last expression of the cell
print(confusion_matrix(Test_Y, get_classification(predictions_tree, 0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_tree, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# ROC needs a continuous score; hard 0/1 predictions collapse the curve to a single point.
# Column 1 of predict_proba is the probability of class 1 (assumes classes_ == [0, 1]).
scores_tree = tree_model.predict_proba(Test_X_Tfidf)[:, 1]
(fpr, tpr, thresholds1) = roc_curve(Test_Y, scores_tree)

#### Naive Bayes

In [None]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf) # prediction
# print() so the matrix is shown even though it is not the last expression of the cell
print(confusion_matrix(Test_Y, get_classification(predictions_NB, 0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_NB, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# ROC needs a continuous score; hard 0/1 predictions collapse the curve to a single point.
# Column 1 of predict_proba is the probability of class 1 (assumes classes_ == [0, 1]).
scores_NB = Naive.predict_proba(Test_X_Tfidf)[:, 1]
(fpr, tpr, thresholds) = roc_curve(Test_Y, scores_NB)

#### SVM

In [None]:
# Linear-kernel SVM. `degree` and `gamma` are only used by non-linear kernels,
# so they have no effect here; they are kept to leave the call signature as it was.
SVM = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf) # prediction
# NOTE(review): this disables numpy's output truncation for the rest of the session,
# so any array printed afterwards is dumped in full -- confirm this is intended.
np.set_printoptions(threshold = np.inf)
# print() so the matrix is shown even though it is not the last expression of the cell
print(confusion_matrix(Test_Y,get_classification(predictions_SVM,0.5))) # confusion matrix
c_m_analysis(Test_Y, predictions_SVM, 0.5) # prediction performance
# roc curve is as the same codes as above, the only difference is following:
# ROC needs a continuous score; the signed distance to the separating hyperplane
# provides one, whereas hard 0/1 predictions collapse the curve to a single point.
scores_SVM = SVM.decision_function(Test_X_Tfidf)
(fpr, tpr, thresholds) = roc_curve(Test_Y, scores_SVM)

#### Rating Distribution Pie Charts

In [None]:
# the prediction results using SVM, reduced to rating + a boolean "predicted as class 1" flag
data = (
    pd.read_csv('Predict_SVM.csv')[['rating', 'predict_SVM']]
    .assign(predict_SVM=lambda frame: frame['predict_SVM'] == 1)
    .reset_index(level=0)
)

# rows: rating, columns: False / True counts -> one pie per predict_SVM value
rating_counts = data.groupby(['rating'])['predict_SVM'].value_counts().unstack()
d = rating_counts.plot.pie(subplots=True, autopct='%.2f%%', figsize=(10, 4.6))

#### Reviews Text Length Histograms

In [None]:
# Load the SVM prediction results; the code below reads its 'review', 'rating'
# and 'predict_SVM' columns.
df = pd.read_csv('Predict_SVM.csv')

def fix_review(input_review): # fix review content
    """Normalise a raw review into a list of lower-case word tokens.

    Punctuation (anything that is neither a word character nor whitespace) is
    removed, the text is lower-cased and then split on any run of whitespace.
    Non-breaking spaces, tabs and newlines therefore all act as word
    separators; deleting '\\xa0' outright would glue neighbouring words together.

    Parameters
    ----------
    input_review : str
        Raw review text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty review.

    Returns
    -------
    list of str
        The tokens in order; empty if there is no text.
    """
    if not isinstance(input_review, str):
        return []
    without_punctuation = re.sub(r'[^\w\s]', '', input_review)
    # str.split() with no argument splits on all Unicode whitespace and never
    # yields empty strings, so no separate filtering pass is needed.
    return without_punctuation.lower().split()
df['review'] = df['review'].apply(fix_review)

# Number of tokens per review, computed in one pass. The previous row-by-row
# `df['text_length'].loc[i] = ...` was chained assignment, which pandas may
# apply to a temporary copy instead of the frame.
df['text_length'] = df['review'].apply(len)

df_true = df[df['predict_SVM'] == 1]
df_fake = df[df['predict_SVM'] == 0]

bins = list(range(1, 40)) # set histogram bins


def plot_length_histograms(reviews, bins, y_max=130):
    """Draw one text-length histogram per star rating, side by side.

    Each panel histograms the ``text_length`` values of the reviews with that
    rating. (Histogramming the unstacked group sizes instead would show the
    distribution of the counts, not of the lengths.)

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must contain 'rating' and 'text_length' columns.
    bins : sequence of int
        Bin edges passed to ``Series.plot.hist``.
    y_max : int, optional
        Upper limit of every panel's y axis; 130 is the limit this notebook
        has always used.
    """
    ratings = sorted(reviews['rating'].dropna().unique())
    if not ratings:
        return
    # One column per rating actually present, 5 inches each (25x5 for five ratings).
    fig, axes = plt.subplots(1, len(ratings), figsize=(5 * len(ratings), 5), squeeze=False)
    for ax, rating in zip(axes[0], ratings):
        lengths = reviews.loc[reviews['rating'] == rating, 'text_length']
        lengths.plot.hist(grid=True, bins=bins, rwidth=1, ax=ax)
        ax.grid(axis='y', alpha=0.75)  # style this panel, not whichever axes is "current"
        ax.set_title(f"Stars: {rating}")
        ax.set_ylim([0, y_max])
    fig.tight_layout()


# True Review Text Length Distribution
plot_length_histograms(df_true, bins)

# Fake Review Text Length Distribution
plot_length_histograms(df_fake, bins)

#### Review Text Word Cloud

In [None]:
# Rebuild one long string per class from the token lists in df['review'].
# Reviews are joined with a space; concatenating them directly fuses the last
# word of one review with the first word of the next.
true_text = ' '.join(' '.join(tokens) for tokens in df_true['review'])
fake_text = ' '.join(' '.join(tokens) for tokens in df_fake['review'])
true_string = true_text.replace('\n\n', ' ').replace('\n', ' ')
fake_string = fake_text.replace('\n\n', ' ').replace('\n', ' ')


def show_wordcloud(text):
    """Render a word cloud of the 40 most frequent non-stop-words in ``text``."""
    plt.figure(figsize=(12,12))
    wordcloud = WordCloud(stopwords=STOPWORDS,background_color='white',width=1200,height=800,max_words=40).generate(text)
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()


show_wordcloud(true_string)  # reviews with predict_SVM == 1
show_wordcloud(fake_string)  # reviews with predict_SVM == 0

#### Weighted Sentiment Analysis

In [None]:
def vader_comparison(texts):
    """Print the per-sentence average VADER scores for each labelled text.

    Parameters
    ----------
    texts : sequence
        Items whose element 0 is a display name and element 1 is the text to
        analyse.

    For every item the text is split into sentences, each sentence is scored
    with ``SentimentIntensityAnalyzer.polarity_scores``, and the 'pos', 'neg',
    'neu' and 'compound' scores are averaged over the sentences. One table row
    is printed per item; nothing is returned.
    """
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        sentence_count = len(sentences)
        # Running means: each sentence contributes score / sentence_count.
        means = {'pos': 0, 'compound': 0, 'neu': 0, 'neg': 0}
        for sentence in sentences:
            scores = analyzer.polarity_scores(sentence)
            for key in means:
                means[key] += scores[key] / sentence_count
        print('%-10s'%name,'%1.2f\t'%means['pos'],'%1.2f\t'%means['neg'],'%1.2f\t'%means['neu'],'%1.2f\t'%means['compound'])

# Re-read the raw predictions: sentence splitting needs the original, untokenised text.
df1 = pd.read_csv('Predict_SVM.csv')
df1_true = df1[df1['predict_SVM'] == 1]
df1_fake = df1[df1['predict_SVM'] == 0]


def join_reviews(reviews):
    """Concatenate raw reviews into one text for sentence-level analysis.

    Missing reviews are skipped. Newlines become spaces and reviews are joined
    with a space, so the sentence tokenizer still sees a boundary between lines
    and between reviews; deleting newlines and joining with '' would run the
    end of one sentence straight into the start of the next. Backslashes are
    removed.
    """
    cleaned = (
        str(review).strip().replace('\n', ' ').replace("\\", '')
        for review in reviews.dropna()
    )
    return ' '.join(cleaned)


true_text = join_reviews(df1_true['review'])
fake_text = join_reviews(df1_fake['review'])
texts = [('true', true_text), ('fake', fake_text)]

# NOTE(review): the y1 / y2 values in the plotting cell below appear to be typed in
# by hand from this output -- refresh them whenever the printed scores change.
vader_comparison(texts)

In [None]:
# Average sentiment scores per class, one line per class across the four VADER measures.
x = ['pos', 'neg', 'neu', 'compound']
y1 = [0.20, 0.03, 0.77, 0.34]
y2 = [0.16, 0.08, 0.76, 0.14]
# plt.title('Weighted Sentiment Analysis')
plt.figure(figsize=(8,5))
for series, line_color, legend_label in (
    (y1, 'blue', 'true reviews'),
    (y2, 'red', 'fake reviews'),
):
    plt.plot(x, series, color=line_color, label=legend_label)
plt.legend()
plt.xlabel('sentiments')
plt.ylabel('weights')