In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string
import nltk

In [2]:
review = pd.read_csv("df.csv")

In [3]:
#review.shape
review['flagged'].value_counts()



N    20752
Y     6206
Name: flagged, dtype: int64

In [4]:
Not_Fake_Percentage = (20752/(20752+6206))*100
Fake_Percentage = (6206/(20752+6206))*100
print(Not_Fake_Percentage)
print(Fake_Percentage)

76.97900437717932
23.020995622820685


In [5]:
review.head

<bound method NDFrame.head of                      reviewID              reviewerID            restaurantID  \
0      GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA  pbEiXam9YJL3neCYHGwLUA   
1                     0LpVTc3  TRKxLC3y-ZvP45e5iilMtw  pbEiXam9YJL3neCYHGwLUA   
2               tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g  pbEiXam9YJL3neCYHGwLUA   
3                         iSN  DlwexC7z88ymAzu45skODw  pbEiXam9YJL3neCYHGwLUA   
4                      Jmwrh7  kW2dk1CWihmh3g7k9N2G8A  pbEiXam9YJL3neCYHGwLUA   
...                       ...                     ...                     ...   
26953              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g  v9qEDxi3t-P0CmGWAkkGvw   
26954               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg  RRflazDtBkqqpvEz2hbV2w   
26955    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw  zI0E_yruu58ea-xq9aHi-w   
26956                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw  6XVXM78gBuU3gpq2hTOgJA   
26957    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A  Lr4tZOsttQT-BgFtUkUTaQ  

In [6]:
#Randomly shuffling the dataframe
review = review.sample(frac = 1)

In [7]:
#review.info()

In [8]:
review.reset_index(inplace = True)
review.drop(["index"], axis = 1, inplace = True)

In [9]:
#review.columns


In [10]:
#review.head()

In [11]:
#Creating a function to convert the text in lowercase, remove the extra space, special chr.
def wordopt(reviewContent):
    reviewContent = re.sub('\[.*?\]', '', str(reviewContent))
    reviewContent = re.sub("\\W"," ",reviewContent) 
    reviewContent = re.sub('https?://\S+|www\.\S+', '',reviewContent )
    reviewContent = re.sub('<.*?>+', '', reviewContent)
    reviewContent= re.sub('[%s]' % re.escape(string.punctuation), '',reviewContent )
    reviewContent = re.sub('\n', '',reviewContent )
    reviewContent = re.sub('\w*\d\w*', '', reviewContent)    
    return reviewContent

In [12]:
review["reviewContent"] = review["reviewContent"].apply(wordopt)

In [13]:
def feature_engineering(df):
    # Maximum Number of Reviews per day per reviewer
    mnr_df1 = df[['reviewerID', 'date']].copy()
    mnr_df2 = mnr_df1.groupby(by=['date', 'reviewerID']).size().reset_index(name='mnr')
    mnr_df2['mnr'] = mnr_df2['mnr'] / mnr_df2['mnr'].max()
    df = df.merge(mnr_df2, on=['reviewerID', 'date'], how='inner')

    # Review Length
    df['rl'] = df['reviewContent'].apply(lambda x: len(x.split()))

    # Review Deviation
    df['rd'] = abs(df['rating'] - df['restaurantRating']) / 4

    # Maximum cosine similarity
    review_data = df

    res = OrderedDict()

    # Iterate over data and create groups of reviewers
    for row in review_data.iterrows():
        if row[1].reviewerID in res:
            res[row[1].reviewerID].append(row[1].reviewContent)
        else:
            res[row[1].reviewerID] = [row[1].reviewContent]

In [14]:
def under_sampling(df):
    print("Under-Sampling Data")
    # Count of Reviews
    # print("Authentic", len(df[(df['flagged'] == 'N')]))
    # print("Fake", len(df[(df['flagged'] == 'Y')]))

    sample_size = len(df[(df['flagged'] == 'Y')])

    authentic_reviews_df = df[df['flagged'] == 'N']
    fake_reviews_df = df[df['flagged'] == 'Y']

    authentic_reviews_us_df = authentic_reviews_df.sample(sample_size)
    under_sampled_df = pd.concat([authentic_reviews_us_df, fake_reviews_df], axis=0)

    # print("Under-Sampled Fake", len(under_sampled_df[(under_sampled_df['flagged'] == 'Y')]))
    # print("Under-Sampled Authentic", len(under_sampled_df[(under_sampled_df['flagged'] == 'N')]))

    # Graph of Data Distribution
    # fig, ax = plt.subplots(figsize=(6, 4))
    # sns.countplot(x='flagged', data=under_sampled_df)
    # plt.title("Count of Reviews")
    # plt.show()
    print("Under-Sampling Complete")
    return under_sampled_df

In [15]:
def semi_supervised_learning(df, model, algorithm, threshold=0.8, iterations=40):
    df = df.copy()
    print("Training "+algorithm+" Model")
    labels = df['flagged']

    df.drop(['reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location', 'yelpJoinDate', 'flagged',
             'reviewContent', 'restaurantRating'], axis=1, inplace=True)

    train_data, test_data, train_label, test_label = train_test_split(df, labels, test_size=0.25, random_state=42)

    test_data_copy = test_data.copy()
    test_label_copy = test_label.copy()

    all_labeled = False

    current_iteration = 0

    # param_grid = {
    #     'n_estimators': [10, 500],
    #     'max_features': ['auto', 'sqrt', 'log2'],
    #     'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
    #     'criterion': ['gini', 'entropy']
    # }
    # grid_clf_acc = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
    #
    # grid_clf_acc.fit(train_data, train_label)

    pbar = tqdm(total=iterations)

    while not all_labeled and (current_iteration < iterations):
        # print("Before train data length : ", len(train_data))
        # print("Before test data length : ", len(test_data))
        current_iteration += 1
        model.fit(train_data, train_label)

        probabilities = model.predict_proba(test_data)
        pseudo_labels = model.predict(test_data)

        indices = np.argwhere(probabilities > threshold)
        # print("rows above threshold : ", len(indices))
        for item in indices:
            train_data.loc[test_data.index[item[0]]] = test_data.iloc[item[0]]
            train_label.loc[test_data.index[item[0]]] = pseudo_labels[item[0]]
        test_data.drop(test_data.index[indices[:, 0]], inplace=True)
        test_label.drop(test_label.index[indices[:, 0]], inplace=True)
        # print("After train data length : ", len(train_data))
        # print("After test data length : ", len(test_data))
        print("--" * 20)

        if len(test_data) == 0:
            print("Exiting loop")
            all_labeled = True
        pbar.update(1)
    pbar.close()
    predicted_labels = model.predict(test_data_copy)

    # print('Best Params : ', grid_clf_acc.best_params_)
    print(algorithm + ' Model Results')
    print('--' * 20)
    print('Accuracy Score : ' + str(accuracy_score(test_label_copy, predicted_labels)))
    print('Precision Score : ' + str(precision_score(test_label_copy, predicted_labels, pos_label="Y")))
    print('Recall Score : ' + str(recall_score(test_label_copy, predicted_labels, pos_label="Y")))
    print('F1 Score : ' + str(f1_score(test_label_copy, predicted_labels, pos_label="Y")))
    print('Confusion Matrix : \n' + str(confusion_matrix(test_label_copy, predicted_labels)))
    plot_confusion_matrix(test_label_copy, predicted_labels, classes=['N', 'Y'],
                          title=algorithm + ' Confusion Matrix').show()


In [16]:
def plot_confusion_matrix(y_true, y_pred, classes, title=None, cmap=plt.cm.Blues):
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes,
           yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    return plt

In [17]:
#Defining dependent and independent variable as x and y
x = review["reviewContent"]
y = review["flagged"]

In [18]:
#Splitting the dataset into training set and testing set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [19]:
#Convert text to vectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

In [20]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

In [21]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)

LogisticRegression()

In [22]:
pred_lr=LR.predict(xv_test)

In [23]:
review['flagged'].value_counts()

N    20752
Y     6206
Name: flagged, dtype: int64

In [24]:
LR.score(xv_test, y_test)

0.7864985163204747

In [25]:
from sklearn.tree import DecisionTreeClassifier

In [26]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier()

In [27]:
pred_dt = DT.predict(xv_test)

In [28]:
DT.score(xv_test, y_test)

0.6899109792284867

In [29]:
print(classification_report(y_test, pred_dt))

              precision    recall  f1-score   support

           N       0.80      0.80      0.80      5202
           Y       0.32      0.31      0.31      1538

    accuracy                           0.69      6740
   macro avg       0.56      0.56      0.56      6740
weighted avg       0.69      0.69      0.69      6740



In [30]:
from sklearn.ensemble import GradientBoostingClassifier

In [31]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(random_state=0)

In [32]:
pred_gbc = GBC.predict(xv_test)

In [33]:
GBC.score(xv_test, y_test)

0.7793768545994065

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)

RandomForestClassifier(random_state=0)

In [36]:
pred_rfc = RFC.predict(xv_test)

In [37]:
RFC.score(xv_test, y_test)

0.7698813056379822

In [38]:
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           N       0.77      0.99      0.87      5202
           Y       0.42      0.02      0.04      1538

    accuracy                           0.77      6740
   macro avg       0.59      0.51      0.45      6740
weighted avg       0.69      0.77      0.68      6740



In [39]:
def output_lable(n):
    if n == 0:
        return "Not A Fake Review"
    elif n == 1:
        return "Fake Review"
    
def manual_testing(review):
    testing_review = {"reviewContent":[review]}
    new_def_test= pd.DataFrame(testing_review)
    new_def_test["reviewContent"] = new_def_test["reviewContent"].apply(wordopt) 
    new_x_test = new_def_test["reviewContent"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)

    return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {}".format(output_lable(pred_LR[0]), 
                                                                                                              output_lable(pred_DT[0]), 
                                                                                                              output_lable(pred_GBC[0]), 
                                                                                                              output_lable(pred_RFC[0])))

In [None]:
review = str(input())
manual_testing(review)