In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from datetime import datetime
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, pairwise_distances
from sklearn.metrics import confusion_matrix
from collections import OrderedDict
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
from tqdm import tqdm
from pythainlp.tokenize import word_tokenize


In [2]:
# Show every column and the full, untruncated text of each cell when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Cleaning data

In [3]:
def data_cleaning(df):
    """Normalise the ``date``, ``yelpJoinDate`` and ``reviewContent`` columns.

    The three columns are overwritten on ``df`` itself and ``df`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``yelpJoinDate`` and
        ``reviewContent``. ``yelpJoinDate`` values must parse with the
        format ``'%B %Y'`` (e.g. ``'June 2009'``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned columns.

    Raises
    ------
    ValueError
        If a ``yelpJoinDate`` value does not match ``'%B %Y'``.
    """
    # Strip one leading newline from each date. Done column-wise rather than
    # with `df['date'][i] = ...`, which treats positions as index labels and
    # therefore breaks on any frame whose index is not 0..n-1.
    df['date'] = df['date'].apply(
        lambda value: value[1:] if isinstance(value, str) and value.startswith('\n') else value)

    # Rewrite "<Month name> <year>" as "01/<month>/<year>".
    df['yelpJoinDate'] = df['yelpJoinDate'].apply(
        lambda value: datetime.strptime(value, '%B %Y').strftime('01/%m/%Y'))

    # Remove punctuation/symbols from the review text. Values are passed
    # through str() first, so non-string entries become their string form.
    strip_symbols = str.maketrans('', '', '!"#$&\'()*.:;<=>?@[\\]^_`{|}~')
    df['reviewContent'] = df['reviewContent'].apply(
        lambda text: str(text).translate(strip_symbols))

    return df

In [4]:
def data_cleaning_shopee(df):
    """Normalise the ``date`` and ``reviewContent`` columns of a review frame.

    Review text is cleaned in three steps, in this order: punctuation/symbols
    are removed, newlines are replaced by spaces, and emoji/pictographic
    characters are removed. The columns are overwritten on ``df`` itself and
    ``df`` is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date`` and ``reviewContent``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cleaned columns.
    """
    # Strip one leading newline from each date. Done column-wise rather than
    # with `df['date'][i] = ...`, which treats positions as index labels and
    # therefore breaks on any frame whose index is not 0..n-1.
    df['date'] = df['date'].apply(
        lambda value: value[1:] if isinstance(value, str) and value.startswith('\n') else value)

    # Remove punctuation/symbols. Values are passed through str() first, so
    # non-string entries become their string form.
    strip_symbols = str.maketrans('', '', '!"#$&\'()*.:;<=>?@[\\]^_`{|}~')
    df['reviewContent'] = df['reviewContent'].apply(
        lambda text: str(text).translate(strip_symbols))

    df['reviewContent'] = df['reviewContent'].apply(
        lambda text: text.replace('\n', ' '))

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
        "\U00002702-\U000027B0"  # dingbats
        # NOTE(review): this range is far wider than emoji. It covers most of
        # the BMP above U+24C2 (including CJK and Hangul), so such text is
        # removed too. Thai (U+0E00-U+0E7F) lies below it and is kept.
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"  # every supplementary-plane code point
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero-width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )

    df['reviewContent'] = df['reviewContent'].apply(
        lambda text: emoji_pattern.sub(r'', text))

    return df

## Feature Engineering

In [5]:
def _max_content_similarity(documents):
    """Return the largest pairwise TF-IDF cosine similarity among ``documents``.

    Returns 0.0 when there are fewer than two documents, or when the documents
    cannot be vectorised (for example an empty vocabulary or a non-string
    entry), so that one unusable reviewer never affects another reviewer.
    """
    if len(documents) < 2:
        return 0.0
    try:
        # Default min_df=1 replaces the former min_df=0: every vocabulary term
        # occurs in at least one document, so the result is the same, and
        # recent scikit-learn releases reject an integer min_df below 1.
        tfidf = TfidfVectorizer().fit_transform(documents)
    except (ValueError, AttributeError):
        return 0.0

    similarity = 1 - pairwise_distances(tfidf, metric='cosine')
    # Exclude each document's similarity with itself.
    np.fill_diagonal(similarity, -np.inf)
    return float(similarity.max())


def feature_engineering(df):
    """Add the behavioural and textual features used by the classifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``reviewerID``, ``date``, ``reviewContent``, ``rating``
        and ``restaurantRating``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with these additional columns; rows containing any null
        value are dropped:

        * ``scaledReviewPerDay`` - reviews by the same reviewer on the same
          date, divided by the largest such count in the data.
        * ``reviewsLength`` - number of tokens from pythainlp's
          ``word_tokenize`` (``newmm`` engine).
        * ``reviewsDeviation`` - ``abs(rating - restaurantRating) / 4``.
        * ``maximumContentSimilarity`` - highest TF-IDF cosine similarity
          between any two reviews by the same reviewer (0 if fewer than two).
    """
    # Reviews per reviewer per day, scaled by the busiest reviewer-day.
    per_day = (
        df.groupby(by=['date', 'reviewerID'])
        .size()
        .reset_index(name='reviewPerDay')
    )
    per_day['scaledReviewPerDay'] = per_day['reviewPerDay'] / per_day['reviewPerDay'].max()
    per_day = per_day.drop(columns=['reviewPerDay'])
    df = df.merge(per_day, on=['reviewerID', 'date'], how='inner')

    # Review length in tokens.
    df['reviewsLength'] = df['reviewContent'].apply(
        lambda text: len(word_tokenize(text, engine="newmm")))

    # Review deviation.
    # NOTE(review): the divisor 4 presumably is the widest gap on a 1-5 rating
    # scale - confirm against the data.
    df['reviewsDeviation'] = abs(df['rating'] - df['restaurantRating']) / 4

    # Collect every reviewer's review texts, in order of first appearance.
    reviews_by_reviewer = {}
    for reviewer_id, content in zip(df['reviewerID'], df['reviewContent']):
        reviews_by_reviewer.setdefault(reviewer_id, []).append(content)

    # Build all records first and create the frame once, instead of
    # enlarging an empty Series element by element.
    similarity_df = pd.DataFrame(
        [(reviewer_id, _max_content_similarity(documents))
         for reviewer_id, documents in reviews_by_reviewer.items()],
        columns=['reviewerID', 'maximumContentSimilarity'],
    )

    # Left join so every review row receives its reviewer's similarity score.
    df = pd.merge(df, similarity_df, on="reviewerID", how="left")

    # Drop rows that contain any missing value.
    return df.dropna()

## Sampling

In [6]:
def under_sampling(df, random_state=None):
    """Balance the classes by down-sampling authentic reviews.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``flagged`` column with values ``'Y'`` (fake) and
        ``'N'`` (authentic).
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled authentic rows followed by all fake rows. Original index
        labels are kept.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when there are fewer authentic than
        fake reviews (sampling is without replacement).
    """
    print("Under-Sampling Data")

    fake_reviews_df = df[df['flagged'] == 'Y']
    authentic_reviews_df = df[df['flagged'] == 'N']

    # Keep as many authentic reviews as there are fake ones.
    authentic_reviews_us_df = authentic_reviews_df.sample(
        len(fake_reviews_df), random_state=random_state)
    under_sampled_df = pd.concat([authentic_reviews_us_df, fake_reviews_df], axis=0)

    print("Under-Sampling Complete")
    return under_sampled_df

In [7]:
def over_sampling(df, random_state=None):
    """Balance the classes by up-sampling fake reviews with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``flagged`` column with values ``'Y'`` (fake) and
        ``'N'`` (authentic).
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the selection can be reproduced.
        The default ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        All authentic rows followed by the resampled fake rows. Original index
        labels are kept, so the result can contain duplicate index labels.
    """
    print("Over-Sampling Data")

    authentic_reviews_df = df[df['flagged'] == 'N']
    fake_reviews_df = df[df['flagged'] == 'Y']

    # Draw as many fake reviews (with repetition) as there are authentic ones.
    fake_reviews_os_df = fake_reviews_df.sample(
        len(authentic_reviews_df), replace=True, random_state=random_state)
    over_sampled_df = pd.concat([authentic_reviews_df, fake_reviews_os_df], axis=0)

    print("Over-Sampling Complete")
    return over_sampled_df

## Plot

In [8]:
def plot_confusion_matrix(y_true, y_pred, classes, title=None, cmap=plt.cm.Blues):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heat map.

    ``classes`` is used only for the tick labels on both axes; the matrix
    itself is computed by ``confusion_matrix(y_true, y_pred)``. Every cell is
    annotated with its integer count. Returns the ``matplotlib.pyplot``
    module, so the caller can chain ``.show()``.
    """
    matrix = confusion_matrix(y_true, y_pred)
    n_rows, n_cols = matrix.shape

    fig, ax = plt.subplots()
    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)

    # One tick per matrix row/column, labelled with the class names.
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes,
           yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Slant the x tick labels so that they do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each count into its cell; white text on cells above half the maximum.
    cutoff = matrix.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        count = matrix[row, col]
        text_color = "white" if count > cutoff else "black"
        ax.text(col, row, format(count, 'd'),
                ha="center", va="center",
                color=text_color)
    fig.tight_layout()

    return plt

## Learning

In [9]:
def supervised_learning(df, model, algorithm):
    """Fit ``model`` on a 75/25 split of ``df`` and report test-set metrics.

    The ``flagged`` column is the target. Identifier, profile, date, text and
    ``restaurantRating`` columns are removed before fitting. Accuracy,
    precision, recall and F1 (positive label ``"Y"``) and the confusion matrix
    are printed, and the confusion matrix is plotted.

    Returns the tuple ``(test_label, predicted_labels, model)``.
    """
    non_feature_columns = ['reviewID', 'reviewerID', 'restaurantID', 'date', 'name',
                           'location', 'yelpJoinDate', 'flagged', 'reviewContent',
                           'restaurantRating']

    print("Training " + algorithm + " Model")
    target = df['flagged']
    features = df.drop(columns=non_feature_columns)

    train_data, test_data, train_label, test_label = train_test_split(
        features, target, test_size=0.25, random_state=42)

    # A GridSearchCV hyper-parameter search is left disabled here; `model` is
    # fitted exactly as supplied by the caller.
    model.fit(train_data, train_label)
    predicted_labels = model.predict(test_data)

    print(algorithm + ' Model Results')
    print('--' * 20)
    scorers = [
        ('Accuracy Score : ', accuracy_score, {}),
        ('Precision Score : ', precision_score, {'pos_label': "Y"}),
        ('Recall Score : ', recall_score, {'pos_label': "Y"}),
        ('F1 Score : ', f1_score, {'pos_label': "Y"}),
        ('Confusion Matrix : \n', confusion_matrix, {}),
    ]
    for caption, scorer, extra in scorers:
        print(caption + str(scorer(test_label, predicted_labels, **extra)))

    figure_title = algorithm + ' Confusion Matrix'
    plot_confusion_matrix(test_label, predicted_labels, classes=['N', 'Y'],
                          title=figure_title).show()

    return test_label, predicted_labels, model

In [10]:
def semi_supervised_learning(df, model, algorithm, threshold=0.8, iterations=40):
    """Self-train ``model`` by pseudo-labelling confidently predicted test rows.

    ``df`` is split 75/25. On every iteration the model is fitted on the
    training set, and each still-unlabelled test row whose highest class
    probability exceeds ``threshold`` is moved into the training set with the
    predicted label. The loop stops after ``iterations`` rounds or once every
    test row has been moved. The model is then evaluated on the complete test
    split against its true labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``flagged`` (target), ``reviewContent`` and the
        identifier/profile columns dropped below. Not modified.
    model : estimator
        Must provide ``fit``, ``predict`` and ``predict_proba``.
    algorithm : str
        Name used in the printed headings and the plot title.
    threshold : float, default 0.8
        A row is pseudo-labelled when any class probability is above this.
    iterations : int, default 40
        Maximum number of fit/pseudo-label rounds.

    Returns
    -------
    tuple
        ``(model, results)`` where ``results`` is the test feature frame plus
        the columns ``reviewContent``, ``flagged`` and ``predicted``.
    """
    df = df.copy()
    print("Training "+algorithm+" Model")
    labels = df['flagged']

    df = df.drop(columns=['reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location',
                          'yelpJoinDate', 'flagged', 'restaurantRating', 'usefulCount',
                          'coolCount', 'funnyCount', 'complimentCount', 'tipCount', 'fanCount'])

    train_data, test_data, train_label, test_label = train_test_split(df, labels, test_size=0.25, random_state=42)

    # Keep the text for the results frame, but do not train on it.
    test_review_content = test_data['reviewContent']
    train_data = train_data.drop(columns=['reviewContent'])
    test_data = test_data.drop(columns=['reviewContent'])

    # Test rows that have not been pseudo-labelled yet. `test_data` itself is
    # left untouched for the final evaluation.
    unlabeled_data = test_data.copy()

    # The context manager closes the progress bar even if fitting raises.
    with tqdm(total=iterations) as pbar:
        for _ in range(iterations):
            model.fit(train_data, train_label)

            probabilities = model.predict_proba(unlabeled_data)
            pseudo_labels = model.predict(unlabeled_data)

            # Positional mask of rows with at least one class probability
            # above the threshold. Rows are moved by position, not by index
            # label, so duplicate labels (e.g. from over-sampling) cannot
            # overwrite training rows or drop unrelated test rows.
            confident = (probabilities > threshold).any(axis=1)
            if confident.any():
                promoted = unlabeled_data[confident]
                promoted_labels = pd.Series(pseudo_labels[confident], index=promoted.index,
                                            name=train_label.name)
                train_data = pd.concat([train_data, promoted], axis=0)
                train_label = pd.concat([train_label, promoted_labels], axis=0)
                unlabeled_data = unlabeled_data[~confident]
            print("--" * 20)

            all_labeled = len(unlabeled_data) == 0
            if all_labeled:
                print("Exiting loop")
            pbar.update(1)
            if all_labeled:
                break

    # NOTE(review): the rows scored here include rows the model was trained on
    # with pseudo-labels, so these metrics are not a held-out estimate.
    predicted_labels = model.predict(test_data)

    print(algorithm + ' Model Results')
    print('--' * 20)
    print('Accuracy Score : ' + str(accuracy_score(test_label, predicted_labels)))
    print('Precision Score : ' + str(precision_score(test_label, predicted_labels, pos_label="Y")))
    print('Recall Score : ' + str(recall_score(test_label, predicted_labels, pos_label="Y")))
    print('F1 Score : ' + str(f1_score(test_label, predicted_labels, pos_label="Y")))
    print('Confusion Matrix : \n' + str(confusion_matrix(test_label, predicted_labels)))
    plot_confusion_matrix(test_label, predicted_labels, classes=['N', 'Y'],
                          title=algorithm + ' Confusion Matrix').show()

    results = test_data.copy()
    results['reviewContent'] = test_review_content
    results['flagged'] = test_label
    results['predicted'] = predicted_labels

    return model, results
                          

## Make train dataset

In [11]:
# Load raw_thai_df.csv, using its first column as the index.
df1 = pd.read_csv('../Data/raw_thai_df.csv', index_col=0)

In [12]:
# Load shopee_follower.csv with the default integer index.
df2 = pd.read_csv('../Data/shopee_follower.csv')

In [13]:
# Parse `date` as datetimes, then write it back as MM/DD/YYYY strings.
df2['date'] = df2['date'].astype('datetime64[ns]')
df2['date'] = df2['date'].dt.strftime(r'%m/%d/%Y')

In [14]:
# Remove the label and reviewer-profile columns from df1 (in place); the
# remaining column names are used below to select df2's columns.
df1.drop(columns=['flagged', 'name', 'location', 'yelpJoinDate', 'usefulCount', 'coolCount', 'funnyCount', 'complimentCount', 'tipCount', 'fanCount'], inplace=True)

In [15]:
# Column names (and order) remaining in df1.
col_list = list(df1.columns )

In [16]:
# Keep only df1's columns, in df1's order. The explicit copy makes df2 an
# independent frame, so the cleaning step that later assigns to its columns
# does not operate on a slice of the originally loaded data.
df2 = df2[col_list].copy()

In [17]:
# Clean the date/text columns, add the engineered feature columns, and save
# the result to CSV (the next cell reloads it from disk).
df = data_cleaning_shopee(df2)
df = feature_engineering(df)
df.to_csv('../Data/thai_shopee_df.csv', index=False)

  df2['reviewerID'] = pd.Series([])
  df2['maximumContentSimilarity'] = pd.Series([])


In [18]:
# Reload the engineered dataset written by the previous cell.
df = pd.read_csv('../Data/thai_shopee_df.csv')

In [19]:
# Feature matrix for prediction: drop identifiers, date, raw text and the
# store rating; `df` itself is left unchanged.
test_df = df.drop(['reviewID', 'reviewerID', 'restaurantID', 'date', 'reviewContent', 'restaurantRating'], axis=1)

In [27]:
def _expand_k_suffix(value):
    """Turn an abbreviated count such as '1.2k' into 1200.0; return anything else unchanged."""
    text = str(value)
    # endswith() is also safe for empty strings, unlike indexing text[-1].
    if text.endswith('k'):
        return float(text[:-1]) * 1000
    return value


for col in test_df.columns:
    # After expanding the "k" suffix, cast the whole column to a numeric dtype
    # so it no longer mixes strings and floats.
    test_df[col] = pd.to_numeric(test_df[col].apply(_expand_k_suffix))

In [28]:
# Display the feature matrix after the "k"-suffix conversion.
test_df

Unnamed: 0,rating,reviewUsefulCount,friendCount,reviewCount,firstCount,scaledReviewPerDay,reviewsLength,reviewsDeviation,maximumContentSimilarity
0,5,6.0,7.0,1,0,0.25,93,0.025,0.0
1,5,18.0,60.0,1,0,0.25,37,0.025,0.0
2,5,16.0,36.0,1,1,0.25,83,0.025,0.0
3,5,12.0,11.0,1,0,0.25,74,0.025,0.0
4,5,2.0,10.0,1,0,0.25,24,0.025,0.0
...,...,...,...,...,...,...,...,...,...
1129,5,10.0,2,1,0,0.25,89,0.025,0.0
1130,5,3.0,26,1,0,0.25,25,0.025,0.0
1131,5,11.0,102,1,0,0.25,33,0.025,0.0
1132,5,12.0,186,1,0,0.25,56,0.025,0.0


### Train DF Columns
- rating = rating given in the review **<span style="color:CornflowerBlue">(raw review file)</span>**
- reviewUsefulCount = useful count of the user's reviews **<span style="color:CornflowerBlue">(counted from raw review file)</span>** 
- friendCount = number of the user's friends **<span style="color:CornflowerBlue">(joined from user file)</span>** 
- reviewCount = number of reviews written by the user **<span style="color:CornflowerBlue">(joined from user file)</span>** 
- firstCount = number of the user's first comments **<span style="color:CornflowerBlue">(counted from raw review file)</span>** 
- scaledReviewPerDay = number of reviews by the user on the same day, scaled by the maximum such count **<span style="color:CornflowerBlue">(calculated from raw review file)</span>** 
- reviewsLength = review length in tokens **<span style="color:CornflowerBlue">(calculated from raw review file)</span>** 
- reviewsDeviation = absolute difference between the user's rating and the store rating, divided by 4 **<span style="color:CornflowerBlue">(calculated from business and reviews files)</span>**
- maximumContentSimilarity = maximum cosine similarity between the user's own reviews **<span style="color:CornflowerBlue">(calculated from raw review file)</span>** 

In [35]:
# Display the feature matrix that is passed to the model below.
test_df

Unnamed: 0,rating,reviewUsefulCount,friendCount,reviewCount,firstCount,scaledReviewPerDay,reviewsLength,reviewsDeviation,maximumContentSimilarity
0,5,6.0,7.0,1,0,0.25,93,0.025,0.0
1,5,18.0,60.0,1,0,0.25,37,0.025,0.0
2,5,16.0,36.0,1,1,0.25,83,0.025,0.0
3,5,12.0,11.0,1,0,0.25,74,0.025,0.0
4,5,2.0,10.0,1,0,0.25,24,0.025,0.0
...,...,...,...,...,...,...,...,...,...
1129,5,10.0,2,1,0,0.25,89,0.025,0.0
1130,5,3.0,26,1,0,0.25,25,0.025,0.0
1131,5,11.0,102,1,0,0.25,33,0.025,0.0
1132,5,12.0,186,1,0,0.25,56,0.025,0.0


## Model

In [29]:
# Path of the pickled, already-trained model.
filename = '../Model/finalized_model.sav'

In [30]:
# NOTE: unpickling can execute arbitrary code, so only load model files that
# come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(test_df)

In [37]:
# Store the model's prediction for each row in a new `flag` column of df.
df['flag'] = result

In [39]:
# Save the reviews together with their predicted flags (index not written).
df.to_csv('../Data/thai_shopee_result_df.csv', index=False)