In [None]:
import sys
sys.path.append('..')

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import pickle
import Utils.dataframe as learning

In [None]:
# Lift pandas' display limits so every column and the full content of each
# cell are shown when a DataFrame is rendered.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

## Make train dataset

In [None]:
# df = pd.read_csv('../Data/raw_thai_df.csv', index_col=0)
# df = data_cleaning(df)
# df = feature_engineering(df)
# df.to_csv('../Data/thai_df.csv', index=False)


In [None]:
# Load the prepared dataset from its CSV cache (the commented-out cell above
# shows how that file is written).
df = pd.read_csv(filepath_or_buffer='../Data/thai_df.csv')

In [None]:
# Load the under-sampled dataset from its CSV cache.
# To rebuild the cache, uncomment the two lines below:
# under_sampled_df = under_sampling(df)
# under_sampled_df.to_csv('../Data/thai_under_sampled_df.csv', index=False)
under_sampled_df = pd.read_csv(filepath_or_buffer='../Data/thai_under_sampled_df.csv')

In [None]:
# Load the over-sampled dataset from its CSV cache.
# To rebuild the cache, uncomment the two lines below:
# over_sampled_df = over_sampling(df)
# over_sampled_df.to_csv('../Data/thai_over_sampled_df.csv', index=False)
over_sampled_df = pd.read_csv(filepath_or_buffer='../Data/thai_over_sampled_df.csv')

## Dataframe EDA

In [None]:
# df.head(5)

In [None]:
# df.columns

In [None]:
# Build the training frame by removing the listed columns from df, then
# cache it as CSV (row index not written).
train_df = df.drop(
    columns=[
        'reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location',
        'yelpJoinDate', 'flagged', 'reviewContent', 'restaurantRating',
        'usefulCount', 'coolCount', 'funnyCount',
        'complimentCount', 'tipCount', 'fanCount',
    ]
)
train_df.to_csv('../Data/thai_train_df.csv', index=False)

In [None]:
# Display the column labels of train_df.
train_df.columns

### Train DF Columns
- rating = comment rating **<span style="color:CornflowerBlue">(raw review file)</span>**
- reviewUsefulCount = number of user's review useful count raw **<span style="color:CornflowerBlue">(count from raw review file)</span>**
- friendCount = number of user's friend raw **<span style="color:CornflowerBlue">(join from user file)</span>**
- reviewCount = number of user's review count raw **<span style="color:CornflowerBlue">(join from user file)</span>**
- firstCount = number of user's first comment raw **<span style="color:CornflowerBlue">(count from raw review file)</span>**
- usefulCount = number of comment's useful count raw **<span style="color:CornflowerBlue">(raw review file)</span>**
- coolCount = number of comment's cool count raw **<span style="color:CornflowerBlue">(raw review file)</span>**
- funnyCount = number of comment's funny count raw **<span style="color:CornflowerBlue">(raw review file)</span>**
- complimentCount = review's compliment count raw **<span style="color:CornflowerBlue">(join from tip file)</span>**
- tipCount = user's tip count raw **<span style="color:CornflowerBlue">(join from tip file)</span>**
- fanCount = number of user's fan raw **<span style="color:CornflowerBlue">(join from user file)</span>**
- scaledReviewPerDay = scaled review per day raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>**
- reviewsLength = review length raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>**
- reviewsDeviation = store rating - user rating raw **<span style="color:CornflowerBlue">(calculate from business and reviews file)</span>**
- maximumContentSimilarity = maximum content similarity raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>**

## Model

In [None]:
# Random Forest configuration passed to the semi-supervised runs.
# max_features='sqrt' replaces the former 'auto' alias: for classifiers 'auto'
# meant sqrt(n_features), and the alias was deprecated in scikit-learn 1.1 and
# removed in 1.3, where passing it raises a parameter validation error.
rf = RandomForestClassifier(
    random_state=42,
    criterion='entropy',
    max_depth=14,
    max_features='sqrt',
    n_estimators=500,
)
# Gaussian Naive Bayes with library defaults; the second model passed to the runs.
nb = GaussianNB()

In [None]:
# Columns handed to the helper through its `drog_columns` keyword. The keyword
# is kept exactly as the call site spells it; presumably it matches the
# helper's signature -- verify in Utils.dataframe before renaming.
# NOTE(review): unlike the train_df cell, this list does not contain
# 'reviewContent' -- confirm that difference is intended.
excluded_columns = (
    'reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location', 'yelpJoinDate',
    'flagged', 'restaurantRating', 'usefulCount', 'coolCount', 'funnyCount',
    'complimentCount', 'tipCount', 'fanCount',
)

# Full dataset, Random Forest. Each call receives its own list copy so the
# helper cannot share (or mutate) the column list between runs.
learning.semi_supervised_learning(df, model=rf, threshold=0.7, iterations=15, algorithm='Random Forest',
                                  drog_columns=list(excluded_columns), target_column='flagged')

# Full dataset, Naive Bayes. Left as the last expression so the notebook
# still displays its return value.
learning.semi_supervised_learning(df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes',
                                  drog_columns=list(excluded_columns), target_column='flagged')


In [None]:
# Columns handed to the helper through its `drog_columns` keyword. The keyword
# is kept exactly as the call site spells it; presumably it matches the
# helper's signature -- verify in Utils.dataframe before renaming.
excluded_columns = (
    'reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location', 'yelpJoinDate',
    'flagged', 'restaurantRating', 'usefulCount', 'coolCount', 'funnyCount',
    'complimentCount', 'tipCount', 'fanCount',
)

# Under-sampled dataset, Random Forest. Each call receives its own list copy
# so the helper cannot share (or mutate) the column list between runs.
learning.semi_supervised_learning(under_sampled_df, model=rf, threshold=0.7, iterations=15, algorithm='Random Forest',
                                  drog_columns=list(excluded_columns), target_column='flagged')

# Under-sampled dataset, Naive Bayes. Left as the last expression so the
# notebook still displays its return value.
learning.semi_supervised_learning(under_sampled_df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes',
                                  drog_columns=list(excluded_columns), target_column='flagged')

In [None]:
# Columns handed to the helper through its `drog_columns` keyword. The keyword
# is kept exactly as the call site spells it; presumably it matches the
# helper's signature -- verify in Utils.dataframe before renaming.
excluded_columns = (
    'reviewID', 'reviewerID', 'restaurantID', 'date', 'name', 'location', 'yelpJoinDate',
    'flagged', 'restaurantRating', 'usefulCount', 'coolCount', 'funnyCount',
    'complimentCount', 'tipCount', 'fanCount',
)

# Over-sampled dataset, Random Forest. The returned pair is kept: `results`
# is written to CSV and `model` is pickled in the cells that follow.
model, results = learning.semi_supervised_learning(
    over_sampled_df, model=rf, threshold=0.7, iterations=15, algorithm='Random Forest',
    drog_columns=list(excluded_columns), target_column='flagged')

# Over-sampled dataset, Naive Bayes. Its return value is not stored; it is
# left as the last expression so the notebook still displays it.
learning.semi_supervised_learning(over_sampled_df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes',
                                  drog_columns=list(excluded_columns), target_column='flagged')

In [None]:
# Write `results` (returned by the Random Forest run on the over-sampled
# data) to CSV; index=False omits the row index.
results.to_csv('../Data/thai_results.csv', index=False)

In [None]:
# save the model to disk
filename = '../Model/finalized_model.sav'
# The context manager closes (and therefore flushes) the file handle even if
# pickling raises; the bare open() used before was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)