In [None]:
import sys

# Make the parent directory importable so the project-local `Utils` package
# resolves. Guarded so that re-running this cell does not keep appending
# duplicate entries to sys.path.
if '..' not in sys.path:
    sys.path.append('..')

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import Utils.dataframe as dataframe_helper
import Utils.learning as learning
import mlflow

In [None]:
# Name handed to mlflow.set_experiment and to the semi-supervised training calls.
experiment_name = 'Behavior & Context (Eng) Model'

In [None]:
# Point MLflow at the ../mlruns directory and select this notebook's experiment.
mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment(experiment_name)

# Do not truncate columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

In [None]:
# Columns removed from df (and from its resampled variants) to build the
# training frames.
drop_columns = [
    'reviewID',
    'reviewerID',
    'restaurantID',
    'date',
    'name',
    'location',
    'yelpJoinDate',
    'restaurantRating',
    'usefulCount',
    'coolCount',
    'funnyCount',
    'complimentCount',
    'tipCount',
    'fanCount',
]

## Build the training datasets

In [None]:
# Load the data, pass it through the cleaning helper, then list its columns.
raw = dataframe_helper.data_cleaning(dataframe_helper.load_data())
raw.columns

In [None]:
# Preview only the first rows: display.max_columns is set to None in this
# notebook, so rendering the whole frame produces a very wide output.
raw.head()

In [None]:
# Build the modelling frame from the cleaned data already held in `raw`
# instead of loading and cleaning the same data a second time.
# A copy is passed so the `raw` frame inspected earlier stays untouched in
# case feature_engineering modifies its input.
# NOTE(review): assumes load_data/data_cleaning are deterministic, so reusing
# `raw` yields the same frame as a fresh load — confirm.
df = dataframe_helper.feature_engineering(raw.copy())


In [None]:
# Under-sample a copy of df with respect to 'flagged' (project helper), then
# drop the columns listed in drop_columns.
under_sampled_df = dataframe_helper.under_sampling(target='flagged', df=df.copy())
train_under_sampled_df = under_sampled_df.drop(columns=drop_columns)

In [None]:
# Over-sample a copy of df with respect to 'flagged' (project helper), then
# drop the columns listed in drop_columns.
over_sampled_df = dataframe_helper.over_sampling(target='flagged', df=df.copy())
train_over_sampled_df = over_sampled_df.drop(columns=drop_columns)

In [None]:
# Echo the list of columns that are dropped from the training frames.
drop_columns

In [None]:
# Training frame without resampling: df minus the columns in drop_columns.
train_df = df.drop(columns=drop_columns)

In [None]:
# Columns of the feature-engineered frame, before drop_columns are removed.
df.columns

In [None]:
# Columns that remain in the training frame after drop_columns were removed.
train_df.columns

## Model

In [None]:
# Estimators passed to the training cells below.
# max_features='sqrt' replaces 'auto': for RandomForestClassifier 'auto' meant
# sqrt(n_features); it was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it is rejected by parameter validation. 'sqrt' gives the same
# behaviour on old and new versions.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=14,
    max_features='sqrt',
    random_state=42,
)
lr = LogisticRegression(solver='liblinear', penalty='l2', C=2.0)
nb = GaussianNB()

In [None]:
# Supervised Random Forest run on a copy of train_df (no resampling).
learning.supervised_learning(
    train_df.copy(),
    model=rf,
    algorithm='Random Forest',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Naive Bayes run on a copy of train_df (no resampling).
learning.supervised_learning(
    train_df.copy(),
    model=nb,
    algorithm='Naive Bayes',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Logistic Regression run on a copy of train_df (no resampling).
learning.supervised_learning(
    train_df.copy(),
    model=lr,
    algorithm='Logistic Regression',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Semi-supervised Random Forest run on a copy of train_df, with logging enabled.
learning.semi_supervised_learning(
    train_df.copy(),
    experiment_name,
    df_type='normal',
    model=rf,
    threshold=0.7,
    iterations=15,
    algorithm='Random Forest',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Naive Bayes run on a copy of train_df, with logging enabled.
learning.semi_supervised_learning(
    train_df.copy(),
    experiment_name,
    df_type='normal',
    model=nb,
    threshold=0.7,
    iterations=15,
    algorithm='Naive Bayes',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Logistic Regression run on a copy of train_df, with logging enabled.
learning.semi_supervised_learning(
    train_df.copy(),
    experiment_name,
    df_type='normal',
    model=lr,
    threshold=0.7,
    iterations=15,
    algorithm='Logistic Regression',
    target_column='flagged',
    log=True,
)

In [None]:
# Supervised Random Forest run on a copy of the under-sampled training frame.
learning.supervised_learning(
    train_under_sampled_df.copy(),
    model=rf,
    algorithm='Random Forest',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Naive Bayes run on a copy of the under-sampled training frame.
learning.supervised_learning(
    train_under_sampled_df.copy(),
    model=nb,
    algorithm='Naive Bayes',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Logistic Regression run on a copy of the under-sampled training frame.
learning.supervised_learning(
    train_under_sampled_df.copy(),
    model=lr,
    algorithm='Logistic Regression',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Semi-supervised Random Forest run on a copy of the under-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_under_sampled_df.copy(),
    experiment_name,
    df_type='under_sampled',
    model=rf,
    threshold=0.7,
    iterations=15,
    algorithm='Random Forest',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Naive Bayes run on a copy of the under-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_under_sampled_df.copy(),
    experiment_name,
    df_type='under_sampled',
    model=nb,
    threshold=0.7,
    iterations=15,
    algorithm='Naive Bayes',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Logistic Regression run on a copy of the under-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_under_sampled_df.copy(),
    experiment_name,
    df_type='under_sampled',
    model=lr,
    threshold=0.7,
    iterations=15,
    algorithm='Logistic Regression',
    target_column='flagged',
    log=True,
)

In [None]:
# Supervised Random Forest run on a copy of the over-sampled training frame.
learning.supervised_learning(
    train_over_sampled_df.copy(),
    model=rf,
    algorithm='Random Forest',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Naive Bayes run on a copy of the over-sampled training frame.
learning.supervised_learning(
    train_over_sampled_df.copy(),
    model=nb,
    algorithm='Naive Bayes',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Supervised Logistic Regression run on a copy of the over-sampled training frame.
learning.supervised_learning(
    train_over_sampled_df.copy(),
    model=lr,
    algorithm='Logistic Regression',
    drop_column='reviewContent',
    target_column='flagged',
)

In [None]:
# Semi-supervised Random Forest run on a copy of the over-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_over_sampled_df.copy(),
    experiment_name,
    df_type='over_sampled',
    model=rf,
    threshold=0.7,
    iterations=15,
    algorithm='Random Forest',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Naive Bayes run on a copy of the over-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_over_sampled_df.copy(),
    experiment_name,
    df_type='over_sampled',
    model=nb,
    threshold=0.7,
    iterations=15,
    algorithm='Naive Bayes',
    target_column='flagged',
    log=True,
)

In [None]:
# Semi-supervised Logistic Regression run on a copy of the over-sampled frame, with logging enabled.
learning.semi_supervised_learning(
    train_over_sampled_df.copy(),
    experiment_name,
    df_type='over_sampled',
    model=lr,
    threshold=0.7,
    iterations=15,
    algorithm='Logistic Regression',
    target_column='flagged',
    log=True,
)