In [None]:
import sys
# Add the parent directory to the import path; the `Utils` package imported
# below is presumably located there (confirm against the repository layout).
sys.path.append('..')

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import Utils.dataframe as dataframe_helper
import Utils.learning as learning


In [None]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

## Feature Engineering

In [None]:
def feature_engineering_one_hot(df:pd.DataFrame, N:int = None):
    """Add presence/absence features for the top fake and non-fake words.

    The word lists are read from ``../Data/top_word/top_y_word.txt`` (fake)
    and ``../Data/top_word/top_n_word.txt`` (non fake), relative to the
    current working directory, and parsed with
    ``dataframe_helper.string_to_list``. For every review a bit string is
    built with one character per listed word ('1' if the word occurs in the
    review, '0' otherwise). The bit strings are stored in the
    ``fakeWordsOneHot`` / ``nonFakeWordsOneHot`` columns and then handed to
    ``dataframe_helper.onehot``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``reviewContent`` column. The two bit-string columns
        are added to this frame in place before it is passed to the helper.
    N : int, optional
        Keep only the first ``N`` words of each list. ``None`` (default)
        keeps every word.

    Returns
    -------
    pd.DataFrame
        The result of the second ``dataframe_helper.onehot`` call
        (presumably the frame with each bit string expanded into one column
        per word -- confirm against Utils.dataframe).
    """
    with open("../Data/top_word/top_y_word.txt", "r") as f:
        fake_words = dataframe_helper.string_to_list(f.read())

    with open("../Data/top_word/top_n_word.txt", "r") as f:
        non_fake_words = dataframe_helper.string_to_list(f.read())

    # Slicing with N=None keeps the whole list.
    fake_words = fake_words[:N]
    non_fake_words = non_fake_words[:N]

    # KL one hot encoding
    fakeWordOneHot = []
    nonFakeWordOneHot = []

    for content in df['reviewContent']:
        # Missing / non-string reviews (None, NaN) contain no words.
        text = content if isinstance(content, str) else ''
        # Tokens are 3-10 letters, lowercase after the first letter; matching
        # is case-sensitive. A set gives constant-time membership tests
        # instead of scanning the token list once per vocabulary word.
        words = set(re.findall(r'\b[A-Za-z][a-z]{2,9}\b', text))

        fakeWordOneHot.append(
            ''.join('1' if word in words else '0' for word in fake_words)
        )
        nonFakeWordOneHot.append(
            ''.join('1' if word in words else '0' for word in non_fake_words)
        )

    df['fakeWordsOneHot'] = fakeWordOneHot
    df['nonFakeWordsOneHot'] = nonFakeWordOneHot

    df = dataframe_helper.onehot(df, 'fakeWordsOneHot', fake_words, 'fake')
    df = dataframe_helper.onehot(df, 'nonFakeWordsOneHot', non_fake_words, 'non fake')

    return df
    

In [None]:
def feature_engineering_word_count(df:pd.DataFrame, N:int = None):
    """Add per-review counts of top fake and top non-fake words.

    The word lists are read from ``../Data/top_word/top_y_word.txt`` (fake)
    and ``../Data/top_word/top_n_word.txt`` (non fake), relative to the
    current working directory, and parsed with
    ``dataframe_helper.string_to_list``.

    Each token of a review is checked against both lists independently, so
    a word that appears in both lists contributes to both counts. This
    matches ``feature_engineering_one_hot``, which also scores the two lists
    independently.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``reviewContent`` column. The columns
        ``fakeWordsCount`` and ``nonFakeWordsCount`` are added in place.
    N : int, optional
        Keep only the first ``N`` words of each list. ``None`` (default)
        keeps every word.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the two count columns added.
    """
    with open("../Data/top_word/top_y_word.txt", "r") as f:
        fake_words = dataframe_helper.string_to_list(f.read())

    with open("../Data/top_word/top_n_word.txt", "r") as f:
        non_fake_words = dataframe_helper.string_to_list(f.read())

    # Slicing with N=None keeps the whole list. Sets give constant-time
    # lookups (assumes the helper yields hashable items such as strings).
    fake_vocab = set(fake_words[:N])
    non_fake_vocab = set(non_fake_words[:N])

    # KL word count
    fakeWordsCount = []
    nonFakeWordsCount = []

    for content in df['reviewContent']:
        # Missing / non-string reviews (None, NaN) contain no words.
        text = content if isinstance(content, str) else ''
        # Tokens are 3-10 letters, lowercase after the first letter; every
        # occurrence counts, so repeated words are counted repeatedly.
        words = re.findall(r'\b[A-Za-z][a-z]{2,9}\b', text)
        fakeWordsCount.append(sum(1 for word in words if word in fake_vocab))
        nonFakeWordsCount.append(sum(1 for word in words if word in non_fake_vocab))

    df['fakeWordsCount'] = fakeWordsCount
    df['nonFakeWordsCount'] = nonFakeWordsCount

    return df
    

## Build the training datasets

In [None]:
# Load the raw data, clean it, then attach the one-hot word features
# (N is left at its default, so every listed word is used).
df = feature_engineering_one_hot(
    dataframe_helper.data_cleaning(
        dataframe_helper.load_data()
    )
)

In [None]:
# Preview the first rows of the feature-engineered frame.
df.head()

In [None]:
# Under-sample on the `flagged` label via the project helper, then renumber
# the rows from 0 (the old index is discarded).
# NOTE(review): 'Y' is passed as the big class and 'N' as the small one --
# confirm this matches the actual label distribution.
under_sampled_df = dataframe_helper.under_sampling(
    df=df, target='flagged', big_sample='Y', small_sample='N'
).reset_index(drop=True)
# Optional on-disk cache of the sampled frame:
# under_sampled_df.to_csv('../Data/under_sampled_KL_df.csv', index=False)
# under_sampled_df = pd.read_csv('../Data/under_sampled_KL_df.csv')

In [None]:
# Over-sample on the `flagged` label via the project helper, then renumber
# the rows from 0 (the old index is discarded).
# NOTE(review): 'Y' is passed as the big class and 'N' as the small one --
# confirm this matches the actual label distribution.
over_sampled_df = dataframe_helper.over_sampling(
    df=df, target='flagged', big_sample='Y', small_sample='N'
).reset_index(drop=True)
# Optional on-disk cache of the sampled frame:
# over_sampled_df.to_csv('../Data/over_sampled_KL_df.csv', index=False)
# over_sampled_df = pd.read_csv('../Data/over_sampled_KL_df.csv')

## Model

In [None]:
# Random forest base learner. max_features='sqrt' is what the former 'auto'
# alias meant for classifiers; 'auto' was deprecated in scikit-learn 1.1 and
# removed in 1.3, where passing it raises a parameter-validation error.
rf = RandomForestClassifier(
    n_estimators=500,
    criterion='entropy',
    max_depth=14,
    max_features='sqrt',
    random_state=42,  # fixed seed so repeated runs build the same forest
)
# Gaussian naive Bayes with default settings, kept as an alternative base model.
nb = GaussianNB()

In [None]:
# model, results, feature = learning.semi_supervised_learning(df, model=rf, threshold=0.7, iterations=15, algorithm='Random Forest')
# model, results, feature = learning.semi_supervised_learning(df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes')

In [None]:
# model, results, feature = learning.semi_supervised_learning(under_sampled_df, model=rf, threshold=0.7, iterations=15, algorithm='Random Forest')
# model, results, feature = learning.semi_supervised_learning(under_sampled_df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes')

In [None]:
# Semi-supervised training on the over-sampled frame with the random forest.
# `feature` is reused below as the index for the model's feature importances.
# The exact meaning of threshold/iterations is defined in Utils.learning.
model, results, feature = learning.semi_supervised_learning(
    over_sampled_df,
    model=rf,
    threshold=0.7,
    iterations=15,
    algorithm='Random Forest',
)
# Naive Bayes alternative:
# model, results, feature = learning.semi_supervised_learning(over_sampled_df, model=nb, threshold=0.7, iterations=15, algorithm='Naive Bayes')

In [None]:
# Pair each importance score with its feature name, then chart the 100
# largest as a horizontal bar plot.
feat_importances = pd.Series(data=model.feature_importances_, index=feature)
feat_importances.nlargest(n=100).plot.barh(figsize=(25, 25))

In [None]:
# sort = model.feature_importances_.argsort()
# plt.barh(boston.feature_names[sort], model.feature_importances_[sort])
# plt.xlabel("Feature Importance")