In [None]:
import sys
sys.path.append('..')

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import re
import pickle
from pythainlp.tokenize import word_tokenize
import Utils.dataframe as dataframe_helper


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Cleaning data

In [4]:
def data_cleaning(df):
    # Removing \n from date field
    for i in range(len(df['date'])):
        if df['date'][i][0] == '\n':
            df['date'][i] = df['date'][i][1:]

    # Pre-processing Text Reviews
    # Remove Symbols
    df['reviewContent'] = df['reviewContent'].apply(
        lambda x: ''.join(char for char in str(x) if char not in '!"#$&\'()*.:;<=>?@[\\]^_`{|}~'))
    
    df['reviewContent'] = df['reviewContent'].apply(
        lambda x: x.replace('\n', ' '))

    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", flags=re.UNICODE)

    df['reviewContent'] = df['reviewContent'].apply(
        lambda x: emoji_pattern.sub(r'', x))

    return df

## Make train dataset

In [11]:
df1 = pd.read_csv('../Data/raw_thai_df.csv', index_col=0)

In [12]:
df2 = pd.read_csv('../Data/shopee_follower.csv')

In [13]:
df2['date'] = df2['date'].astype('datetime64[ns]')
df2['date'] = df2['date'].dt.strftime(r'%m/%d/%Y')

In [14]:
df1.drop(columns=['flagged', 'name', 'location', 'yelpJoinDate', 'usefulCount', 'coolCount', 'funnyCount', 'complimentCount', 'tipCount', 'fanCount'], inplace=True)

In [15]:
col_list = list(df1.columns )

In [16]:
df2 = df2[col_list]

In [17]:
df = data_cleaning(df2)
df = dataframe_helper.feature_engineering_thai(df)
df.to_csv('../Data/thai_shopee_df.csv', index=False)

  df2['reviewerID'] = pd.Series([])
  df2['maximumContentSimilarity'] = pd.Series([])


In [18]:
df = pd.read_csv('../Data/thai_shopee_df.csv')

In [19]:
test_df = df.drop(['reviewID', 'reviewerID', 'restaurantID', 'date', 'reviewContent', 'restaurantRating'], axis=1)

In [27]:
for col in test_df.columns:
    test_df[col] = test_df[col].apply(lambda x: float(str(x).split('k')[0]) * 1000 if str(x)[-1] == 'k' else x)

In [28]:
test_df

Unnamed: 0,rating,reviewUsefulCount,friendCount,reviewCount,firstCount,scaledReviewPerDay,reviewsLength,reviewsDeviation,maximumContentSimilarity
0,5,6.0,7.0,1,0,0.25,93,0.025,0.0
1,5,18.0,60.0,1,0,0.25,37,0.025,0.0
2,5,16.0,36.0,1,1,0.25,83,0.025,0.0
3,5,12.0,11.0,1,0,0.25,74,0.025,0.0
4,5,2.0,10.0,1,0,0.25,24,0.025,0.0
...,...,...,...,...,...,...,...,...,...
1129,5,10.0,2,1,0,0.25,89,0.025,0.0
1130,5,3.0,26,1,0,0.25,25,0.025,0.0
1131,5,11.0,102,1,0,0.25,33,0.025,0.0
1132,5,12.0,186,1,0,0.25,56,0.025,0.0


### Train DF Columns
- rating = comment rating **<span style="color:CornflowerBlue">(raw review file)</span>**
- reviewUsefulCount = number of user's review useful count raw **<span style="color:CornflowerBlue">(count form raw review file)</span>** 
- friendCount = number of user's friend raw **<span style="color:CornflowerBlue">(join form user file)</span>** 
- reviewCount = number of user's review count raw **<span style="color:CornflowerBlue">(join form user file)</span>** 
- firstCount = number of user's first comment raw **<span style="color:CornflowerBlue">(count form raw review file)</span>** 
- scaledReviewPerDay = scaled review per day raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>** 
- reviewsLength = review length raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>** 
- reviewsDeviation = store rating - user rating raw **<span style="color:CornflowerBlue">(calculate form businees and reviews file)</span>**
- maximumContentSimilarity = maximun content similarity raw **<span style="color:CornflowerBlue">(calculate from raw review file)</span>** 

In [35]:
test_df

Unnamed: 0,rating,reviewUsefulCount,friendCount,reviewCount,firstCount,scaledReviewPerDay,reviewsLength,reviewsDeviation,maximumContentSimilarity
0,5,6.0,7.0,1,0,0.25,93,0.025,0.0
1,5,18.0,60.0,1,0,0.25,37,0.025,0.0
2,5,16.0,36.0,1,1,0.25,83,0.025,0.0
3,5,12.0,11.0,1,0,0.25,74,0.025,0.0
4,5,2.0,10.0,1,0,0.25,24,0.025,0.0
...,...,...,...,...,...,...,...,...,...
1129,5,10.0,2,1,0,0.25,89,0.025,0.0
1130,5,3.0,26,1,0,0.25,25,0.025,0.0
1131,5,11.0,102,1,0,0.25,33,0.025,0.0
1132,5,12.0,186,1,0,0.25,56,0.025,0.0


## Model

In [29]:
filename = '../Model/finalized_model.sav'

In [30]:
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(test_df)

In [37]:
df['flag'] = result

In [39]:
df.to_csv('../Data/thai_shopee_result_df.csv', index=False)