In [1]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



In [2]:
#!pip install gensim==3.8.3
#!pip install gensim
#!pip install daal4py
#!pip install numpy --upgrade

# ** Loading the Dataset ** #

In [3]:
# Integer codes assigned to the raw label strings.
encoded_label_dict = {"CG": 0, "OR": 1}


def encode_label(x):
    """Return the integer code for label ``x``.

    Labels that are not keys of ``encoded_label_dict`` map to -1.
    """
    if x in encoded_label_dict:
        return encoded_label_dict[x]
    return -1

# Load the CSV, discard the 'Unnamed: 0' column, and remove every row that
# contains a missing value. The trailing copy() makes `df` an independent frame
# so that later column assignments do not act on a derived view.
df = (
    pd.read_csv("Preprocessed Fake Reviews Detection Dataset.csv")
    .drop(columns=['Unnamed: 0'])
    .dropna()
    .copy()
)


In [4]:
# Add two derived columns in one step:
#   length - len() of each entry in the 'text_' column
#   target - integer code of each 'label' value, as returned by encode_label
df = df.assign(
    length=df['text_'].apply(len),
    target=df["label"].apply(encode_label),
)


In [5]:
# Hold out 20% of the rows as the test set. The split is shuffled, not
# stratified, and seeded so it is repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=35,
    shuffle=True,
    stratify=None,
)


In [6]:
# Name of the column holding the text that is vectorized below.
COMMENT = 'text_'

# Replace missing text with a placeholder. The result is assigned back as a new
# frame instead of calling fillna(inplace=True) on a selected column: that
# chained in-place form operates on a temporary object, raises pandas
# chained-assignment warnings, and under Copy-on-Write never reaches the frame.
train = train.assign(**{COMMENT: train[COMMENT].fillna("unknown")})
test = test.assign(**{COMMENT: test[COMMENT].fillna("unknown")})

# ** TFIDF - Feature Extraction ** #

In [7]:
import re, string

# Characters that are split off as standalone tokens: ASCII punctuation plus a
# set of typographic symbols. string.punctuation is passed through re.escape
# because it contains the sequence `\]`; unescaped inside a character class that
# is read as an escaped `]`, which silently drops the backslash from the class.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split ``s`` into tokens on whitespace, with each punctuation character
    matched by ``re_tok`` emitted as its own token.

    Returns a list of strings (empty for an empty or whitespace-only input).
    """
    # Surround every punctuation character with spaces, then split on whitespace.
    return re_tok.sub(r' \1 ', s).split()


n = train.shape[0]  # number of rows in the training split

# TF-IDF over unigrams and bigrams produced by the custom tokenizer.
# - token_pattern=None: the default pattern is not used when `tokenizer` is
#   supplied, and leaving it set makes scikit-learn warn about the unused value.
# - use_idf / smooth_idf / sublinear_tf are real booleans: integer stand-ins
#   are rejected by scikit-learn's parameter validation in recent releases.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    token_pattern=None,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Fit the vocabulary/IDF weights on the training text only, then reuse them
# to transform the held-out text.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])



# ** daal4py Interface for Random Forest Classifier ** #

In [9]:
from sklearn.pipeline import Pipeline

from daal4py.sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Single-step pipeline around the daal4py random forest. It is fitted directly
# on the TF-IDF matrices built above, so no text-processing steps are included.
# random_state is fixed so repeated runs give the same forest and metrics.
pipeline = Pipeline([
    ('classifier', RandomForestClassifier(random_state=35))
])

import time

# Time fitting and prediction together.
start_time = time.time()
pipeline.fit(trn_term_doc, train["target"])
# NOTE: despite the name, these are the random forest's predictions; the
# variable name is kept so any later cell that refers to it keeps working.
xgb_pred = pipeline.predict(test_term_doc)
patched_time = time.time() - start_time
print("Time to calculate Using Random Forest Classifier  {:4.1f} seconds".format(patched_time))

# Evaluate on the held-out split. Multi-line reports start on their own line so
# the column headers stay aligned with the values beneath them.
rf_accuracy = accuracy_score(test["target"], xgb_pred)
print('Classification Report:', classification_report(test["target"], xgb_pred), sep='\n')
print('Confusion Matrix:', confusion_matrix(test["target"], xgb_pred), sep='\n')
print('Accuracy Score:', rf_accuracy)
print('Model Prediction Accuracy:', str(np.round(rf_accuracy * 100, 2)) + '%')

  warn(


Time to calculate Using Random Forest Classifier  70.9[0m seconds
Classification Report:               precision    recall  f1-score   support

           0       0.90      0.88      0.89      3998
           1       0.88      0.90      0.89      4089

    accuracy                           0.89      8087
   macro avg       0.89      0.89      0.89      8087
weighted avg       0.89      0.89      0.89      8087

Confusion Matrix: [[3507  491]
 [ 391 3698]]
Accuracy Score: 0.8909360702361815
Model Prediction Accuracy: 89.09%
