## Importing Packages

In [1]:
import os
import sys
import pandas as pd
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
import matplotlib.pyplot as plt
import string
import re
import numpy as np
import itertools
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import ConfusionMatrixDisplay, recall_score,\
    accuracy_score, precision_score, f1_score, classification_report
import warnings
import xgboost

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('fake reviews dataset.csv')
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


## Data Undersatding

In [3]:
df['category'].value_counts()

category
Kindle_Store_5                  4730
Books_5                         4370
Pet_Supplies_5                  4254
Home_and_Kitchen_5              4056
Electronics_5                   3988
Sports_and_Outdoors_5           3946
Tools_and_Home_Improvement_5    3858
Clothing_Shoes_and_Jewelry_5    3848
Toys_and_Games_5                3794
Movies_and_TV_5                 3588
Name: count, dtype: int64

In [4]:
df.describe()

Unnamed: 0,rating
count,40432.0
mean,4.256579
std,1.144354
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [5]:
# Creating 'target' Column for Classification
df['target'] = np.where(df['label'] == 'CG', 1, 0)

In [6]:
df

Unnamed: 0,category,rating,label,text_,target
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",1
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,1
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",1
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,1
...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,0
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,1
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",0
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,1


## Text Preprocessing: Tokenization

In [7]:
# Tokenizing the text data in the 'text_' column of df
def tokenizer(x):
    
    corpus = [word_tokenize(doc) for doc in x]

# Getting common stop words in english that we'll remove during tokenization/text normalization
    stop_words = stopwords.words('english')
    corpus_no_stopwords = []
    for words in corpus:
        docs = [x.lower() for x in words if ((x.isalpha()) & (x not in stop_words))]
        corpus_no_stopwords.append(docs)
    return corpus_no_stopwords

## Lemmantizer

In [8]:
# Defining new function 'lemmatizer'. This function takes two arguments: corpus(list of sentences or text data that we want to lemmatize), and as_string 
def lemmatizer(corpus, as_string=True):
    lem = WordNetLemmatizer()
# Defining an inner function 'pos_tagger'    
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
    lemmatized_corpus = []
    for sentence in corpus:
        pos_tags = pos_tag(sentence)
        lemmatized_sentence = []
        for word, tag in pos_tags:
            pos = pos_tagger(tag)
            if pos is not None:
                lemmatized_word = lem.lemmatize(word, pos)
            else:
                lemmatized_word = lem.lemmatize(word)
            lemmatized_sentence.append(lemmatized_word)
        lemmatized_corpus.append(lemmatized_sentence)
    if as_string:
        lemmatized_corpus  = [' '.join(x) for x in lemmatized_corpus]
    return lemmatized_corpus
# After processing all words in the sentence, the lemmatized_sentence is added to the lemmatized_corpus
        

In [9]:
# Tokenizing a text corpus
corpus_tokenized = tokenizer(df['text_'])

In [10]:
# Lemmatizing a text corpus
lemmatized_corpus = lemmatizer(corpus_tokenized)

## Pre vectorizing

In [11]:
# Adding Preprocessed Text Column to DataFrame
joined_lemm_corpus = [' '.join(x) for x in lemmatized_corpus]
df['text_preproccesed'] = pd.Series(data=lemmatized_corpus)

In [13]:
# Creating Document-Term Matrix Using CountVectorizer
vec = CountVectorizer(min_df = 0.05, max_df = 0.95)
X = vec.fit_transform(lemmatized_corpus)
countvec_df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())

In [14]:
# Creating TF-IDF Matrix Using TfidfVectorizer
tfidf = TfidfVectorizer(min_df = 0.05, max_df = 0.95)
Y = tfidf.fit_transform(lemmatized_corpus)
tfidf_df = pd.DataFrame(Y.toarray(), columns=tfidf.get_feature_names_out())

In [15]:
tfidf_df

Unnamed: 0,also,anyone,best,big,bit,book,buy,ca,character,come,...,try,two,use,want,way,well,work,would,write,year
0,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.365431,0.000000,0.000000,0.0,0.000000
1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.728227
2,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.436464,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.000000,0.350925,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40427,0.000000,0.0,0.112657,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.103073,...,0.115232,0.206830,0.000000,0.099786,0.000000,0.075666,0.000000,0.077660,0.0,0.000000
40428,0.000000,0.0,0.000000,0.091199,0.000000,0.0,0.13296,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.061434,0.067497,0.252213,0.0,0.000000
40429,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.00000,0.000000,0.0,0.000000,...,0.174608,0.000000,0.000000,0.000000,0.000000,0.114655,0.000000,0.117676,0.0,0.000000
40430,0.064548,0.0,0.000000,0.000000,0.145785,0.0,0.11615,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.107335,0.000000,0.055082,0.0,0.000000


In [16]:
X

<40432x89 sparse matrix of type '<class 'numpy.int64'>'
	with 375538 stored elements in Compressed Sparse Row format>

### Splitting Data into Training and Testing Sets

In [17]:
X_train, X_test, y_train,y_test = train_test_split(df['text_'],df['target'], test_size=0.3,random_state=42)

In [18]:
# Preprocessing Text Data in Training and Testing Sets
X_train_preprocessed = lemmatizer(tokenizer(X_train))
X_test_preprocessed = lemmatizer(tokenizer(X_test))

### Random Forest and Logistic Regression with the Count Vectorizer

In [19]:
# Defining Evaluation Metrics Function
def scores(y_test, y_pred):
    print(f"""    Accuracy: {round(accuracy_score(y_test,y_pred),3)}
    
    Recall: {round(recall_score(y_test,y_pred),3)}
    
    F1: {round(f1_score(y_test,y_pred),3)}
    
    Precision: {round(precision_score(y_test,y_pred),3)}""")

In [20]:
# Train and Evaluate Random Forest Model with CountVectorizer
steps = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('rfc',RandomForestClassifier(n_estimators=200,random_state=42))]
pipe_cv_rf = Pipeline(steps)
pipe_cv_rf.fit(X_train_preprocessed, y_train)
pipe_cv_rf.score(X_train_preprocessed,y_train)

0.9825100699597201

In [21]:
# Calculating Mean Cross-Validation Score for Random Forest Model with CountVectorizer
crossval_rf_cv = np.mean(cross_val_score(pipe_cv_rf, X_train_preprocessed,y_train,scoring='accuracy',cv=5))

In [22]:
crossval_rf_cv

0.7627021346850904

In [23]:
# Train and Evaluate Logistic Regression Model with CountVectorizer
steps[1] = ('logreg',LogisticRegression(random_state=42))
pipe_cv_lr = Pipeline(steps)
pipe_cv_lr.fit(X_train_preprocessed, y_train)
pipe_cv_lr.score(X_train_preprocessed,y_train)

0.7288177513956611

In [24]:
# Calculating Mean Cross-Validation Score for Logistic Regression Model with CountVectorizer
crossval_lr_cv = np.mean(cross_val_score(pipe_cv_lr, X_train_preprocessed,y_train,scoring='accuracy',cv=5))

In [25]:
crossval_lr_cv

0.7266269054338063

### Random Forest and Logistic Regression with the Tfidf Vectorizer

In [26]:
# Train and Evaluate Random Forest Model with TfidfVectorizer
steps = [('tfidfvec',TfidfVectorizer(min_df = 0.05, max_df = 0.95)),('rfc',RandomForestClassifier(n_estimators=200,random_state=42))]
pipe_idf_rf = Pipeline(steps)
pipe_idf_rf.fit(X_train_preprocessed, y_train)
pipe_idf_rf.score(X_train_preprocessed,y_train)

0.98187407250371

In [27]:
# Calculating Mean Cross-Validation Score for Random Forest Model with TfidfVectorizer
crossval_rf_idf = np.mean(cross_val_score(pipe_idf_rf, X_train_preprocessed,y_train,scoring='accuracy',cv=5))

In [28]:
crossval_rf_idf

0.7631969029932032

In [29]:
# Train and Evaluate Logistic Regression Model with TfidfVectorizer
steps[1] = ('logreg',LogisticRegression(random_state=42))
pipe_idf_lr = Pipeline(steps)
pipe_idf_lr.fit(X_train_preprocessed, y_train)
pipe_idf_lr.score(X_train_preprocessed,y_train)

0.7300190799236803

In [30]:
# Calculating Mean Cross-Validation Score for Logistic Regression Model with TfidfVectorizer
crossval_lr_idf = np.mean(cross_val_score(pipe_idf_lr, X_train_preprocessed,y_train,scoring='accuracy',cv=5))

In [31]:
crossval_lr_idf

0.7267683043675561

- We run two models with both types of vectorizers, and based on our scores, we decided to stick with the Count Vectorizer.

In [32]:
# Defining Pipelines for Random Forest and Logistic Regression Models with CountVectorizer
steps_rf = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('rfc',RandomForestClassifier(n_estimators=200,random_state=42))]
steps_lr = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('logreg',LogisticRegression(random_state=42,max_iter=10000))]

pipe_rf = Pipeline(steps_rf)
pipe_lr = Pipeline(steps_lr)

### Hyperparameter Grids

In [33]:
# Defining Hyperparameter Grids for Random Forest and Logistic Regression Models
param_rf = {'rfc__n_estimators':[100,150,200],
                       'rfc__max_depth':[2,3,4,5],
                       'rfc__min_samples_leaf':[1,2,3]
                      }

param_lr = {'logreg__penalty':['l1', 'l2', 'elasticnet', None],
                 'logreg__C':[0.001,0.01,0.1,1,10,100],
                 'logreg__solver':['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
                }

### Hyperparameter Tuning with GridSearchCV

In [34]:
gs_rf = GridSearchCV(estimator=pipe_rf, param_grid=param_rf,scoring='accuracy',cv=5,error_score=0)
gs_lr = GridSearchCV(estimator=pipe_lr, param_grid=param_lr,scoring='accuracy',cv=5,error_score=0)
gs_rf.fit(X_train_preprocessed,y_train)
gs_lr.fit(X_train_preprocessed,y_train)

In [35]:
# Best Hyperparameters for Random Forest Model
gs_rf.best_params_

{'rfc__max_depth': 5, 'rfc__min_samples_leaf': 2, 'rfc__n_estimators': 100}

In [36]:
# Best Hyperparameters for Logistic Regression Model
gs_lr.best_params_

{'logreg__C': 1, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}

In [37]:
# Best Cross-Validation Score for Random Forest Model
gs_rf.best_score_

0.7035902146170281

In [38]:
# Best Cross-Validation Score for Logistic Regression Model
gs_lr.best_score_

0.7268742427732242

### AdaBoost Classifier with CountVectorizer

In [39]:
steps = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('ada',AdaBoostClassifier(random_state=42))]
pipe_ada_booster = Pipeline(steps).fit(X_train_preprocessed,y_train)
pipe_ada_booster.score(X_train_preprocessed,y_train)

0.7291357501236662

In [40]:
crossval_ada_booster = np.mean(cross_val_score(pipe_ada_booster, X_train_preprocessed,y_train,scoring='accuracy',cv=5))
crossval_ada_booster

0.7228819653159707

### GradientBoosting Classifier 

In [41]:
gbc = ('gbc',GradientBoostingClassifier(random_state=42))
steps[1] = gbc
pipe_gbc = Pipeline(steps).fit(X_train_preprocessed,y_train)
pipe_gbc.score(X_train_preprocessed,y_train)

0.7461310154759381

In [42]:
crossval_gradient_booster = np.mean(cross_val_score(pipe_gbc, X_train_preprocessed,y_train,scoring='accuracy',cv=5))
crossval_gradient_booster

0.7361320310125132

### XGBClassifier with CountVectorizer

In [43]:
steps = [('countvec',CountVectorizer(min_df = 0.05, max_df = 0.95)),('xgb',xgboost.XGBClassifier(random_state=42, objective='binary:logistic'))]
xgb_pipe = Pipeline(steps).fit(X_train_preprocessed,y_train)
xgb_pipe.score(X_train_preprocessed, y_train)

0.829870680517278

In [44]:
crossval_xg_booster = np.mean(cross_val_score(xgb_pipe, X_train_preprocessed,y_train,scoring='accuracy',cv=5))
crossval_xg_booster

0.7729489040068961

### Hyperparameter Grids for Boosting Algorithms

In [45]:
param_grid_ada = {'ada__n_estimators':[50,100,150,200],
                  'ada__learning_rate':[0.01,0.1,0.2,0.5,1]
                 }

param_grid_gradient = {'gbc__n_estimators':[50,100,150,200],
                       'gbc__learning_rate':[0.01,0.1,0.2,0.5,1],
                       'gbc__max_depth':[1,2,3,4]
                      }

param_grid_xgb = {
    'xgb__n_estimators': [50, 100, 200],
    'xgb__learning_rate': [0.01, 0.05, 0.1],
    'xgb__max_depth': [3, 5, 7, 9],
    'xgb__subsample': [0.7, 0.8, 0.9, 1.0],
    'xgb__colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'xgb__gamma': [0, 0.1, 0.2],
    'xgb__min_child_weight': [1, 2, 3],
    'xgb__reg_alpha': [0, 0.1, 0.5],
    'xgb__reg_lambda': [1, 1.5, 2]
}

In [None]:
gs_ada = GridSearchCV(estimator=pipe_ada_booster, param_grid=param_grid_ada,scoring='accuracy',cv=5,error_score=0)
gs_gradient = GridSearchCV(estimator=pipe_gbc, param_grid=param_grid_gradient,scoring='accuracy',cv=5,error_score=0)
gs_xgb = GridSearchCV(estimator=xgb_pipe, param_grid=param_grid_xgb,scoring='accuracy',cv=5,error_score=0)

gs_ada.fit(X_train_preprocessed,y_train)
gs_gradient.fit(X_train_preprocessed,y_train)
gs_xgb.fit(X_train_preprocessed,y_train)

In [None]:
# Best Hyperparameters for AdaBoost Classifier
gs_ada.best_params_

In [None]:
# Best Cross-Validation Score for AdaBoost Classifier
gs_ada.best_score_

In [None]:
# Best Hyperparameters for GradientBoosting Classifier
gs_gradient.best_params_

In [None]:
# Best Cross-Validation Score for GradientBoosting Classifier
gs_gradient.best_score_

In [None]:
# Best Hyperparameters for XGBClassifier 
gs_xgb.best_params_

In [None]:
# Best Cross-Validation Score for XGBClassifier 
gs_xgb.best_score_