In [2]:
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report


In [3]:
# Install catboost
!pip3 install catboost

from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB



In [4]:
pd.set_option('max_colwidth', None)

In [5]:
df=pd.read_csv('fake reviews dataset.csv')

In [6]:
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been


In [7]:
df['label'].value_counts()

label
CG    20216
OR    20216
Name: count, dtype: int64

In [8]:
df['text_'] = df['text_'].str.replace('\n', ' ')
df['target'] = np.where(df['label']=='CG', 1, 0)


In [9]:
df['target'].value_counts()

target
1    20216
0    20216
Name: count, dtype: int64

In [10]:
def punctuation_to_features(df, column):
    """Identify punctuation within a column and convert to a text representation.
    
    Args:
        df (object): Pandas dataframe.
        column (string): Name of column containing text. 
        
    Returns:
        df[column]: Original column with punctuation converted to text, 
                    i.e. "Wow! > "Wow exclamation"
    
    """
    
    df[column] = df[column].replace('!', ' exclamation ')
    df[column] = df[column].replace('?', ' question ')
    df[column] = df[column].replace('\'', ' quotation ')
    df[column] = df[column].replace('\"', ' quotation ')
    
    return df[column]

In [11]:
df['text_'] = punctuation_to_features(df, 'text_')
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
def tokenize(column):
   
    
    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()] 

In [13]:

df['tokenized'] = df.apply(lambda x: tokenize(x['text_']), axis=1)
df.head(10)

Unnamed: 0,category,rating,label,text_,target,tokenized
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]"
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been,1,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]"
5,Home_and_Kitchen_5,3.0,CG,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1,"[I, WANTED, DIFFERENT, FLAVORS, BUT, THEY, ARE, NOT]"
6,Home_and_Kitchen_5,5.0,CG,They are the perfect touch for me and the only thing I wish they had a little more space.,1,"[They, are, the, perfect, touch, for, me, and, the, only, thing, I, wish, they, had, a, little, more, space]"
7,Home_and_Kitchen_5,3.0,CG,These done fit well and look great. I love the smoothness of the edges and the extra,1,"[These, done, fit, well, and, look, great, I, love, the, smoothness, of, the, edges, and, the, extra]"
8,Home_and_Kitchen_5,5.0,CG,"Great big numbers & easy to read, the only thing I didn't like is the size of the",1,"[Great, big, numbers, easy, to, read, the, only, thing, I, did, like, is, the, size, of, the]"
9,Home_and_Kitchen_5,5.0,CG,My son loves this comforter and it is very well made. We also have a baby,1,"[My, son, loves, this, comforter, and, it, is, very, well, made, We, also, have, a, baby]"


In [14]:
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vaibhav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
def remove_stopwords(tokenized_column):
    
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [16]:

df['stopwords_removed'] = df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
df.head(10)

Unnamed: 0,category,rating,label,text_,target,tokenized,stopwords_removed
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been,1,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]","[Very, nice, set, Good, quality, We, set, two, months]"
5,Home_and_Kitchen_5,3.0,CG,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1,"[I, WANTED, DIFFERENT, FLAVORS, BUT, THEY, ARE, NOT]","[I, WANTED, DIFFERENT, FLAVORS, BUT, THEY, ARE, NOT]"
6,Home_and_Kitchen_5,5.0,CG,They are the perfect touch for me and the only thing I wish they had a little more space.,1,"[They, are, the, perfect, touch, for, me, and, the, only, thing, I, wish, they, had, a, little, more, space]","[They, perfect, touch, thing, I, wish, little, space]"
7,Home_and_Kitchen_5,3.0,CG,These done fit well and look great. I love the smoothness of the edges and the extra,1,"[These, done, fit, well, and, look, great, I, love, the, smoothness, of, the, edges, and, the, extra]","[These, done, fit, well, look, great, I, love, smoothness, edges, extra]"
8,Home_and_Kitchen_5,5.0,CG,"Great big numbers & easy to read, the only thing I didn't like is the size of the",1,"[Great, big, numbers, easy, to, read, the, only, thing, I, did, like, is, the, size, of, the]","[Great, big, numbers, easy, read, thing, I, like, size]"
9,Home_and_Kitchen_5,5.0,CG,My son loves this comforter and it is very well made. We also have a baby,1,"[My, son, loves, this, comforter, and, it, is, very, well, made, We, also, have, a, baby]","[My, son, loves, comforter, well, made, We, also, baby]"


In [17]:
def apply_stemming(tokenized_column):
    
    
    stemmer = PorterStemmer() 
    return [stemmer.stem(word).lower() for word in tokenized_column]
     



In [18]:
df['porter_stemmed'] = df.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
df.head(10)

Unnamed: 0,category,rating,label,text_,target,tokenized,stopwords_removed,porter_stemmed
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]","[love, well, made, sturdi, comfort, i, love, veri, pretti]"
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]","[love, great, upgrad, origin, i, mine, coupl, year]"
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]"
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]","[miss, inform, use, great, product, price, i]"
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the set for two months now and have not been,1,"[Very, nice, set, Good, quality, We, have, had, the, set, for, two, months, now, and, have, not, been]","[Very, nice, set, Good, quality, We, set, two, months]","[veri, nice, set, good, qualiti, we, set, two, month]"
5,Home_and_Kitchen_5,3.0,CG,I WANTED DIFFERENT FLAVORS BUT THEY ARE NOT.,1,"[I, WANTED, DIFFERENT, FLAVORS, BUT, THEY, ARE, NOT]","[I, WANTED, DIFFERENT, FLAVORS, BUT, THEY, ARE, NOT]","[i, want, differ, flavor, but, they, are, not]"
6,Home_and_Kitchen_5,5.0,CG,They are the perfect touch for me and the only thing I wish they had a little more space.,1,"[They, are, the, perfect, touch, for, me, and, the, only, thing, I, wish, they, had, a, little, more, space]","[They, perfect, touch, thing, I, wish, little, space]","[they, perfect, touch, thing, i, wish, littl, space]"
7,Home_and_Kitchen_5,3.0,CG,These done fit well and look great. I love the smoothness of the edges and the extra,1,"[These, done, fit, well, and, look, great, I, love, the, smoothness, of, the, edges, and, the, extra]","[These, done, fit, well, look, great, I, love, smoothness, edges, extra]","[these, done, fit, well, look, great, i, love, smooth, edg, extra]"
8,Home_and_Kitchen_5,5.0,CG,"Great big numbers & easy to read, the only thing I didn't like is the size of the",1,"[Great, big, numbers, easy, to, read, the, only, thing, I, did, like, is, the, size, of, the]","[Great, big, numbers, easy, read, thing, I, like, size]","[great, big, number, easi, read, thing, i, like, size]"
9,Home_and_Kitchen_5,5.0,CG,My son loves this comforter and it is very well made. We also have a baby,1,"[My, son, loves, this, comforter, and, it, is, very, well, made, We, also, have, a, baby]","[My, son, loves, comforter, well, made, We, also, baby]","[my, son, love, comfort, well, made, we, also, babi]"


In [19]:

def rejoin_words(tokenized_column):
    return ( " ".join(tokenized_column))
  

In [20]:
df['all_text'] = df.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)

In [21]:

df[['all_text']].head(10)

Unnamed: 0,all_text
0,love well made sturdi comfort i love veri pretti
1,love great upgrad origin i mine coupl year
2,thi pillow save back i love look feel pillow
3,miss inform use great product price i
4,veri nice set good qualiti we set two month
5,i want differ flavor but they are not
6,they perfect touch thing i wish littl space
7,these done fit well look great i love smooth edg extra
8,great big number easi read thing i like size
9,my son love comfort well made we also babi


In [22]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X = tfidf_vectorizer.fit_transform(df['all_text']).toarray()
y = df['target']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
# Display the shapes of the resulting arrays
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (32345, 5000)
X_test shape: (8087, 5000)
y_train shape: (32345,)
y_test shape: (8087,)


In [25]:
models = {
    "LinearSVC": LinearSVC(),
    "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LGBMClassifier": LGBMClassifier(),
    "CatBoostClassifier": CatBoostClassifier(silent=True),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "BernoulliNB": BernoulliNB(),
    "MultinomialNB": MultinomialNB()
}

In [26]:
# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"ROC AUC Score: {roc_auc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("-" * 50)

Model: LinearSVC
Accuracy: 0.875602819339681
F1 Score: 0.8745948641236599
ROC AUC Score: 0.8755886549601154
Precision: 0.8756864702945582
Recall: 0.8735059760956175
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4071
           1       0.88      0.87      0.87      4016

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087

--------------------------------------------------


Parameters: { "use_label_encoder" } are not used.



Model: XGBClassifier
Accuracy: 0.8530975639915914
F1 Score: 0.8489318413021363
ROC AUC Score: 0.8529494769631863
Precision: 0.8674636174636174
Recall: 0.8311752988047809
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.86      4071
           1       0.87      0.83      0.85      4016

    accuracy                           0.85      8087
   macro avg       0.85      0.85      0.85      8087
weighted avg       0.85      0.85      0.85      8087

--------------------------------------------------
[LightGBM] [Info] Number of positive: 16200, number of negative: 16145
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.163391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 174458
[LightGBM] [Info] Number of data points in the train set: 32345, number of used features: 3234
[



Model: AdaBoostClassifier
Accuracy: 0.7810065537282057
F1 Score: 0.7775405099861826
ROC AUC Score: 0.7809367112733052
Precision: 0.7845373891001267
Recall: 0.7706673306772909
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.79      0.78      4071
           1       0.78      0.77      0.78      4016

    accuracy                           0.78      8087
   macro avg       0.78      0.78      0.78      8087
weighted avg       0.78      0.78      0.78      8087

--------------------------------------------------
Model: BernoulliNB
Accuracy: 0.744404599975269
F1 Score: 0.7829465504567888
ROC AUC Score: 0.7456467424333617
Precision: 0.6769566006900308
Recall: 0.9282868525896414
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.56      0.69      4071
           1       0.68      0.93      0.78      4016

    accuracy                           0.74      8087
   macro avg    

In [61]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report

# Initialize and train the LinearSVC model
best_model = LinearSVC()
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Best Model: LinearSVC")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"ROC AUC Score: {roc_auc}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Best Model: LinearSVC
Accuracy: 0.875602819339681
F1 Score: 0.8745948641236599
ROC AUC Score: 0.8755886549601154
Precision: 0.8756864702945582
Recall: 0.8735059760956175
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4071
           1       0.88      0.87      0.87      4016

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087



In [62]:
import joblib

# Save the trained model
joblib.dump(best_model, 'linear_svc_model.pkl')
# Save the vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']