In [11]:
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [2]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [3]:
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

In [4]:
pd.set_option('max_colwidth', None)

Load dataset

In [5]:
from google.colab import files

# Choose the file from your local machine
dt = files.upload()


Saving fake reviews dataset.csv to fake reviews dataset.csv


In [6]:
df = pd.read_csv('fake reviews dataset.csv', names=['category', 'rating', 'label', 'text'])

In [10]:
print(df.shape)

(40433, 4)


In [9]:
df.head()

Unnamed: 0,category,rating,label,text
0,category,rating,label,text_
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I"


In [None]:
df['label'].value_counts()

CG       20216
OR       20216
label        1
Name: label, dtype: int64

In [None]:
df['text'] = df['text'].str.replace('\n', ' ')

In [None]:
df['target'] = np.where(df['label']=='CG', 1, 0)

In [None]:
df['target'].value_counts()

0    20217
1    20216
Name: target, dtype: int64

Create features from punctuation

In [None]:
def punctuation_to_features(df, column):
    """Identify punctuation within a column and convert to a text representation.

    Args:
        df (object): Pandas dataframe.
        column (string): Name of column containing text.

    Returns:
        df[column]: Original column with punctuation converted to text,
                    i.e. "Wow! > "Wow exclamation"

    """

    df[column] = df[column].replace('!', ' exclamation ')
    df[column] = df[column].replace('?', ' question ')
    df[column] = df[column].replace('\'', ' quotation ')
    df[column] = df[column].replace('\"', ' quotation ')

    return df[column]

In [None]:
df['text'] = punctuation_to_features(df, 'text')

Tokenize the data

In [None]:
nltk.download('punkt');

def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]

    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
df['tokenized'] = df.apply(lambda x: tokenize(x['text']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized
0,category,rating,label,text_,0,[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]"


Stopword removal

In [None]:
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed.

    Args:
        column: Pandas dataframe column of tokenized data from tokenize()

    Returns:
        tokens (list): Tokenized list with stopwords removed.

    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [None]:
df['stopwords_removed'] = df.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed
0,category,rating,label,text_,0,[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]"


In [None]:
def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.

    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.

    Returns:
        tokens (list): Tokenized list with words Porter stemmed.

    """

    stemmer = PorterStemmer()
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [None]:
df['porter_stemmed'] = df.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
df.head()

Unnamed: 0,category,rating,label,text,target,tokenized,stopwords_removed,porter_stemmed
0,category,rating,label,text_,0,[],[],[]
1,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfortable. I love it!Very pretty",1,"[Love, this, Well, made, sturdy, and, very, comfortable, I, love, it, Very, pretty]","[Love, Well, made, sturdy, comfortable, I, love, Very, pretty]","[love, well, made, sturdi, comfort, i, love, veri, pretti]"
2,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I've had mine for a couple of years",1,"[love, it, a, great, upgrade, from, the, original, I, had, mine, for, a, couple, of, years]","[love, great, upgrade, original, I, mine, couple, years]","[love, great, upgrad, origin, i, mine, coupl, year]"
3,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and feel of this pillow.,1,"[This, pillow, saved, my, back, I, love, the, look, and, feel, of, this, pillow]","[This, pillow, saved, back, I, love, look, feel, pillow]","[thi, pillow, save, back, i, love, look, feel, pillow]"
4,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it is a great product for the price! I",1,"[Missing, information, on, how, to, use, it, but, it, is, a, great, product, for, the, price, I]","[Missing, information, use, great, product, price, I]","[miss, inform, use, great, product, price, i]"


Rejoin words

In [None]:
def rejoin_words(tokenized_column):
    return ( " ".join(tokenized_column))

In [None]:
df['all_text'] = df.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)

In [None]:
df[['all_text']].head()

Unnamed: 0,all_text
0,
1,love well made sturdi comfort i love veri pretti
2,love great upgrad origin i mine coupl year
3,thi pillow save back i love look feel pillow
4,miss inform use great product price i


Create training and test data

In [None]:
X = df['all_text']
y = df['target']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, shuffle=True)

Run the model selection process

In [None]:
classifiers = {}
classifiers.update({"XGBClassifier": XGBClassifier(eval_metric='logloss',
                                                   objective='binary:logistic',
                                                   )})
classifiers.update({"CatBoostClassifier": CatBoostClassifier(silent=True)})
classifiers.update({"LinearSVC": LinearSVC()})
classifiers.update({"MultinomialNB": MultinomialNB()})
classifiers.update({"LGBMClassifier": LGBMClassifier()})
classifiers.update({"RandomForestClassifier": RandomForestClassifier()})
classifiers.update({"DecisionTreeClassifier": DecisionTreeClassifier()})
classifiers.update({"ExtraTreeClassifier": ExtraTreeClassifier()})
classifiers.update({"AdaBoostClassifier": AdaBoostClassifier()})
classifiers.update({"KNeighborsClassifier": KNeighborsClassifier()})
classifiers.update({"RidgeClassifier": RidgeClassifier()})
classifiers.update({"SGDClassifier": SGDClassifier()})
classifiers.update({"BaggingClassifier": BaggingClassifier()})
classifiers.update({"BernoulliNB": BernoulliNB()})

In [None]:
df_models = pd.DataFrame(columns=['model', 'run_time', 'roc_auc', 'roc_auc_std'])

for key in classifiers:

    start_time = time.time()
    pipeline = Pipeline([("tfidf", TfidfVectorizer()), ("clf", classifiers[key] )])
    cv = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')

    row = {'model': key,
           'run_time': format(round((time.time() - start_time)/60,2)),
           'roc_auc': cv.mean(),
           'roc_auc_std': cv.std(),
    }

    df_models = df_models.append(row, ignore_index=True)

df_models = df_models.sort_values(by='roc_auc', ascending=False)

  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)


[LightGBM] [Info] Number of positive: 16173, number of negative: 16173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.124950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 176208
[LightGBM] [Info] Number of data points in the train set: 32346, number of used features: 3262
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 16173, number of negative: 16173
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.869087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 171382
[LightGBM] [Info] Number of data points in the train set: 32346, number of used features: 3141
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> inits

  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)
  df_models = df_models.append(row, ignore_index=True)


In [None]:
df_models

Unnamed: 0,model,run_time,roc_auc,roc_auc_std
11,SGDClassifier,0.13,0.925414,0.009009
1,CatBoostClassifier,46.31,0.922939,0.01016
2,LinearSVC,0.15,0.922387,0.012612
10,RidgeClassifier,0.13,0.92226,0.013279
4,LGBMClassifier,1.23,0.918352,0.010481
0,XGBClassifier,3.6,0.916804,0.010358
5,RandomForestClassifier,6.8,0.912067,0.013549
3,MultinomialNB,0.11,0.901807,0.019811
12,BaggingClassifier,13.93,0.856358,0.009985
8,AdaBoostClassifier,0.94,0.844253,0.020968


In [None]:
bundled_pipeline = Pipeline([("tfidf", TfidfVectorizer()),
                             ("clf", SGDClassifier())
                            ])
bundled_pipeline.fit(X_train, y_train)
y_pred = bundled_pipeline.predict(X_test)

In [None]:
accuracy_score = accuracy_score(y_test, y_pred)
precision_score = precision_score(y_test, y_pred)
recall_score = recall_score(y_test, y_pred)
roc_auc_score = roc_auc_score(y_test, y_pred)

In [None]:
print('Accuracy:', accuracy_score)
print('Precision:', precision_score)
print('Recall:', recall_score)
print('ROC/AUC:', roc_auc_score)

Accuracy: 0.8692497938994229
Precision: 0.8966175195143105
Recall: 0.8392596200681929
ROC/AUC: 0.8697219219081544


Saving the model

In [None]:
import pickle
filename = 'model.pkl'
pickle.dump(bundled_pipeline, open(filename, 'wb'))

In [None]:
from google.colab import drive
drive.mount('/content/drive')


**TESTING OUR MODEL ON OTHER AMAZON DATASETS**

In [None]:
import joblib

In [None]:
model = joblib.load('model.pkl')

In [None]:
from google.colab import files

# Choose the file from your local machine
pt = files.upload()

Saving 7817_1.csv to 7817_1.csv


In [None]:
amazon_review = pd.read_csv('7817_1.csv')
#amazon_review = pd.DataFrame('7817_1.csv', columns=['category', 'rating', 'reviews.text'])



In [None]:
amazon_review = pd.DataFrame(amazon_review, columns=['category', 'rating', 'reviews.text'])

In [None]:
amazon_review.head()

Unnamed: 0,category,rating,reviews.text
0,,,I initially had trouble deciding between the p...
1,,,Allow me to preface this with a little history...
2,,,I am enjoying it so far. Great for reading. Ha...
3,,,I bought one of the first Paperwhites and have...
4,,,I have to say upfront - I don't like coroporat...


In [None]:
amazon_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1597 entries, 0 to 1596
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   category      0 non-null      float64
 1   rating        0 non-null      float64
 2   reviews.text  1597 non-null   object 
dtypes: float64(2), object(1)
memory usage: 37.6+ KB


In [None]:
def punctuation_to_features(amazon_review, column):
    """Identify punctuation within a column and convert to a text representation.

    Args:
        df (object): Pandas dataframe.
        column (string): Name of column containing text.

    Returns:
        df[column]: Original column with punctuation converted to text,
                    i.e. "Wow! > "Wow exclamation"

    """

    amazon_review[column] = amazon_review[column].replace('!', ' exclamation ')
    amazon_review[column] = amazon_review[column].replace('?', ' question ')
    amazon_review[column] = amazon_review[column].replace('\'', ' quotation ')
    amazon_review[column] = amazon_review[column].replace('\"', ' quotation ')

    return amazon_review[column]

In [None]:
amazon_review['reviews.text'] = punctuation_to_features(amazon_review, 'reviews.text')

In [None]:
nltk.download('punkt');

def tokenize(column):
    """Tokenizes a Pandas dataframe column and returns a list of tokens.

    Args:
        column: Pandas dataframe column (i.e. df['text']).

    Returns:
        tokens (list): Tokenized list, i.e. [Donald, Trump, tweets]

    """

    tokens = nltk.word_tokenize(column)
    return [w for w in tokens if w.isalpha()]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
amazon_review['tokenized'] = amazon_review.apply(lambda x: tokenize(x['reviews.text']), axis=1)
amazon_review.head()

Unnamed: 0,category,rating,reviews.text,tokenized
0,,,I initially had trouble deciding between the p...,"[I, initially, had, trouble, deciding, between..."
1,,,Allow me to preface this with a little history...,"[Allow, me, to, preface, this, with, a, little..."
2,,,I am enjoying it so far. Great for reading. Ha...,"[I, am, enjoying, it, so, far, Great, for, rea..."
3,,,I bought one of the first Paperwhites and have...,"[I, bought, one, of, the, first, Paperwhites, ..."
4,,,I have to say upfront - I don't like coroporat...,"[I, have, to, say, upfront, I, do, like, corop..."


In [None]:
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def remove_stopwords(tokenized_column):
    """Return a list of tokens with English stopwords removed.

    Args:
        column: Pandas dataframe column of tokenized data from tokenize()

    Returns:
        tokens (list): Tokenized list with stopwords removed.

    """
    stops = set(stopwords.words("english"))
    return [word for word in tokenized_column if not word in stops]

In [None]:
amazon_review['stopwords_removed'] = amazon_review.apply(lambda x: remove_stopwords(x['tokenized']), axis=1)
amazon_review.head()

Unnamed: 0,category,rating,reviews.text,tokenized,stopwords_removed
0,,,I initially had trouble deciding between the p...,"[I, initially, had, trouble, deciding, between...","[I, initially, trouble, deciding, paperwhite, ..."
1,,,Allow me to preface this with a little history...,"[Allow, me, to, preface, this, with, a, little...","[Allow, preface, little, history, I, casual, r..."
2,,,I am enjoying it so far. Great for reading. Ha...,"[I, am, enjoying, it, so, far, Great, for, rea...","[I, enjoying, far, Great, reading, Had, origin..."
3,,,I bought one of the first Paperwhites and have...,"[I, bought, one, of, the, first, Paperwhites, ...","[I, bought, one, first, Paperwhites, pleased, ..."
4,,,I have to say upfront - I don't like coroporat...,"[I, have, to, say, upfront, I, do, like, corop...","[I, say, upfront, I, like, coroporate, hermeti..."


In [None]:
def apply_stemming(tokenized_column):
    """Return a list of tokens with Porter stemming applied.

    Args:
        column: Pandas dataframe column of tokenized data with stopwords removed.

    Returns:
        tokens (list): Tokenized list with words Porter stemmed.

    """

    stemmer = PorterStemmer()
    return [stemmer.stem(word).lower() for word in tokenized_column]

In [None]:
amazon_review['porter_stemmed'] = amazon_review.apply(lambda x: apply_stemming(x['stopwords_removed']), axis=1)
amazon_review.head()

Unnamed: 0,category,rating,reviews.text,tokenized,stopwords_removed,porter_stemmed
0,,,I initially had trouble deciding between the p...,"[I, initially, had, trouble, deciding, between...","[I, initially, trouble, deciding, paperwhite, ...","[i, initi, troubl, decid, paperwhit, voyag, re..."
1,,,Allow me to preface this with a little history...,"[Allow, me, to, preface, this, with, a, little...","[Allow, preface, little, history, I, casual, r...","[allow, prefac, littl, histori, i, casual, rea..."
2,,,I am enjoying it so far. Great for reading. Ha...,"[I, am, enjoying, it, so, far, Great, for, rea...","[I, enjoying, far, Great, reading, Had, origin...","[i, enjoy, far, great, read, had, origin, fire..."
3,,,I bought one of the first Paperwhites and have...,"[I, bought, one, of, the, first, Paperwhites, ...","[I, bought, one, first, Paperwhites, pleased, ...","[i, bought, one, first, paperwhit, pleas, cons..."
4,,,I have to say upfront - I don't like coroporat...,"[I, have, to, say, upfront, I, do, like, corop...","[I, say, upfront, I, like, coroporate, hermeti...","[i, say, upfront, i, like, coropor, hermet, cl..."


In [None]:
def rejoin_words(tokenized_column):
    return ( " ".join(tokenized_column))

In [None]:
amazon_review['reviews.text'] = amazon_review.apply(lambda x: rejoin_words(x['porter_stemmed']), axis=1)

In [None]:
X_new = amazon_review['reviews.text']
y_new = amazon_review['rating']

In [None]:
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new,  random_state=1, shuffle=True)

In [None]:
predictions = model.predict(X_new)

In [None]:
accuracy = accuracy_score(y_new, predictions)
report = classification_report(y, predictions)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

ValueError: Found input variables with inconsistent numbers of samples: [40433, 1597]