In [1]:
# Importing required libraries
import pandas as pd
import string
import numpy as np
import openpyxl
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC

# Downloading required NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\30abh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\30abh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\30abh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Defining preprocessing functions
def clean_text(text):
    if not isinstance(text, str) or text.strip() == "":
        return ""
    stop_words = set(stopwords.words('english'))
    nopunc = ''.join([char for char in text if char.isalnum() or char.isspace()])
    words = word_tokenize(nopunc)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def stem_and_lemmatize(text):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in text.split()]
    return ' '.join(words)


In [5]:
# Load and preprocess the dataset
dataset_path = 'fake-reviews-dataset.xlsx'
df = pd.read_excel(dataset_path)

df = df.dropna(subset=['text_'])
df['text_'] = df['text_'].astype(str)

# preprocessing
df['cleaned_text'] = df['text_'].apply(clean_text)
df['processed_text'] = df['cleaned_text'].apply(stem_and_lemmatize)

# Display the first few rows to verify preprocessing
df.head()


Unnamed: 0,category,rating,label,text_,cleaned_text,processed_text
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor...",Love Well made sturdy comfortable love itVery ...,love well made sturdi comfort love itveri pretti
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I...",love great upgrade original Ive mine couple years,love great upgrad origin ive mine coupl year
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...,pillow saved back love look feel pillow,pillow save back love look feel pillow
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i...",Missing information use great product price,miss inform use great product price
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...,nice set Good quality set two months,nice set good qualiti set two month


In [7]:
# Split the dataset
# 'label' column is assumed to contain the target variable
X = df['processed_text']
y = df['label']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Display dataset sizes
print("Training size:", len(X_train))
print("Testing size:", len(X_test))


Training size: 26280
Testing size: 14152


In [9]:
# Create and train the model pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC(kernel='linear'))
])

# Train the model
pipeline.fit(X_train, y_train)

# Display training completion
print("Model training completed.")



Model training completed.


In [11]:
# Make predictions and evaluate the model
predictions = pipeline.predict(X_test)

# Display evaluation metrics
print("Classification Report:\n", classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Accuracy Score:", accuracy_score(y_test, predictions))


Classification Report:
               precision    recall  f1-score   support

          CG       0.88      0.87      0.87      7121
          OR       0.87      0.87      0.87      7031

    accuracy                           0.87     14152
   macro avg       0.87      0.87      0.87     14152
weighted avg       0.87      0.87      0.87     14152

Confusion Matrix:
 [[6165  956]
 [ 880 6151]]
Accuracy Score: 0.8702656868287167
