In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import warnings
import string
from nltk.corpus import stopwords
import nltk


In [2]:

# Download the NLTK stopwords corpus into the local nltk_data directory.
# NOTE(review): the cells visible in this notebook never read the NLTK
# stopword list -- the TF-IDF step uses scikit-learn's built-in
# stop_words='english' instead. Confirm whether this download is still needed.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:

# Silence all library warnings so the notebook output stays compact.
# NOTE(review): this is a blanket filter, so it also hides deprecation and
# convergence warnings; narrow it when a warning needs to be investigated.
warnings.filterwarnings('ignore')

# Load the review dataset (path is relative to the notebook's working directory).
df = pd.read_csv('CHP1_fakeReviewData.csv')
print(df.head())

# pandas reads empty CSV fields back as NaN, and TfidfVectorizer raises on NaN
# documents, so keep only rows that have both a review text and a label.
# `df` itself is left untouched for any other cell that relies on it.
labelled_reviews = df.dropna(subset=['text_', 'label'])

# Split the data into training and testing sets: 35% is held out and the seed
# is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    labelled_reviews['text_'],
    labelled_reviews['label'],
    test_size=0.35,
    random_state=42,
)



             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  \
0  love this  well made sturdy and very comfortab...   
1  love it a great upgrade from the original  ive...   
2  this pillow saved my back i love the look and ...   
3  missing information on how to use it but it is...   
4  very nice set good quality we have had the set...   

                                              tokens  \
0  ['love', 'well', 'made', 'sturdy', 'comfortabl...   
1  ['love', 'great', 'upgrade', 'original', 'ive'...   
2  ['pillow', 'saved', 'back', 'love', 'look', 'f...   
3  ['missing', 'information', 'use', 'great', 'pr...   
4  ['nice', 'set', 'good', 'quality', 'set', 'two...   

                                      processed_text  
0  love well made sturdy comfortable

In [4]:

# Define the pipeline with TfidfVectorizer and SVC. Keeping both steps in one
# Pipeline means the fitted vocabulary and the classifier are trained, applied
# and persisted together.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,  # Convert text to lowercase
        stop_words='english',  # Remove English stopwords (scikit-learn's built-in list)
        max_df=0.8,  # Ignore terms that appear in more than 80% of documents
        min_df=5,  # Ignore terms that appear in fewer than 5 documents
        ngram_range=(1, 2)  # Use unigrams and bigrams
    )),
    # Linear-kernel SVC. probability=True adds an internally cross-validated
    # calibration step whose data shuffling is random; random_state pins it so
    # predict_proba from the saved model is reproducible between runs.
    ('classifier', SVC(kernel='linear', probability=True, random_state=42))
])

# Train the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
svc_pred = pipeline.predict(X_test)

# Compute accuracy once and reuse it for every summary line below.
accuracy = accuracy_score(y_test, svc_pred)
accuracy_pct = str(np.round(accuracy * 100, 2)) + '%'

# Evaluate the model
print('Classification Report:\n', classification_report(y_test, svc_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, svc_pred))
print('Accuracy Score:', accuracy)
print('Model Prediction Accuracy:', accuracy_pct)

# Save the entire pipeline (including the vectorizer and model)
joblib.dump(pipeline, 'svc_pipeline.pkl')

print('\nSupport Vector Machines Model Accuracy:', accuracy_pct)

Classification Report:
               precision    recall  f1-score   support

          CG       0.91      0.89      0.90      7059
          OR       0.89      0.91      0.90      7034

    accuracy                           0.90     14093
   macro avg       0.90      0.90      0.90     14093
weighted avg       0.90      0.90      0.90     14093

Confusion Matrix:
 [[6258  801]
 [ 609 6425]]
Accuracy Score: 0.8999503299510395
Model Prediction Accuracy: 90.0%

Support Vector Machines Model Accuracy: 90.0%
