In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pickle
import nltk
from nltk.corpus import stopwords
import string
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Import SVM
from sklearn.metrics import accuracy_score, precision_score

In [2]:
# Fetch the NLTK resources used by the preprocessing cells below.
# 'punkt' serves older NLTK releases; NLTK 3.9+ loads the tokenizer model from
# 'punkt_tab' instead, so word_tokenize raises LookupError there without it.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\The
[nltk_data]     Printshop\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\The
[nltk_data]     Printshop\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the review CSV into a DataFrame and list the columns it provides
dataset_path = 'fake reviews dataset.csv'
df = pd.read_csv(dataset_path)
print(df.columns)

Index(['category', 'rating', 'label', 'text_'], dtype='object')


In [4]:
# Remove the 'category' column, which none of the later cells shown here read.
# Rebinding `df` with errors='ignore' keeps this cell safe to re-run: the
# in-place drop raised KeyError once the column was already gone.
df = df.drop(columns=['category'], errors='ignore')
print(df.columns)

Index(['rating', 'label', 'text_'], dtype='object')


In [5]:
# Print each column's dtype and non-null count to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40432 entries, 0 to 40431
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   rating  40432 non-null  float64
 1   label   40432 non-null  object 
 2   text_   40432 non-null  object 
dtypes: float64(1), object(2)
memory usage: 947.8+ KB


In [6]:
# Give the review-text column a cleaner name, then preview the first rows
column_renames = {'text_': 'text'}
df.rename(columns=column_renames, inplace=True)
df.head()

Unnamed: 0,rating,label,text
0,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,5.0,CG,"love it, a great upgrade from the original. I..."
2,5.0,CG,This pillow saved my back. I love the look and...
3,1.0,CG,"Missing information on how to use it, but it i..."
4,5.0,CG,Very nice set. Good quality. We have had the s...


In [7]:
# Function to encode the target labels
def encode_label(df):
    """Return a copy of ``df`` with a numeric ``target`` column added.

    The ``label`` column is mapped as 'CG' -> 1 (fake review) and
    'OR' -> 0 (real review). Any other label value, including a missing one,
    becomes NaN in ``target`` so the caller can decide how to handle it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``label`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns plus ``target``.

    Raises
    ------
    KeyError
        If ``df`` has no ``label`` column.
    """
    label_to_target = {
        'CG': 1,  # Fake review
        'OR': 0,  # Real review
    }
    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    return df.assign(target=df['label'].map(label_to_target))


In [8]:
# Apply the label encoding
df = encode_label(df)

# encode_label leaves NaN for any label outside its CG/OR mapping. Stop here
# rather than silently relabelling those rows as real reviews (0), which would
# corrupt the training target without any warning.
unmapped_labels = df.loc[df['target'].isna(), 'label'].unique().tolist()
if unmapped_labels:
    raise ValueError(f"Unrecognised label value(s): {unmapped_labels}")

# Every row is mapped at this point, so the integer cast cannot hit NaN.
df['target'] = df['target'].astype(int)
df.head()

Unnamed: 0,rating,label,text,target
0,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1
1,5.0,CG,"love it, a great upgrade from the original. I...",1
2,5.0,CG,This pillow saved my back. I love the look and...,1
3,1.0,CG,"Missing information on how to use it, but it i...",1
4,5.0,CG,Very nice set. Good quality. We have had the s...,1


In [9]:
# Preprocessing: tokenization, stop-word removal, stemming.
# `ps` is the shared Porter stemmer that transform_text uses to stem each token.
ps = PorterStemmer()

In [10]:
_ENGLISH_STOPWORDS = None  # filled in on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def transform_text(text):
    """Normalise one review into a space-separated string of stemmed tokens.

    The text is lower-cased and split with ``nltk.word_tokenize``. Tokens that
    are not purely alphanumeric (``str.isalnum``) or that are English stop
    words are dropped; the rest are stemmed with the module-level stemmer
    ``ps`` and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The surviving stems joined by spaces ("" if nothing survives).
    """
    # One cached frozenset replaces a stopwords.words('english') call, plus a
    # linear list scan, for every single token.
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())

    # Keep alphanumeric, non-stop-word tokens and stem them
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]

    return " ".join(stems)

# Preview the frame; transform_text has only been defined here and is applied in the next cell
df.head()

Unnamed: 0,rating,label,text,target
0,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1
1,5.0,CG,"love it, a great upgrade from the original. I...",1
2,5.0,CG,This pillow saved my back. I love the look and...,1
3,1.0,CG,"Missing information on how to use it, but it i...",1
4,5.0,CG,Very nice set. Good quality. We have had the s...,1


In [11]:
# Run every review through transform_text and keep the result in a new column
df['transformed_text'] = df['text'].map(transform_text)

In [12]:
# TF-IDF Vectorization
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split in the following cell, so its vocabulary and IDF weights are learned
# from the test rows as well. Fitting on the training rows only would give a
# cleaner estimate of held-out performance.
tfidf = TfidfVectorizer(max_features=3000)

# Keep the sparse matrix returned by fit_transform. Calling .toarray() builds a
# dense (n_reviews x 3000) float64 array (about 0.9 GiB for the 40,432 rows
# here), while train_test_split and SVC both accept sparse input directly.
X = tfidf.fit_transform(df['transformed_text'])
y = df['target'].values

In [13]:
# Hold out part of the rows for evaluation; the fixed seed keeps the split repeatable
test_fraction = 0.2
split_seed = 32
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [14]:
# Fit a support vector classifier on the training split
svm_kernel = 'linear'  # other options include 'rbf', 'poly' and 'sigmoid'
svm_model = SVC(kernel=svm_kernel)
svm_model.fit(X_train, y_train)

In [15]:
# Predict on the held-out rows, then score accuracy and class-weighted precision
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')

In [16]:
# Report both metrics, one per line
print(f"Accuracy: {accuracy}", f"Precision: {precision}", sep="\n")

Accuracy: 0.8615061209348337
Precision: 0.8616278288338828


In [17]:
# Persist the fitted classifier and its vectorizer so they can be reloaded together
artifacts = {
    'svm_model2.pkl': svm_model,
    'svm_vectorizer2.pkl': tfidf,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)