In [None]:
from google.colab import files
# Upload kaggle.json here.
# files.upload() returns a dict of {filename: file bytes}. Binding it to a name
# keeps the notebook from echoing that dict -- i.e. the raw API token -- into
# the saved cell output, where it would be shared with the notebook.
_uploaded = files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Install the Kaggle CLI; %pip installs into the environment of the running kernel
%pip install -q kaggle

# Move the uploaded API token into ~/.kaggle and restrict it to owner read/write
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
!kaggle datasets download -d mexwell/fake-reviews-dataset
# -o overwrites previously extracted files without asking, so re-running this
# cell never stalls on unzip's interactive "replace ...?" prompt
!unzip -o -q fake-reviews-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/mexwell/fake-reviews-dataset
License(s): Attribution 4.0 International (CC BY 4.0)
fake-reviews-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
replace fake reviews dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y


In [None]:
import pandas as pd

# Read the extracted CSV into a DataFrame
DATA_PATH = "fake reviews dataset.csv"
df = pd.read_csv(DATA_PATH)

# Show the column names followed by a preview of the first rows
print("Columns:", list(df.columns))
print("\nFirst few rows:")
preview = df.head()
print(preview)

Columns: ['category', 'rating', 'label', 'text_']

First few rows:
             category  rating label  \
0  Home_and_Kitchen_5     5.0    CG   
1  Home_and_Kitchen_5     5.0    CG   
2  Home_and_Kitchen_5     5.0    CG   
3  Home_and_Kitchen_5     1.0    CG   
4  Home_and_Kitchen_5     5.0    CG   

                                               text_  
0  Love this!  Well made, sturdy, and very comfor...  
1  love it, a great upgrade from the original.  I...  
2  This pillow saved my back. I love the look and...  
3  Missing information on how to use it, but it i...  
4  Very nice set. Good quality. We have had the s...  


In [None]:
# Keep the label and review-text columns, then drop rows that cannot be used:
# missing text, whitespace-only text, or a label outside the two known classes.
# The index is renumbered afterwards so it is contiguous again.
df = (
    df[['label', 'text_']]
    .loc[lambda frame: frame['text_'].notnull()]
    .loc[lambda frame: frame['text_'].str.strip() != '']
    .loc[lambda frame: frame['label'].isin(['CG', 'OR'])]
    .reset_index(drop=True)
)

# Encode labels numerically: CG (Computer Generated / Fake) -> 1, OR (Original / Real) -> 0
df = df.assign(label=df['label'].map({'CG': 1, 'OR': 0}))

# Report how many rows survived and how they split between the two classes
label_counts = df['label'].value_counts()
print(f"Cleaned dataset size: {df.shape}")
print("Label distribution:\n", label_counts)

Cleaned dataset size: (40432, 2)
Label distribution:
 label
1    20216
0    20216
Name: count, dtype: int64


In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus if not already present, then collect the
# English entries into a set for membership tests
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Cleaning function
def clean_text(text, stopword_set=None):
    """Normalize a review into a space-separated string of lowercase words.

    Steps: coerce to ``str`` and lowercase, turn every character that is not
    ``a-z`` or whitespace into a space, split on whitespace, and drop tokens
    found in the stopword set.

    Args:
        text: The raw review. Any object is accepted; it is passed through
            ``str()`` first.
        stopword_set: Optional container of tokens to remove. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    if stopword_set is None:
        stopword_set = stop_words

    lowered = str(text).lower()
    # Substitute a space rather than deleting: removing punctuation outright
    # glues neighbouring words together when no whitespace separates them
    # (e.g. "it!Very" used to become the single token "itvery").
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    kept = [word for word in letters_only.split() if word not in stopword_set]
    return ' '.join(kept)

# Run every review through clean_text and store the result in a new column
df['cleaned'] = df['text_'].map(clean_text)

# Discard reviews that cleaning reduced to an empty string, then renumber the rows
df = df.loc[df['cleaned'].str.strip() != ''].reset_index(drop=True)

# Sanity check
first_cleaned = df['cleaned'].iloc[0]
print("Example cleaned text:\n", first_cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Example cleaned text:
 love well made sturdy comfortable love itvery pretty


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF, limiting the vocabulary to at most 5000 terms
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the cleaned text.
# The result is kept as the sparse matrix returned by fit_transform: calling
# .toarray() here would allocate a dense rows x 5000 float array (about 1.6 GB
# for ~40k reviews), while train_test_split and RandomForestClassifier both
# accept sparse input directly.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so vocabulary and IDF weights are influenced by test rows.
# For a leakage-free evaluation, split first and fit on the training text only.
X = tfidf.fit_transform(df['cleaned'])

# Labels
y = df['label'].values

print("TF-IDF shape:", X.shape)


TF-IDF shape: (40431, 5000)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a fixed share of the rows for evaluation; the seed keeps the
# partition the same from run to run
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Report the size of each partition
train_shape = X_train.shape
test_shape = X_test.shape
print("Train shape:", train_shape)
print("Test shape:", test_shape)


Train shape: (32344, 5000)
Test shape: (8087, 5000)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train model.
# random_state pins the forest's randomness for reproducibility; n_jobs=-1
# builds the 100 trees in parallel on all available cores.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Score the held-out rows with the trained forest
y_pred = rf_model.predict(X_test)

# Compute both metrics first, then report them
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report)


Accuracy: 0.8519846667491034

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.83      0.85      4069
           1       0.84      0.87      0.85      4018

    accuracy                           0.85      8087
   macro avg       0.85      0.85      0.85      8087
weighted avg       0.85      0.85      0.85      8087

