In [1]:
!pip install --upgrade transformers datasets;

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, transformers
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstal

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import re
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch
import shutil

In [23]:
#uploaded = files.upload()
file_path =  "/kaggle/input/fake-reviews-dataset/fake reviews dataset.csv"

def load_kaggle_dataset(file_path):
    df = pd.read_csv(file_path)
    df.columns = [col.strip().lower() for col in df.columns]
    print("Columns found:", df.columns.tolist())
    return df

In [24]:
# Text cleaning
def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    return ""

In [25]:
# Label conversion
def assign_labels(df):
    if 'label' not in df.columns or 'text_' not in df.columns:
        raise ValueError("Dataset must contain 'text_' and 'label' columns. Found: {}".format(df.columns.tolist()))

    label_mapping = {'OR': 0, 'CG': 1}
    df = df[df['text_'].notnull()]
    df['label'] = df['label'].map(label_mapping)

    if df['label'].isnull().any():
        raise ValueError("Label conversion failed — check for invalid labels in your data.")

    df['text'] = df['text_'].apply(preprocess_text)
    return df[['label', 'text']]


In [26]:
# Train-validation-test split
def split_dataset(df):
    train, temp = train_test_split(df, test_size=0.3, stratify=df['label'], random_state=42)
    val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)
    return train, val, test

In [27]:
# Tokenization and formatting for Hugging Face Datasets
def prepare_hf_dataset(train, val, test):
    tokenizer = BertTokenizer.from_pretrained('SravaniNirati/bert_fake_review_detection')

    def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)

    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    
    train_dataset = Dataset.from_pandas(train)
    val_dataset = Dataset.from_pandas(val)
    test_dataset = Dataset.from_pandas(test)

    train_dataset = train_dataset.map(tokenize, batched=True)
    val_dataset = val_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    columns_to_return = ['input_ids', 'attention_mask', 'label']
    train_dataset.set_format(type='torch', columns=columns_to_return)
    val_dataset.set_format(type='torch', columns=columns_to_return)
    test_dataset.set_format(type='torch', columns=columns_to_return)

    return train_dataset, val_dataset, test_dataset, tokenizer


In [28]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {'accuracy': accuracy_score(labels, predictions)}

In [29]:
# Evaluate pretrained model and return instance for saving
def evaluate_pretrained_model(test_dataset):
    model = BertForSequenceClassification.from_pretrained('SravaniNirati/bert_fake_review_detection')
    trainer = Trainer(model=model, eval_dataset=test_dataset, compute_metrics=compute_metrics)
    results = trainer.evaluate()
    print("Evaluation Results:", results)
    return model


In [30]:
def fine_tune_model(train_dataset, val_dataset, tokenizer):
    model = BertForSequenceClassification.from_pretrained('SravaniNirati/bert_fake_review_detection')

    training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=100,
    save_steps=200,
    eval_steps=200,
    save_total_limit=1
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return model


In [31]:
# Save model locally (Kaggle version)
def save_and_download_model(model, tokenizer, path='bert_fake_review_model'):
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)
    shutil.make_archive(path, 'zip', path)
    print(f"✅ Model saved and zipped at: {path}.zip — You can download it from the right-side file panel.")


In [33]:
# Main execution flow
if __name__ == "__main__":
    file_path = "/kaggle/input/fake-reviews-dataset/fake reviews dataset.csv"  
    df = load_kaggle_dataset(file_path)
    df_clean = assign_labels(df)
    train, val, test = split_dataset(df_clean)
    train_dataset, val_dataset, test_dataset, tokenizer = prepare_hf_dataset(train, val, test)
    model = evaluate_pretrained_model(test_dataset)
    save_and_download_model(model, tokenizer)
    print("✅ Evaluation complete and model zipped for download!")


Columns found: ['category', 'rating', 'label', 'text_']


Map:   0%|          | 0/28302 [00:00<?, ? examples/s]

Map:   0%|          | 0/6065 [00:00<?, ? examples/s]

Map:   0%|          | 0/6065 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Evaluation Results: {'eval_loss': 5.204216003417969, 'eval_model_preparation_time': 0.0026, 'eval_accuracy': 0.20131904369332235, 'eval_runtime': 89.1135, 'eval_samples_per_second': 68.059, 'eval_steps_per_second': 8.517}
✅ Model saved and zipped at: bert_fake_review_model.zip — You can download it from the right-side file panel.
✅ Evaluation complete and model zipped for download!


In [None]:
# !pip install -U transformers

In [34]:
# Main Execution
if __name__ == "__main__":
    file_path = "/kaggle/input/fake-reviews-dataset/fake reviews dataset.csv"
    df = load_kaggle_dataset(file_path)

    # Inspect label distribution (optional)
    print("\nLabel distribution:\n", df['label'].value_counts())

    df_clean = assign_labels(df)
    train, val, test = split_dataset(df_clean)
    train_dataset, val_dataset, test_dataset, tokenizer = prepare_hf_dataset(train, val, test)

    model = fine_tune_model(train_dataset, val_dataset, tokenizer)

    # Evaluate on test set
    trainer = Trainer(model=model, eval_dataset=test_dataset, compute_metrics=compute_metrics)
    test_results = trainer.evaluate()
    print("📊 Test Accuracy after Fine-tuning:", test_results)

    save_and_download_model(model, tokenizer)
    print("✅ All done!")

Columns found: ['category', 'rating', 'label', 'text_']

Label distribution:
 label
CG    20216
OR    20216
Name: count, dtype: int64


Map:   0%|          | 0/28302 [00:00<?, ? examples/s]

Map:   0%|          | 0/6065 [00:00<?, ? examples/s]

Map:   0%|          | 0/6065 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
100,0.519
200,0.2122
300,0.2256
400,0.1574
500,0.1264
600,0.1254
700,0.1343
800,0.1402
900,0.1311
1000,0.1161


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


📊 Test Accuracy after Fine-tuning: {'eval_loss': 0.2128247320652008, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.9699917559769168, 'eval_runtime': 90.5011, 'eval_samples_per_second': 67.016, 'eval_steps_per_second': 8.387}
✅ Model saved and zipped at: bert_fake_review_model.zip — You can download it from the right-side file panel.
✅ All done!
