In [3]:
# 📌 Install necessary packages
!pip install transformers datasets sentencepiece

# ✅ Imports
from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import pandas as pd

# ✅ Load and prepare dataset
df = pd.read_csv("/kaggle/input/fake-reviews-dataset/fake reviews dataset.csv")
df = df.dropna(subset=['text_', 'label'])

# ✅ Generate explanation-based summary
def generate_target(row):
    if row['label'] == 1:
        return "Summary: " + row['text_'][:100] + " ... This review might be fake due to vague wording or repetition."
    else:
        return "Summary: " + row['text_'][:100] + " ... This review appears genuine and specific."

df['target'] = df.apply(generate_target, axis=1)

# ✅ Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df[['text_', 'target']].rename(columns={"text_": "text", "target": "summary"}))
dataset = dataset.train_test_split(test_size=0.1)

# ✅ Tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# ✅ Preprocessing function
def preprocess_function(examples):
    inputs = ["summarize: " + text for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized = dataset.map(preprocess_function, batched=True)

# ✅ Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=2,
)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
)

# ✅ Train the model
trainer.train()

# ✅ Save the model
model.save_pretrained("t5-fake-review-summarizer")
tokenizer.save_pretrained("t5-fake-review-summarizer")

# ✅ Load and summarize new review using fine-tuned model
model = T5ForConditionalGeneration.from_pretrained("t5-fake-review-summarizer")
tokenizer = T5Tokenizer.from_pretrained("t5-fake-review-summarizer")

text = "This product changed my life. It is the best ever. Highly recommended!!!"
input_text = "summarize: " + text
input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids

output = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))




2025-04-17 22:51:30.802425: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744930290.824752      99 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744930290.831698      99 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface

Map:   0%|          | 0/36388 [00:00<?, ? examples/s]

Map:   0%|          | 0/4044 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0447,0.048714
2,0.0362,0.04035
3,0.0343,0.040062




Summary: This product changed my life. It is the best ever. Highly recommended!!!... This review appears genuine and specific.


In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
!zip -r t5-fake-review-summarizer.zip t5-fake-review-summarizer


  adding: t5-fake-review-summarizer/ (stored 0%)
  adding: t5-fake-review-summarizer/added_tokens.json (deflated 83%)
  adding: t5-fake-review-summarizer/generation_config.json (deflated 29%)
  adding: t5-fake-review-summarizer/special_tokens_map.json (deflated 85%)
  adding: t5-fake-review-summarizer/config.json (deflated 62%)
  adding: t5-fake-review-summarizer/model.safetensors (deflated 9%)
  adding: t5-fake-review-summarizer/spiece.model (deflated 48%)
  adding: t5-fake-review-summarizer/tokenizer_config.json (deflated 94%)


In [5]:
!zip -r results.zip results


  adding: results/ (stored 0%)
  adding: results/checkpoint-6500/ (stored 0%)
  adding: results/checkpoint-6500/added_tokens.json (deflated 83%)
  adding: results/checkpoint-6500/rng_state.pth (deflated 25%)
  adding: results/checkpoint-6500/trainer_state.json (deflated 73%)
  adding: results/checkpoint-6500/generation_config.json (deflated 29%)
  adding: results/checkpoint-6500/optimizer.pt (deflated 7%)
  adding: results/checkpoint-6500/scheduler.pt (deflated 55%)
  adding: results/checkpoint-6500/special_tokens_map.json (deflated 85%)
  adding: results/checkpoint-6500/config.json (deflated 62%)
  adding: results/checkpoint-6500/training_args.bin (deflated 51%)
  adding: results/checkpoint-6500/model.safetensors (deflated 9%)
  adding: results/checkpoint-6500/spiece.model (deflated 48%)
  adding: results/checkpoint-6500/tokenizer_config.json (deflated 94%)
  adding: results/checkpoint-6825/ (stored 0%)
  adding: results/checkpoint-6825/added_tokens.json (deflated 83%)
  adding: resul