In [None]:
# Install required libraries
!pip install transformers==4.50.3 datasets torch pandas numpy scikit-learn spacy matplotlib plotly ipywidgets tqdm jsonlines --upgrade accelerate

In [None]:
import transformers

# Sanity check: show which transformers release the kernel actually imported.
print(f"{transformers.__version__}")

In [None]:
# Mount Google Drive so the paths under /content/drive used by this notebook resolve.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import Dataset
import torch
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Locations of the fastText-formatted train/test splits on the mounted Drive.
# Adjust file paths as needed:
train_file, test_file = (
    '/content/drive/MyDrive/AmazonReviews/train.ft.txt',
    '/content/drive/MyDrive/AmazonReviews/test.ft.txt',
)

In [None]:
import re
from datasets import Dataset, load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# --- Step 1: Load the fastText-formatted dataset more efficiently --- #

def load_ft_dataset_efficient(file_path):
    """
    Load a fastText-formatted dataset using a generator for memory efficiency.

    Each usable line has the form ``__label__<x> <text>``. ``__label__1`` is
    mapped to class 0; every other ``__label__*`` token is mapped to class 1.

    Lines that cannot be interpreted are skipped instead of being guessed at:
      * blank lines and lines with a label but no text,
      * lines whose first token does not start with ``__label__`` (previously
        these were silently labelled as class 1, polluting the data).

    Args:
        file_path: Path to a UTF-8 text file in fastText supervised format.

    Returns:
        The object produced by ``Dataset.from_generator`` over records with
        the keys ``"text"`` and ``"label"``.
    """
    def data_generator(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                # Split only once: everything after the first space is review text.
                parts = line.strip().split(" ", 1)
                if len(parts) < 2:
                    continue
                label_str, text = parts
                # A first token without the fastText prefix is not a label at all.
                if not label_str.startswith("__label__"):
                    continue
                label = 0 if label_str == "__label__1" else 1
                yield {"text": text, "label": label}

    return Dataset.from_generator(data_generator, gen_kwargs={"file_path": file_path})

# Build both splits with the same loader and report how many rows each contains.
print("Loading training dataset...")
train_dataset = load_ft_dataset_efficient(file_path=train_file)
print("Number of training samples:", train_dataset.num_rows)

print("Loading testing dataset...")
test_dataset = load_ft_dataset_efficient(file_path=test_file)
print("Number of testing samples:", test_dataset.num_rows)

# --- Step 2: Tokenize the Dataset --- #

# Tokenizer for the same "distilbert-base-uncased" checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased"
)

def tokenize_function(example):
    """Tokenize the "text" field, padding or truncating every sequence to 128 tokens.

    Args:
        example: Mapping with a "text" entry; it is handed to the module-level
            ``tokenizer`` unchanged.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize in batches, drop the raw "text" column the model does not consume,
# and expose the targets under "labels", the column name the Trainer expects.
tokenized_train = (
    train_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_test = (
    test_dataset
    .map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Have both splits hand out PyTorch tensors for training.
tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# --- Step 3: Fine-Tune DistilBERT --- #

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a 2-label classification head and
# place it on the selected device.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
).to(device)

# Hyper-parameters for the fine-tuning run; checkpoints are written to Drive.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/AmazonReviews/distilbert_finetuned",
    # `eval_strategy` is the current argument name; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers releases.
    eval_strategy="epoch",          # Evaluate on the test split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=500,
    save_total_limit=2,             # Keep only the two most recent checkpoints
    # Mixed precision is a CUDA feature: only request it when the GPU was
    # selected, so the CPU fallback chosen for `device` remains usable.
    fp16=device.type == "cuda",
    dataloader_num_workers=4,       # Adjust based on the number of CPU cores
    # The deprecated `use_mps_device` flag was dropped: `device` is only ever
    # "cuda" or "cpu" here, so it always evaluated to False (the default).
)

# Wire the model, the hyper-parameters and both tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_test,
    train_dataset=tokenized_train,
)

# Run the fine-tuning loop.
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side in one directory.
model.save_pretrained(
    save_directory="/content/drive/MyDrive/AmazonReviews/distilbert_finetuned"
)
tokenizer.save_pretrained(
    save_directory="/content/drive/MyDrive/AmazonReviews/distilbert_finetuned"
)
print("Fine-tuning completed and model saved.")