### PyTorch Fine-tuning ###

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    DistilBertTokenizer,
    TrainingArguments,
    Trainer,
)
import torch
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
from datasets import Dataset

##################################################################################################################


def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: A pair that unpacks into ``(logits, labels)``. The predicted class
            of each row is the index of its largest logit along axis 1.

    Returns:
        dict: A single-entry mapping ``{"accuracy": score}`` where ``score``
        is whatever ``accuracy_score(labels, predictions)`` returns.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}


def collate_fn(batch):
    """Merge a list of per-example dicts into one batch dict of tensors.

    Args:
        batch: Iterable-by-index collection of mappings, each providing
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.

    Returns:
        dict: ``"input_ids"`` and ``"attention_mask"`` stacked with
        ``torch.stack`` (so a new leading dimension indexes the examples),
        and ``"labels"`` built with ``torch.tensor`` from the per-example
        label values.
    """

    def column(key):
        # Gather one field across every example, preserving batch order.
        return [example[key] for example in batch]

    stacked_ids = torch.stack(column("input_ids"))
    stacked_masks = torch.stack(column("attention_mask"))
    label_tensor = torch.tensor(column("labels"))

    collated = {}
    collated["input_ids"] = stacked_ids
    collated["attention_mask"] = stacked_masks
    collated["labels"] = label_tensor
    return collated


def process_labels(batch):
    """Attach integer class ids to a batch under the ``"labels"`` key.

    Each entry of ``batch["sentiment"]`` is looked up in the module-level
    ``label_mapping``. When an entry is a list, only its first element is
    used for the lookup. Entries absent from ``label_mapping`` become ``-1``.

    Args:
        batch: Mapping with a ``"sentiment"`` sequence; it is modified in
            place.

    Returns:
        The same ``batch`` object, now carrying a ``"labels"`` list.
    """

    def to_label_id(sentiment):
        # List-valued sentiments are represented by their first element.
        key = sentiment[0] if isinstance(sentiment, list) else sentiment
        return label_mapping.get(key, -1)

    batch["labels"] = [to_label_id(sentiment) for sentiment in batch["sentiment"]]
    return batch


def tokenize_function(examples):
    """Tokenize the ``"reviews_decoded"`` field of a batch.

    Delegates to the module-level ``tokenizer`` with truncation enabled and
    ``padding="max_length"``.

    Args:
        examples: Mapping providing the texts under ``"reviews_decoded"``.

    Returns:
        Whatever the ``tokenizer`` call returns for those texts.
    """
    texts = examples["reviews_decoded"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


##################################################################################################################

# --- Runtime setup -------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and classifier are loaded from the same checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
).to(device)

# Only the first `number_of_samples` rows of each split are used below;
# `debug` toggles the preview printout of both frames.
number_of_samples = 10
debug = True

# --- Load data -----------------------------------------------------------------
X_train = pd.read_csv("../data/Train_Test_splits/X_train_50proc_trunc_pad.csv")
X_test = pd.read_csv("../data/Train_Test_splits/X_test_50proc_trunc_pad.csv")
y_train = pd.read_csv("../data/Train_Test_splits/y_train_50proc.csv")
y_test = pd.read_csv("../data/Train_Test_splits/y_test_50proc.csv")

# Features and targets are joined column-wise (aligned on the frame index),
# then truncated to the configured sample count.
train_df = pd.concat([X_train, y_train], axis=1).head(number_of_samples)
test_df = pd.concat([X_test, y_test], axis=1).head(number_of_samples)

if debug:
    print(train_df.head())
    print()
    print(test_df.head())

# --- Build tokenized datasets --------------------------------------------------
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# String sentiment values -> integer class ids consumed by `process_labels`.
label_mapping = {
    "LABEL_0": 0,
    "LABEL_1": 1,
}

train_dataset = train_dataset.map(process_labels, batched=True)
test_dataset = test_dataset.map(process_labels, batched=True)

# `process_labels` encodes any sentiment missing from `label_mapping` as -1.
# Stop here with a clear message instead of passing an invalid class id on to
# training and evaluation.
for split_name, split in (("train", train_dataset), ("test", test_dataset)):
    if any(label == -1 for label in split["labels"]):
        raise ValueError(
            f"{split_name} split contains sentiment values missing from label_mapping"
        )

# Expose only the columns `collate_fn` reads, as torch tensors.
train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# --- Trainer configuration -----------------------------------------------------
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    gradient_accumulation_steps=2,
    # Mixed precision is tied to the selected device: requesting fp16 while
    # running on the CPU fallback is rejected by TrainingArguments, which would
    # make the CPU path of this script unusable.
    fp16=device.type == "cuda",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

# --- Evaluate, fine-tune, evaluate again ---------------------------------------
# Baseline: score the checkpoint on the test split before any training.
pretrained_results = trainer.evaluate()
print()
print(f"Pretrained model evaluation results: {pretrained_results}")
print(f"Accuracy: {pretrained_results['eval_accuracy']}")

trainer.train()

# Same evaluation after training, for a before/after comparison.
finetuned_results = trainer.evaluate()
print(f"Fine-tuned model evaluation results: {finetuned_results}")
print(f"Accuracy: {finetuned_results['eval_accuracy']}")

  from .autonotebook import tqdm as notebook_tqdm
2025-01-15 12:57:26.932874: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736942247.137399    3413 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736942247.199361    3413 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-15 12:57:27.656342: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


                                     reviews_decoded sentiment
0   one of the other reviewers has mentioned that...   LABEL_1
1   a wonderful little production.   the filming ...   LABEL_1
2   i thought this was a wonderful way to spend t...   LABEL_1
3   basically there's a family where a little boy...   LABEL_0
4   petter mattei's'love in the time of money'is ...   LABEL_1

                                     reviews_decoded sentiment
0   this movie was bad from the start. the only p...   LABEL_0
1   god, i never felt so insulted in my whole lif...   LABEL_0
2   not being a fan of the coen brothers or georg...   LABEL_1
3   the movie andaz apna apna in my books is the ...   LABEL_1
4   i have to say i was really looking forward on...   LABEL_0


Map: 100%|██████████| 10/10 [00:00<00:00, 178.27 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 227.72 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 2829.59 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 3097.03 examples/s]



Pretrained model evaluation results: {'eval_loss': 0.39291030168533325, 'eval_model_preparation_time': 0.0014, 'eval_accuracy': 0.8, 'eval_runtime': 1.4644, 'eval_samples_per_second': 6.829, 'eval_steps_per_second': 6.829}
Accuracy: 0.8


Step,Training Loss


Fine-tuned model evaluation results: {'eval_loss': 0.814533531665802, 'eval_model_preparation_time': 0.0014, 'eval_accuracy': 0.9, 'eval_runtime': 1.0543, 'eval_samples_per_second': 9.485, 'eval_steps_per_second': 9.485, 'epoch': 1.0}
Accuracy: 0.9


In [2]:
# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")

print("Model and tokenizer saved.")

Model and tokenizer saved.
