In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from datasets import Dataset 
import pandas as pd
import numpy as np
import torch
import evaluate 
import os 
from transformers import Trainer


In [18]:
data = pd.read_csv('../data/preprocessed_data.csv')
dataset = Dataset.from_pandas(data)

In [19]:
from transformers import AutoTokenizer

# Load your tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define the tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


In [20]:
tokenized_ds = dataset.map(tokenize_function, batched=True)
tokenized_ds = tokenized_ds.train_test_split(test_size=0.2)

small_train = tokenized_ds['train'].shuffle(seed=13).select(range(200))
small_test = tokenized_ds['test'].shuffle(seed=13).select(range(200))


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/Users/erum/LHL_LLM/notebooks/modelsosdel",
    eval_strategy='epoch',
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=4,          
    per_device_eval_batch_size=4,
    fp16=False,                             
    warmup_ratio=0.1,
    optim="adamw_torch",                    
    logging_dir="./logs",                   
    logging_steps=50,
    save_strategy="epoch"                   
)




In [22]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [23]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels)["f1"]
    }


In [28]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2  # Binary sentiment classification
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [30]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6846,0.653972,0.71,0.766129
2,0.4848,0.408856,0.815,0.781065
3,0.1534,0.293125,0.86,0.851064
4,0.0658,0.295693,0.895,0.894472




TrainOutput(global_step=200, training_loss=0.34715510964393614, metrics={'train_runtime': 100.0688, 'train_samples_per_second': 7.994, 'train_steps_per_second': 1.999, 'total_flos': 105973918924800.0, 'train_loss': 0.34715510964393614, 'epoch': 4.0})

In [25]:
trainer.evaluate()



{'eval_loss': 0.03512755408883095,
 'eval_accuracy': 0.995,
 'eval_f1': 0.9943502824858758,
 'eval_runtime': 5.3398,
 'eval_samples_per_second': 37.455,
 'eval_steps_per_second': 9.364,
 'epoch': 4.0}

After hyperparameter tuning, model locks into high accuracy and F1 with almost negligible training loss, suggesting a powerful fit on training data.

In [31]:
models = "./models"  # or use a full/relative path as needed


In [32]:
import os

ft_model = os.path.abspath(os.path.join(models, 'optimized_model'))
os.makedirs(ft_model, exist_ok=True)

model.save_pretrained(ft_model)
tokenizer.save_pretrained(ft_model)


('/Users/erum/LHL_LLM/notebooks/models/optimized_model/tokenizer_config.json',
 '/Users/erum/LHL_LLM/notebooks/models/optimized_model/special_tokens_map.json',
 '/Users/erum/LHL_LLM/notebooks/models/optimized_model/vocab.txt',
 '/Users/erum/LHL_LLM/notebooks/models/optimized_model/added_tokens.json',
 '/Users/erum/LHL_LLM/notebooks/models/optimized_model/tokenizer.json')

In [44]:
ft_model = os.path.abspath('./models/optimized_model')


In [33]:
# update labels
model.config.id2label = {0: "Negative", 1: "Positive"}
model.config.label2id = {"Negative": 0, "Positive": 1}

# Save the updated config with the model
model.save_pretrained("/Users/erum/LHL_LLM/notebooks/optimized_model")
tokenizer.save_pretrained("/Users/erum/LHL_LLM/notebooks/optimized_model")


('/Users/erum/LHL_LLM/notebooks/optimized_model/tokenizer_config.json',
 '/Users/erum/LHL_LLM/notebooks/optimized_model/special_tokens_map.json',
 '/Users/erum/LHL_LLM/notebooks/optimized_model/vocab.txt',
 '/Users/erum/LHL_LLM/notebooks/optimized_model/added_tokens.json',
 '/Users/erum/LHL_LLM/notebooks/optimized_model/tokenizer.json')