In [2]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
# Source CSV for the reviews; it is read with columns "review" and "sentiment".
DATA_URL = "https://raw.githubusercontent.com/AbdlSi/IMDB-Sentiment-Analysis-BERT-Model-/refs/heads/main/data/processed/clean_data.csv"
df = pd.read_csv(DATA_URL)

# Rename the columns to the names the later cells rely on:
# "text" is what the tokenization step reads, "labels" is what gets
# class-encoded and used for the stratified splits.
dataset = (
    Dataset.from_pandas(df)
    .rename_column("sentiment", "labels")
    .rename_column("review", "text")
)

# Last expression of the cell: shows the dataset summary (features / num_rows).
dataset

Dataset({
    features: ['text', 'labels'],
    num_rows: 50000
})

In [3]:
model_path = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class-id <-> name mapping stored in the model config; the reverse mapping is
# derived from the forward one so the two can never disagree.
id2label = {0: "negative", 1: "positive"}
label2id = {label: idx for idx, label in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Freeze every parameter of the base (encoder) model; parameters outside
# `model.base_model` are left trainable.
for param in model.base_model.parameters():
  param.requires_grad = False

def preprocess_function(examples):
  """Tokenize the "text" field of a batch of examples.

  Args:
    examples: mapping holding the batch; only ``examples["text"]`` is read.

  Returns:
    Whatever the module-level ``tokenizer`` returns for those texts when
    called with ``truncation=True`` (no explicit ``max_length`` is passed,
    so the tokenizer's own limit applies).
  """
  texts = examples["text"]
  encoded = tokenizer(texts, truncation=True)
  return encoded


# Collator built from the same tokenizer; it is handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize the whole dataset in batches (no padding is requested here).
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
# Convert the "labels" column to a ClassLabel feature (required for
# `stratify_by_column` in the split cell).
# `class_encode_column` only accepts a plain Value column, so calling it again
# on an already-encoded column raises ValueError. The guard keeps this cell
# safe to re-run without restarting the kernel.
if not isinstance(tokenized_dataset.features["labels"], ClassLabel):
  tokenized_dataset = tokenized_dataset.class_encode_column("labels")

label_feature = tokenized_dataset.features["labels"]

# The integer ids assigned here are what the model is trained on, so they must
# line up with the id2label mapping given to the model config.
expected_names = [id2label[i] for i in range(len(id2label))]
assert label_feature.names == expected_names, (
    f"ClassLabel order {label_feature.names} does not match id2label {expected_names}"
)

print(label_feature)

Casting to class labels:   0%|          | 0/50000 [00:00<?, ? examples/s]

ClassLabel(names=['negative', 'positive'])


In [5]:
# Stage 1: hold out 20% of the data, stratified on the class label.
split1 = tokenized_dataset.train_test_split(
    test_size=0.2,
    seed=34,
    stratify_by_column="labels",
)
train_ds, mixed_ds = split1["train"], split1["test"]

# Stage 2: cut the held-out part in half, again stratified.
# Note the naming: the "train" half becomes the test set and the "test" half
# becomes the validation set.
split2 = mixed_ds.train_test_split(
    test_size=0.5,
    seed=54,
    stratify_by_column="labels",
)
test_ds, val_ds = split2["train"], split2["test"]

# Show the three resulting datasets in the order train / test / validation.
for split in (train_ds, test_ds, val_ds):
  print(split)

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 40000
})
Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})
Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5000
})


In [6]:
# Load the three metric objects used by `compute_metrics`, in the same order
# as before: accuracy, ROC AUC, F1.
accuracy, auc_score, f1_score = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "roc_auc", "f1")
)

def compute_metrics(eval_pred):
  """Compute accuracy, ROC AUC and F1 from raw model outputs.

  Args:
    eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is indexed
      as a 2-D array of per-class scores (logits); column 1 is treated as
      the positive class. ``labels`` holds the reference class ids.

  Returns:
    dict with keys ``"Accuracy"``, ``"AUC"`` and ``"F1"``, each rounded to
    three decimals.
  """
  predictions, labels = eval_pred

  # Numerically stable softmax: subtracting the row-wise maximum leaves the
  # result mathematically unchanged but keeps np.exp from overflowing to inf
  # (and the division from producing NaN) when logits are large.
  shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
  exp_scores = np.exp(shifted)
  probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)

  # ROC AUC is computed from the probability of the positive class (index 1).
  positive_class_probs = probabilities[:, 1]
  auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'], 3)

  # Accuracy and F1 are computed from hard predictions (arg-max over classes).
  predicted_classes = np.argmax(predictions, axis=1)
  acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'], 3)
  f1 = np.round(f1_score.compute(predictions=predicted_classes, references=labels, average="binary")["f1"], 3)
  return {"Accuracy": acc, "AUC": auc, "F1": f1}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
# Training Parameters
# (AdamW is already imported in the notebook's import cell; no re-import here.)
lr = 2e-5
batch_size = 32
num_epochs = 4

training_args = TrainingArguments(
    output_dir = "ber-sentiment-classifier_teacher",
    # Recorded so the saved training arguments show the learning rate that is
    # actually used; the optimizer itself is built explicitly below.
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    # Log, evaluate and checkpoint once per epoch; the best checkpoint is
    # reloaded when training finishes.
    logging_strategy = "epoch",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

# Hand the optimizer only the parameters that can actually receive gradients;
# the base-model parameters were frozen (requires_grad = False) earlier.
# NOTE(review): with the encoder frozen only the remaining head is trained, and
# the logged validation accuracy stays around 0.69 — consider a larger learning
# rate or unfreezing some encoder layers if that is not intended.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = AdamW(trainable_params, lr = lr)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = val_ds,
    data_collator = data_collator,
    # Custom optimizer, no custom scheduler (None lets the Trainer build its own).
    optimizers = (optimizer, None),
    compute_metrics = compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Auc,F1
1,0.679018,0.658455,0.669,0.74,0.706
2,0.655278,0.642497,0.681,0.775,0.731
3,0.641964,0.632944,0.695,0.785,0.737
4,0.634726,0.630936,0.69,0.787,0.739


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

TrainOutput(global_step=5000, training_loss=0.6527465942382813, metrics={'train_runtime': 1006.0826, 'train_samples_per_second': 159.033, 'train_steps_per_second': 4.97, 'total_flos': 4.206006175447872e+16, 'train_loss': 0.6527465942382813, 'epoch': 4.0})

In [9]:
# Final check on the held-out test split (not used during training or
# checkpoint selection); metrics come back as a dict.
test_results = trainer.evaluate(eval_dataset=test_ds)

print(test_results)



{'eval_loss': 0.6323181390762329, 'eval_Accuracy': 0.671, 'eval_AUC': 0.782, 'eval_F1': 0.724, 'eval_runtime': 30.8551, 'eval_samples_per_second': 162.825, 'eval_steps_per_second': 5.088, 'epoch': 4.0}


In [11]:
from google.colab import drive

drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/models/bert-imdb-finetuned"

# Write model + tokenizer twice: first to a directory relative to the working
# directory, then to the mounted Drive folder.
for target_dir in ("bert-imdb-finetuned", SAVE_DIR):
  trainer.save_model(target_dir)
  saved_files = tokenizer.save_pretrained(target_dir)

# Last expression of the cell: the file paths returned by the final
# (Drive) tokenizer save.
saved_files

Mounted at /content/drive


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/drive/MyDrive/models/bert-imdb-finetuned/tokenizer_config.json',
 '/content/drive/MyDrive/models/bert-imdb-finetuned/tokenizer.json')