<a href="https://colab.research.google.com/github/Andreas-Lukito/twitter-airline-sentiment-analysis/blob/dev%2Fandreas/colab_notebooks/03albert-base-v2-fine-tune-balanced.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Albert Fine-Tune Model for Predicting Sentiment

## Iport Libraries

In [None]:
# Common Python Libraries
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Deep Learning Libraries
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AlbertTokenizerFast, AlbertForSequenceClassification
from torch.optim import Adam

# Data Preprocessing
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from sklearn.preprocessing import OrdinalEncoder

# Model metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

from google.colab import drive
drive.mount('/content/drive')

project_path = "/content/drive/MyDrive/airline_sentiment_analysis"
SEED = 42

model_name = "albert-base-v2"

Mounted at /content/drive


## Choose Device

In [2]:
# Detect available device
if torch.cuda.is_available():
    # check if ROCm backend is active
    if torch.version.hip is not None:
        backend = "ROCm"
    else:
        backend = "CUDA"

    device = torch.device("cuda")
    print(f"PyTorch is using GPU: {torch.cuda.get_device_name(0)}")
    print(f"Backend: {backend}")
else:
    device = torch.device("cpu")
    print("PyTorch is not using GPU — running on CPU")

PyTorch is using GPU: NVIDIA L4
Backend: CUDA


## Import Data

In [None]:
before_date = "2025-11"

# Data path
cleaned_data_path = os.path.join(project_path,f"news_cache/{before_date}/csv/")
clean_cached_file = os.path.join(cleaned_data_path, f"{before_date}_clean_news_data.csv")

# Import Data
news_data = pd.read_csv(filepath_or_buffer=clean_cached_file, sep=',')

In [None]:
news_data.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,length,clean_text
0,0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),4,What said.
1,1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),9,plus you've added commercials to the experienc...
2,2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),12,I didn't today... Must mean I need to take ano...
3,3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),17,"it's really aggressive to blast obnoxious ""ent..."
4,4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),10,and it's a really big bad thing about it


## Split the data to Train, Test, and Validation

In [None]:
test_size = 0.20
val_size = 0.10

# Splitting the data into train and temp (which will be further split into validation and test)
train_df, test_df = train_test_split(news_data, test_size=test_size, random_state=SEED)

# Splitting train into validation and test sets
train_df, val_df = train_test_split(train_df, test_size=val_size, random_state=SEED)

In [6]:
# # Split to x and y values
x_train = train_df[["clean_text"]]
y_train = train_df[["airline_sentiment"]]

x_test = test_df["clean_text"].tolist()
y_test = test_df[["airline_sentiment"]]

x_val = val_df["clean_text"].tolist()
y_val = val_df[["airline_sentiment"]]


## Data Preprocessing

### Tokenizer for the text

In [11]:
tokenizer = AlbertTokenizerFast.from_pretrained(model_name)

class sentiment_text(torch.utils.data.Dataset): # create a class that behaves like torch.utils.data.Dataset
    def __init__(self, texts, labels, tokenizer):
        self.encodings = tokenizer( # converts raw text -> model input
                                    texts,
                                    truncation = True,
                                    padding = True,
                                    max_length = 256 # since the max length of the tweets are around 35 - 40 words
                                )

        # get the labels
        self.labels = labels

    def __getitem__(self, index): # so that pytorch can get the data (returns one sample of the data)
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()} # self.encoding stores (input_ids, attention_mask, label)
        item["labels"] = torch.tensor(self.labels[index], dtype=torch.long) # get the label on the chosen index while converting to a torch tensor format
        return item

    def __len__(self): #to get the length of the data (used when batching)
        return len(self.labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [None]:
train_dataset = sentiment_text(x_train, y_train, tokenizer)
test_dataset  = sentiment_text(x_test, y_test, tokenizer)
val_dataset  = sentiment_text(x_val, y_val, tokenizer)

### Data Loader for the Model

In [13]:

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=32)
val_loader  = DataLoader(val_dataset, batch_size=32)

## Train Baseline Model

In [None]:
model = AlbertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1 # Since This is Regression
)

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
optimizer = Adam(model.parameters(), lr=5e-5)

In [16]:
model.to(device)
model.train() #make the model to training mode

for epoch in tqdm(range(3), desc="Training Albert Model", unit="epoch"):  # number of epochs

    for batch in train_loader:
        for k, v in batch.items():
            batch[k] = v.to(device)

        optimizer.zero_grad() # Resets all gradients to zero before computing new ones.
        outputs = model(**batch)  # forward pass
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} loss: {loss.item()}")

Training Albert Model:  33%|███▎      | 1/3 [00:23<00:46, 23.20s/epoch]

Epoch 1 loss: 0.7642505168914795


Training Albert Model:  67%|██████▋   | 2/3 [00:45<00:22, 22.74s/epoch]

Epoch 2 loss: 0.5422011017799377


Training Albert Model: 100%|██████████| 3/3 [01:08<00:00, 22.76s/epoch]

Epoch 3 loss: 0.28575804829597473





## Model Evaluation

In [None]:
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device) 
            outputs = model(inputs)

            all_preds.extend(outputs.view(-1).cpu().numpy())
            all_labels.extend(labels.view(-1).cpu().numpy())

    # Compute metrics
    mse = mean_squared_error(all_labels, all_preds)
    mae = mean_absolute_error(all_labels, all_preds)
    rmse = root_mean_squared_error(all_labels, all_preds)
    r2 = r2_score(all_labels, all_preds)

    return mse, mae, rmse, r2

In [None]:
mse, mae, rmse, r2 = evaluate_model(
                                    model,
                                    test_loader,
                                    device
                                    )


print(f"mse       = {mse:.4f}")
print(f"mae       = {mae:.4f}")
print(f"rmse      = {rmse:.4f}")
print(f"r²        = {r2:.4f}")