In [1]:
!pip install transformers torch pandas sentencepiece



Fine Tuning AfroLM model to ASAG

In [2]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
import matplotlib.pyplot as plt
import numpy as np


# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define dataset class
class GradingDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data.reset_index(drop=True)  # Reset index to avoid KeyError
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.loc[idx, "Translated Desired Answer"]
        student_answer = self.data.loc[idx, "Translated Student Answer"]
        average_score = self.data.loc[idx, "Average Score"]

        encoding = self.tokenizer.encode_plus(
            question,
            student_answer,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "average_score": average_score
        }

# Load the dataset
data_path = "/content/translated_data_google_twi.csv"
df = pd.read_csv(data_path)

df = df.dropna(axis=1, how='all')

# Split the dataset into train, validation, and test sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(val_df, test_size=0.5, random_state=42)


# Initialize the tokenizer and model
model_name = "bonadossou/afrolm_active_learning"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 256
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)

# Define dataset parameters
max_length = 256
batch_size = 8
num_epochs = 6
learning_rate = 2e-5

# Create datasets and data loaders
train_dataset = GradingDataset(train_df, tokenizer, max_length)
val_dataset = GradingDataset(val_df, tokenizer, max_length)
test_dataset = GradingDataset(test_df, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

# Fine-tuning loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0

    for batch in tqdm(train_loader, desc="Epoch {} - Training".format(epoch + 1)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        average_scores = batch["average_score"].to(device).float()  # Convert to float

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores = outputs.logits.squeeze()

        loss = loss_fn(predicted_scores, average_scores)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Validation
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Epoch {} - Validation".format(epoch + 1)):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            average_scores = batch["average_score"].to(device).float()  # Convert to float

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predicted_scores = outputs.logits.squeeze()

            loss = loss_fn(predicted_scores, average_scores)

            val_loss += loss.item()

    val_loss /= len(val_loader)

    # Print training and validation metrics for each epoch
    print("Epoch {}/{}".format(epoch + 1, num_epochs))
    print("Train Loss: {:.4f}".format(train_loss))
    print("Val Loss: {:.4f}".format(val_loss))
    print()

# Evaluation on test set
model.eval()
test_loss = 0
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        average_scores = batch["average_score"].to(device).float()  # Convert to float

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores = outputs.logits.squeeze()

        loss = loss_fn(predicted_scores, average_scores)

        test_loss += loss.item()
        predictions.extend(predicted_scores.tolist())

test_loss /= len(test_loader)

# Convert predictions to binary labels (e.g., pass/fail)
binary_predictions = [1 if score >= 0.5 else 0 for score in predictions]
binary_labels = [1 if score >= 0.5 else 0 for score in test_df["Average Score"]]

# Calculate accuracy and mean absolute error (MAE)
accuracy = accuracy_score(binary_labels, binary_predictions)
mae = mean_absolute_error(test_df["Average Score"], predictions)



Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at bonadossou/afrolm_active_learning and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Training:   2%|▏         | 5/228 [00:02<01:34,  2.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 - Training:  18%|█▊        | 40/228 [00:14<01:06,  2.83it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1 - Training:  24%|██▍

Epoch 1/6
Train Loss: 1.6836
Val Loss: 1.3040



Epoch 2 - Training:  22%|██▏       | 50/228 [00:16<01:00,  2.96it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 - Training:  39%|███▊      | 88/228 [00:29<00:47,  2.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 - Training:  46%|████▌     | 104/228 [00:35<00:42,  2.94it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2 - Training:  51%|█████▏    | 117/228 [00:39<00:38,  2.85it/s]Be aware, overflowing tokens are not returned fo

Epoch 2/6
Train Loss: 1.2588
Val Loss: 1.1738



Epoch 3 - Training:  18%|█▊        | 42/228 [00:14<01:03,  2.93it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 - Training:  28%|██▊       | 63/228 [00:21<00:55,  2.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 - Training:  32%|███▏      | 73/228 [00:24<00:53,  2.92it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3 - Training:  34%|███▍      | 77/228 [00:26<00:51,  2.91it/s]Be aware, overflowing tokens are not returned for 

Epoch 3/6
Train Loss: 1.1149
Val Loss: 1.2359



Epoch 4 - Training:   2%|▏         | 4/228 [00:01<01:17,  2.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4 - Training:  30%|██▉       | 68/228 [00:23<00:54,  2.95it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4 - Training:  36%|███▌      | 81/228 [00:27<00:50,  2.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4 - Training:  53%|█████▎    | 121/228 [00:41<00:36,  2.95it/s]Be aware, overflowing tokens are not returned for 

Epoch 4/6
Train Loss: 0.9720
Val Loss: 0.9824



Epoch 5 - Training:   6%|▌         | 14/228 [00:04<01:13,  2.93it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5 - Training:  16%|█▌        | 37/228 [00:12<01:04,  2.97it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5 - Training:  22%|██▏       | 50/228 [00:17<01:01,  2.91it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5 - Training:  22%|██▏       | 51/228 [00:17<01:00,  2.92it/s]Be aware, overflowing tokens are not returned for 

Epoch 5/6
Train Loss: 0.8858
Val Loss: 1.1672



Epoch 6 - Training:  11%|█         | 24/228 [00:08<01:09,  2.93it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 6 - Training:  14%|█▎        | 31/228 [00:10<01:06,  2.94it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 6 - Training:  23%|██▎       | 53/228 [00:18<01:00,  2.90it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 6 - Training:  58%|█████▊    | 132/228 [00:44<00:32,  2.91it/s]Be aware, overflowing tokens are not returned for

Epoch 6/6
Train Loss: 0.7509
Val Loss: 1.0768



Testing:  14%|█▍        | 4/29 [00:00<00:02, 11.02it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Testing:  62%|██████▏   | 18/29 [00:01<00:01, 10.84it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Testing:  83%|████████▎ | 24/29 [00:02<00:00, 10.79it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Testing: 100%|██████████| 29/29 [00:02<00:00, 11.01it/s]


In [3]:
print("Accuracy: {:.4f}".format(accuracy))
print("Mean Absolute Error (MAE): {:.4f}".format(mae))
print("Mean Squared Error (MSE): {:.4f}".format(test_loss))


Accuracy: 1.0000
Mean Absolute Error (MAE): 0.6328
Mean Squared Error (MSE): 0.8459


In [4]:
model.eval()
predictions = []

# Load the test dataset
test_data_path = "/content/trans_test_data_google_twi_full.csv"
df_test = pd.read_csv(test_data_path)
df_test = df_test.dropna(axis=1, how='all')

test_dataset = GradingDataset(df_test, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_scores = outputs.logits.squeeze()

      # Convert predicted_scores to a NumPy array and then round up the scores to two decimal places
        rounded_predictions = [min(round(score.item(), 2), 5.0) for score in predicted_scores]


        predictions.extend(rounded_predictions)

# Round up the predicted scores to two decimal places
rounded_predictions = [round(score, 2) for score in predictions]



Testing:  33%|███▎      | 19/57 [00:01<00:04,  9.21it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Testing:  39%|███▊      | 22/57 [00:02<00:03,  9.25it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Testing:  42%|████▏     | 24/57 [00:02<00:03,  9.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the re

In [5]:

df_test["Predicted Score"] = rounded_predictions

# Calculate the mean absolute error of the test set

true_scores = df_test['Average Score'].values
predicted_scores = df_test['Predicted Score'].values

mae = mean_absolute_error(true_scores, predicted_scores)
print("Mean Absolute Error:", mae)

# Drop the "score_avg" column
df_test.drop('Average Score', axis=1, inplace=True)


# Save the predictions to a CSV file
output_csv_path = "/content/predicted_test_csv_twi.csv"
df_test.to_csv(output_csv_path, index=False)

# Convert predictions and labels to numpy arrays
predictions = np.array(predicted_scores)

# labels = np.array(test_df["Average Score"])
labels = np.array(true_scores)


# Create a scatter plot
plt.figure(figsize=(8, 6))
plt.scatter(labels, predictions, label='Predicted Scores')
plt.xlabel("True Scores")
plt.ylabel("Predicted Scores")
plt.title("True Scores vs Predicted Scores")

# Add a line plot for reference
plt.plot([min(labels), max(labels)], [min(labels), max(labels)], color='red', linestyle='--', label='Ideal Line')

plt.legend()
plt.tight_layout()

# Save the plot as a PNG file
plt.savefig("/content/scatter_plot.png", dpi=300)  # Replace "scatter_plot.png" with your desired file name/path
plt.close()

Mean Absolute Error: 0.7285274725274725
