In [17]:
from datasets import load_dataset

# Load the Swahili news dataset
dataset = load_dataset('community-datasets/swahili_news')

In [18]:
import re

def clean_text(example):
    text = example['text']
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove non-alphabetic characters except spaces
    text = re.sub(r'[^a-zA-ZäöüÄÖÜßẞ\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()
    example['text'] = text
    return example

# Apply the cleaning function
dataset = dataset.map(clean_text)


In [19]:
# Get the unique labels
unique_labels = dataset['train'].unique('label')
print("Unique Labels:", unique_labels)

Unique Labels: [0, 1, 2, 3, 4, 5]


In [20]:
def map_labels_to_binary(example):
    if example['label'] == 1:
        example['label'] = 1
    else:
        example['label'] = 0
    return example

# Apply the mapping function
dataset = dataset.map(map_labels_to_binary)


Map: 100%|██████████| 22207/22207 [00:08<00:00, 2690.55 examples/s]
Map: 100%|██████████| 7338/7338 [00:04<00:00, 1721.78 examples/s]


In [21]:
from collections import Counter

# Calculate label distribution
label_counts = Counter(dataset['train']['label'])
print("Label Distribution in Training Set:", label_counts)


Label Distribution in Training Set: Counter({0: 12052, 1: 10155})


In [22]:
from datasets import DatasetDict

# Assuming the dataset has only a 'train' split, we'll create our own splits
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)
test_valid = dataset['test'].train_test_split(test_size=0.5, seed=42)

# Create a DatasetDict
dataset = DatasetDict({
    'train': dataset['train'],
    'validation': test_valid['train'],
    'test': test_valid['test'],
})


In [23]:
from transformers import AutoTokenizer

# Load the tokenizer from the first fine-tuning step
tokenizer = AutoTokenizer.from_pretrained('./swahili-xlmr-finetuned-100k')

def tokenize_function(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 17765/17765 [00:36<00:00, 488.46 examples/s]
Map: 100%|██████████| 2221/2221 [00:03<00:00, 717.28 examples/s]
Map: 100%|██████████| 2221/2221 [00:03<00:00, 622.41 examples/s]


In [24]:
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])


In [25]:
from transformers import AutoModelForSequenceClassification

# Load the model from the first fine-tuning step
model = AutoModelForSequenceClassification.from_pretrained(
    './swahili-xlmr-finetuned-100k',
    num_labels=2,
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at ./swahili-xlmr-finetuned-100k and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='/datasets/mdawood/results_binary_classification',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Adjust based on your GPU memory
    per_device_eval_batch_size=8,
    num_train_epochs=3,             # Adjust based on your needs
    weight_decay=0.01,
    logging_dir='/datasets/mdawood/logs_binary_classification',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)




In [27]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to probabilities using softmax
    probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    # Get the predicted class (0 or 1)
    predictions = np.argmax(probs, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    compute_metrics=compute_metrics,
)


In [29]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msayfullah-jumoorty[0m ([33msayf[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3043,0.297799,0.919406,0.911254,0.889642,0.933943
2,0.2308,0.26878,0.92751,0.918233,0.917766,0.918699
3,0.1681,0.298946,0.926159,0.917505,0.908367,0.926829


TrainOutput(global_step=6663, training_loss=0.25240587884020416, metrics={'train_runtime': 4694.018, 'train_samples_per_second': 11.354, 'train_steps_per_second': 1.419, 'total_flos': 3505625923852800.0, 'train_loss': 0.25240587884020416, 'epoch': 3.0})

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe278633910, execution_count=29 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe278633a60, raw_cell="trainer.train()" store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X15sdnNjb2RlLXJlbW90ZQ%3D%3D> result=TrainOutput(global_step=6663, training_loss=0.25240587884020416, metrics={'train_runtime': 4694.018, 'train_samples_per_second': 11.354, 'train_steps_per_second': 1.419, 'total_flos': 3505625923852800.0, 'train_loss': 0.25240587884020416, 'epoch': 3.0})>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [30]:
# Save the fine-tuned model
trainer.save_model('/datasets/mdawood/swahili-xlmr-binary-classification-100k')

# Save the tokenizer
tokenizer.save_pretrained('/datasets/mdawood/swahili-xlmr-binary-classification-100k')

Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fe1cf4e65c0, raw_cell="# Save the fine-tuned model
trainer.save_model('/d.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X22sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

('/datasets/mdawood/swahili-xlmr-binary-classification-100k/tokenizer_config.json',
 '/datasets/mdawood/swahili-xlmr-binary-classification-100k/special_tokens_map.json',
 '/datasets/mdawood/swahili-xlmr-binary-classification-100k/tokenizer.json')

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe1cf4e5690, execution_count=30 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe1cf4e65c0, raw_cell="# Save the fine-tuned model
trainer.save_model('/d.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X22sdnNjb2RlLXJlbW90ZQ%3D%3D> result=('/datasets/mdawood/swahili-xlmr-binary-classification-100k/tokenizer_config.json', '/datasets/mdawood/swahili-xlmr-binary-classification-100k/special_tokens_map.json', '/datasets/mdawood/swahili-xlmr-binary-classification-100k/tokenizer.json')>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [31]:
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])
print("Test Results:", test_results)


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fe2784bffd0, raw_cell="test_results = trainer.evaluate(eval_dataset=token.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X16sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Test Results: {'eval_loss': 0.2684372365474701, 'eval_accuracy': 0.9275101305718145, 'eval_f1': 0.9206505667816658, 'eval_precision': 0.9229249011857708, 'eval_recall': 0.9183874139626352, 'eval_runtime': 47.8165, 'eval_samples_per_second': 46.448, 'eval_steps_per_second': 5.814, 'epoch': 3.0}
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe2784bdc30, execution_count=31 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe2784bffd0, raw_cell="test_results = trainer.evaluate(eval_dataset=token.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X16sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [32]:
print(f"Test Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"Test F1 Score: {test_results['eval_f1']:.4f}")
print(f"Test Precision: {test_results['eval_precision']:.4f}")
print(f"Test Recall: {test_results['eval_recall']:.4f}")


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fe1d1e289d0, raw_cell="print(f"Test Accuracy: {test_results['eval_accurac.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Test Accuracy: 0.9275
Test F1 Score: 0.9207
Test Precision: 0.9229
Test Recall: 0.9184
Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe1cf73f7c0, execution_count=32 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe1d1e289d0, raw_cell="print(f"Test Accuracy: {test_results['eval_accurac.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X20sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given

In [33]:
import torch

# Get a batch of test examples
test_samples = tokenized_datasets['test'][:5]  # Adjust as needed

# Move inputs to device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
inputs = {k: v.to(device) for k, v in test_samples.items() if k in ['input_ids', 'attention_mask']}

# Get outputs with attentions
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True)
    attentions = outputs.attentions  # Tuple of attention tensors

# Process attentions for visualization


Error in callback <bound method _WandbInit._resume_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for pre_run_cell), with arguments args (<ExecutionInfo object at 7fe1d1ef0220, raw_cell="import torch

# Get a batch of test examples
test_.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X21sdnNjb2RlLXJlbW90ZQ%3D%3D>,),kwargs {}:


TypeError: _WandbInit._resume_backend() takes 1 positional argument but 2 were given

Error in callback <bound method _WandbInit._pause_backend of <wandb.sdk.wandb_init._WandbInit object at 0x7fe1d49334f0>> (for post_run_cell), with arguments args (<ExecutionResult object at 7fe1d1ef2f20, execution_count=33 error_before_exec=None error_in_exec=None info=<ExecutionInfo object at 7fe1d1ef0220, raw_cell="import torch

# Get a batch of test examples
test_.." store_history=True silent=False shell_futures=True cell_id=vscode-notebook-cell://tunnel%2Bmsl-daggy/home-mscluster/mdawood/nlp/SecondFineTune.ipynb#X21sdnNjb2RlLXJlbW90ZQ%3D%3D> result=None>,),kwargs {}:


TypeError: _WandbInit._pause_backend() takes 1 positional argument but 2 were given