In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [23]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mkedw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('swahili_train.csv')

# Create a subset of 11,000 rows
subset_df = df.head(9000).copy()

# Display the first fe rows of the dataset
subset_df.head()

Unnamed: 0,premise,hypothesis,label
0,Kidhahania cream skimming ina vipimo viwili vy...,Bidhaa na jiografia ndizo hufanya skimming cre...,1
1,unajua wakati wa msimu na nadhani kwa kiwango ...,Unapoteza vitu kwa kiwango kifuatacho ikiwa wa...,0
2,Mmoja wa nambari zetu atatekeleza maagizo yako...,Mwanachama wa timu yangu atatekeleza maagizo y...,0
3,Unajuaje? Hii yote ni habari yao tena.,Habari hizi ni zao.,0
4,ndio nakuambia nini ingawa ukinunua baadhi ya ...,Viatu vya tenisi vina bei mbalimbali.,1


In [2]:
# Combine the 'premise' column and the 'hypothesis' column into one column
subset_df['combined_text'] = subset_df['premise'] + ' ' + subset_df['hypothesis']

In [3]:
# Define your list of Swahili stopwords
swahili_stopwords = [
    "akasema", "alikuwa", "alisema", "baada", "basi", "bila", "cha", "chini",
    "hadi", "hapo", "hata", "hivyo", "hiyo", "huku", "huo", "ili", "ilikuwa",
    "juu", "kama", "karibu", "katika", "kila", "kima", "kisha", "kubwa", "kutoka",
    "kuwa", "kwa", "kwamba", "kwenda", "kwenye", "la", "lakini", "mara", "mdogo",
    "mimi", "mkubwa", "mmoja", "moja", "muda", "mwenye", "na", "naye", "ndani",
    "ng", "ni", "nini", "nonkungu", "pamoja", "pia", "sana", "sasa", "sauti",
    "tafadhali", "tena", "tu", "vile", "wa", "wakati", "wake", "walikuwa", "wao",
    "watu", "wengine", "wote", "ya", "yake", "yangu", "yao", "yeye", "yule", "za", "zaidi", "zake"
]

In [4]:
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Define a Swahili lemmatizer
swahili_lemmatizer = WordNetLemmatizer()

# Function for Swahili text cleaning
def clean_swahili_text(text):
    # Convert to lowercase
    text = text.lower()
    
    words = text.split()
    
    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove punctuation and convert to lowercase
    words = [word.lower() for word in words if word not in string.punctuation]

    # Remove Swahili stopwords
    words = [word for word in words if word not in swahili_stopwords]

    # Lemmatization (you may need to create a custom lemmatization function)
    words = [swahili_lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

subset_df = subset_df.dropna(subset=['combined_text'])

# Apply Swahili text cleaning to the 'combined_text' column
subset_df['combined_text'] = subset_df['combined_text'].apply(lambda x: clean_swahili_text(x))

In [5]:
import pandas as pd

# Assuming your dataset is stored in a DataFrame called 'subset_df' and the column with class labels is 'label'
class_distribution = subset_df['label'].value_counts()

# Display the class distribution
print(class_distribution)

2    3331
0    3078
1    2589
Name: label, dtype: int64


In [6]:
from sklearn.utils import resample

# Oversample classes 0 and 1
class_0 = subset_df[subset_df['label'] == 0]
class_1 = subset_df[subset_df['label'] == 1]
class_2 = subset_df[subset_df['label'] == 2]

oversampled_class_0 = resample(class_0, replace=True, n_samples=len(class_2), random_state=42)
oversampled_class_1 = resample(class_1, replace=True, n_samples=len(class_2), random_state=42)

oversampled_df = pd.concat([oversampled_class_0, oversampled_class_1, class_2])
oversampled_df = oversampled_df.sample(frac=1, random_state=42)

In [7]:
# Check the class distribution in the oversampled dataset
class_distribution = oversampled_df['label'].value_counts()
print(class_distribution)

0    3331
1    3331
2    3331
Name: label, dtype: int64


In [10]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader 

# Tokenization function
def tokenize_text(text, tokenizer, max_length):
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')
    return tokens

# Preprocess your data
name = "Davlan/bert-base-multilingual-cased-finetuned-swahili"
tokenizer = BertTokenizer.from_pretrained(name)
max_length = 128  # You can adjust this based on your dataset

X = oversampled_df['combined_text'].astype(str).tolist()  # Use the oversampled data
y = oversampled_df['label']

# Tokenize and split the data
X_tokens = [tokenize_text(text, tokenizer, max_length) for text in X]
X_tensors = torch.cat([t['input_ids'] for t in X_tokens], dim=0)
attention_masks = torch.cat([t['attention_mask'] for t in X_tokens], dim=0)
y_tensor = torch.tensor(y.values)

# Calculate the minimum class count
min_class_count = min(train_class_distribution)

# Calculate the new test_size to maintain balance
test_size = min_class_count / len(y)

# Split your data into training and testing sets with the adjusted test_size
X_train, X_test, y_train, y_test, train_masks, test_masks = train_test_split(
    X_tensors, y_tensor, attention_masks, test_size=test_size, random_state=42, stratify=y_tensor
)

# Split your data into training and testing sets
# X_train, X_test, y_train, y_test, train_masks, test_masks = train_test_split(
#    X_tensors, y_tensor, attention_masks, test_size=0.2, random_state=42, stratify=y_tensor
# )

# Define a PyTorch DataLoader for training and testing
train_dataset_subset = TensorDataset(X_train, train_masks, y_train)
test_dataset_subset = TensorDataset(X_test, test_masks, y_test)
train_dataloader_subset = DataLoader(train_dataset_subset, batch_size=16, shuffle=True)
test_dataloader_subset = DataLoader(test_dataset_subset, batch_size=32)

In [11]:
import numpy as np

# Convert PyTorch tensors to NumPy arrays
y_train_np = y_train.numpy()
y_test_np = y_test.numpy()

# Calculate the class distribution in the training set
train_class_distribution = np.bincount(y_train_np)

# Calculate the class distribution in the testing set
test_class_distribution = np.bincount(y_test_np)

# Display the class distribution
print("Training Class Distribution:")
print(train_class_distribution)

print("\nTesting Class Distribution:")
print(test_class_distribution)

Training Class Distribution:
[2221 2221 2220]

Testing Class Distribution:
[1110 1110 1111]


In [13]:
# Section 2: Model Selection and Loading
from transformers import BertForSequenceClassification

# Load a pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('Davlan/bert-base-multilingual-cased-finetuned-swahili', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Davlan/bert-base-multilingual-cased-finetuned-swahili and are newly initialized: ['classifier.weight', 'bert.pooler.dense.weight', 'classifier.bias', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
import torch.optim as optim
import torch.nn as nn

# Define class weights (adjust as needed)
class_weights = [1.0, 1.0, 1.0]

# Convert class weights to a tensor
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Define a loss function (e.g., CrossEntropyLoss) with class weights
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Create an optimizer (e.g., Adam or Lamb)
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 5  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, data in enumerate(train_dataloader_subset, 0):
        inputs, masks, labels = data
        optimizer.zero_grad()
        
        outputs = model(inputs, attention_mask=masks)
        loss = criterion(outputs.logits, labels)  # Apply class weights
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss / (i + 1)}")


Epoch 1, Loss: 1.0610012429223643


In [38]:
from sklearn.metrics import classification_report, confusion_matrix
import torch

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set your model to evaluation mode
model.eval()

# Lists to store predictions and true labels
all_predictions = []
all_true_labels = []

# Iterate over the test data
for batch in test_dataloader_subset:
    input_ids, attention_mask, labels = batch

    # Move data to the device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)

    # Forward pass
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits

    # Apply softmax to get predicted probabilities
    probabilities = torch.softmax(logits, dim=1)

    # Get the predicted labels
    predicted_labels = torch.argmax(probabilities, dim=1).cpu().numpy()

    # Append to the lists
    all_predictions.extend(predicted_labels)
    all_true_labels.extend(labels.cpu().numpy())

# Calculate and print classification report
report = classification_report(all_true_labels, all_predictions, target_names=['label_0', 'label_1', 'label_2'])
print(report)

# Calculate and print confusion matrix
confusion = confusion_matrix(all_true_labels, all_predictions)
print("Confusion Matrix:")
print(confusion)

              precision    recall  f1-score   support

     label_0       0.81      0.67      0.74       888
     label_1       0.63      0.71      0.67       888
     label_2       0.57      0.60      0.58       888

    accuracy                           0.66      2664
   macro avg       0.67      0.66      0.66      2664
weighted avg       0.67      0.66      0.66      2664

Confusion Matrix:
[[597 116 175]
 [ 34 627 227]
 [105 252 531]]
