## Environment setup and installations

In [None]:
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-68pp50wt
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-68pp50wt
  Resolved https://github.com/openai/whisper.git to commit 90db0de1896c23cbfaf0c58bc2d30665f709f170
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
!pip install transformers datasets tokenizers
import io
import os
import zipfile
import requests
import pandas as pd
import torch
import whisper
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
# Load the BERT model and tokenizer for corrections
from transformers import BertTokenizer, BertForMaskedLM, pipeline



In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under
# /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading data

In [None]:
class AsantiTwiDataset(torch.utils.data.Dataset):
    """Map-style dataset of (audio file path, transcription) pairs.

    Constructing an instance downloads a zip archive, extracts it into the
    current working directory, writes a cleaned copy of the archive's CSV and
    loads that copy into ``self.df``.

    Args:
        zip_url (str): URL of the zip archive to download.
        csv_filename (str): Path of the CSV file once the archive is extracted.
        audio_base_path (str): Directory that each row's ``path`` is joined onto.
        processor: Stored on the instance; not used by this class.
        device (str): Stored on the instance; not used by this class.

    Raises:
        requests.HTTPError: If the download returns an error status.
        ValueError: If the cleaned CSV lacks a ``path`` or ``sentence`` column.
    """

    def __init__(self, zip_url, csv_filename, audio_base_path, processor=None, device="cpu"):
        self.device = device
        self.audio_base_path = audio_base_path
        self.processor = processor

        # Download the archive fully into memory and extract it into the CWD.
        response = requests.get(zip_url, stream=True)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
            zip_ref.extractall('.')

        # The cleaned copy is written to the CWD as "cleaned_<basename>" and read
        # back immediately, so a later instance reusing the same basename can
        # overwrite the file without affecting this instance's DataFrame.
        cleaned_csv_filename = f"cleaned_{os.path.basename(csv_filename)}"
        self.clean_csv(csv_filename, cleaned_csv_filename)

        # Load the cleaned CSV
        self.df = pd.read_csv(cleaned_csv_filename)

        # Normalise header names: strip whitespace and map the known headers
        # onto the 'path' / 'sentence' names used by __getitem__.
        column_mapping = {
            "Audio Filepath": "path",
            "Transcription": "sentence",
        }
        self.df = self.df.rename(columns=lambda x: column_mapping.get(x.strip(), x.strip()))

        # Verify required columns
        if 'path' not in self.df.columns or 'sentence' not in self.df.columns:
            raise ValueError("CSV file must contain 'path' and 'sentence' columns.")

    def __len__(self):
        """Returns the total number of samples in the dataset."""
        return len(self.df)

    def __getitem__(self, index):
        """Returns the sample at a positional index.

        Args:
            index (int): Row position in ``self.df``.

        Returns:
            dict: ``{'audio_path': <audio_base_path joined with the row's path>,
            'sentence': <the row's sentence>}``.
        """
        row = self.df.iloc[index]
        audio_path = os.path.join(self.audio_base_path, row['path'])
        return {'audio_path': audio_path, 'sentence': row['sentence']}

    @staticmethod
    def clean_csv(input_path, output_path):
        """
        Cleans a CSV file by:
        - Replacing tab characters with commas.
        - Removing the "lacuna-audios-{train,test}/asanti-twi/audios/" prefixes.
        - Filtering rows whose field count differs from the first line's.

        Rows whose text contains extra commas are dropped by the field-count
        filter. Files are read and written as UTF-8 so the result does not
        depend on the platform's default encoding. An empty input file
        produces an empty output file.
        """
        with open(input_path, "r", encoding="utf-8") as infile:
            lines = infile.readlines()

        path_prefixes = (
            "lacuna-audios-train/asanti-twi/audios/",
            "lacuna-audios-test/asanti-twi/audios/",
        )
        clean_lines = []
        for line in lines:
            line = line.replace("\t", ",")
            for prefix in path_prefixes:
                line = line.replace(prefix, "")
            clean_lines.append(line)

        # The first line (the header) defines how many fields a valid row has.
        if clean_lines:
            expected_fields = clean_lines[0].count(",") + 1
            valid_lines = [line for line in clean_lines if line.count(",") + 1 == expected_fields]
        else:
            valid_lines = []

        with open(output_path, "w", encoding="utf-8") as outfile:
            outfile.writelines(valid_lines)


# Dataset URLs and paths
# Each split is described by three values: the URL of its zip archive, the path of
# the CSV inside the extracted archive, and the directory audio paths are joined onto.
train_zip_url = "https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-90p.zip"
train_csv_filename = "fisd-asanti-twi-90p/data.csv"
train_audio_base_path = "fisd-asanti-twi-90p/audios"

test_zip_url = "https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-10p.zip"
test_csv_filename = "fisd-asanti-twi-10p/data.csv"
test_audio_base_path = "fisd-asanti-twi-10p/audios"

# Load datasets
# Each constructor call downloads and extracts its archive into the current working
# directory and writes a cleaned CSV copy there (see AsantiTwiDataset.__init__).
# Both CSVs share the basename data.csv, so both write "cleaned_data.csv"; each
# instance reads the file right after writing it, before the next call overwrites it.
test_dataset = AsantiTwiDataset(test_zip_url, test_csv_filename, test_audio_base_path)
train_dataset = AsantiTwiDataset(train_zip_url, train_csv_filename, train_audio_base_path)



In [None]:
# Read the tab-separated test labels from Drive. header=0 discards the file's own
# header row and `names` supplies the column labels used in this notebook.
testinglabels = "/content/drive/MyDrive/ABENA_Trained/Testlabels.csv"
df = pd.read_csv(testinglabels, delimiter="\t", names=["Index", "path","sentence","translation"], header=0)
# Show the frame with the notebook's rich display.
display(df)

Unnamed: 0,Index,path,sentence,translation
0,31861,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Fa thousand ma me,Give me 1000
1,31862,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Tsɔrɔ phone number wei ma me: 0548992233,Write this phone number for me: 0548992233
2,31863,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Wanma me ntosoɔ,You did not give me bonus
3,31864,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Tsɔrɔ phone number wei ma me: 0278759823,Write this phone number for me: 0278759823
4,31865,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Fa ma ne phone number wei so: 0258934896,Send it to me on this number: 0258934896
5,31866,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Mepaa'kyɛw me number no yɛ 020,Please my number is 020
6,31867,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,MƐ tɔ Telecel credit 20 cedis,I will buy 20 cedis Telecel credit
7,31868,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,MƐ yɛ Tigo momo,I will do Tigo momo
8,31869,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Ɔyɛ Burkina ba,He is a burkinabe
9,31870,lacuna-audios-test/asanti-twi/audios/AsantiTwi...,Nipa yɛ bad,People are bad


# Functions to use BERT Model

In [None]:
import transformers.pipelines.pt_utils
def correct_asr_output(text):
    """Append the fill-mask model's top next-word prediction to ``text``.

    Uses the module-level ``corrector_robako`` fill-mask pipeline, which must
    exist before this function is called. The mask token is taken from that
    pipeline's tokenizer rather than hard-coded, because the Robako model is
    a RoBERTa model whose mask token is ``<mask>``, not ``[MASK]``.

    Args:
        text (str): ASR output (a word or a sentence). Assumed not to contain
            the mask token itself already.

    Returns:
        str: ``text`` followed by the highest-scoring predicted token, with
        surrounding whitespace stripped. (Previously only the first character
        of the result was returned.)
    """
    mask_token = corrector_robako.tokenizer.mask_token
    predictions = corrector_robako(f"{text} {mask_token}")

    # With exactly one mask the pipeline returns a list of candidate dicts,
    # ordered best first. The token string may carry a leading space.
    predicted_word = predictions[0]['token_str'].strip()

    return f"{text} {predicted_word}".strip()


def transcribe_audio(audio_path, model_name="tiny"):
    """Transcribe an audio file with Whisper and return the recognised text.

    The Whisper model is loaded once per ``model_name`` and cached on the
    function object, so repeated calls do not reload the weights.

    Args:
        audio_path (str): Path of the audio file to transcribe.
        model_name (str): Whisper model size passed to ``whisper.load_model``.
            Defaults to "tiny", the model the notebook used originally.

    Returns:
        str: The ``'text'`` field of Whisper's transcription result.
    """
    loaded_models = transcribe_audio.__dict__.setdefault("_loaded_models", {})
    if model_name not in loaded_models:
        loaded_models[model_name] = whisper.load_model(model_name)

    # NOTE(review): decoding is forced to English although the dataset is
    # Asante Twi - confirm this is the intended baseline.
    result = loaded_models[model_name].transcribe(audio_path, language="en", without_timestamps=True)
    return result['text']

In [None]:
# Example usage for selecting a random statement
import random
import os


# Select a random index within the dataset
# No seed is set here, so a different sample is chosen on every run.
random_index = random.randint(0, len(test_dataset) - 1)

# Get the data item at the random index
random_item = test_dataset[random_index]

# Access the audio path and sentence from the random item
audio_path = random_item['audio_path']
sentence = random_item['sentence']

print("Random Audio Path:", audio_path)
print("Random Sentence:", sentence)

import warnings
# The first call already ignores every warning, so the message-specific filter
# below adds nothing on top of it.
# NOTE(review): the recorded output still shows the "Some weights of the model
# checkpoint" message, so it is presumably not emitted through `warnings` - confirm.
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", message="Some weights of the model checkpoint") #Was flooding the output

# Transcribe and correct the random audio
asr_output = transcribe_audio(audio_path)
print("ASR Output:", asr_output)

# Load the BERT Abena model for correction
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

tokenizer_abena = AutoTokenizer.from_pretrained("Ghana-NLP/abena-base-asante-twi-uncased")
corrector_model_abena = AutoModelForMaskedLM.from_pretrained("Ghana-NLP/abena-base-asante-twi-uncased")
# Use the first GPU when one is available; otherwise run on CPU (-1) instead of
# failing on a runtime without an accelerator.
corrector_abena = pipeline(
    "fill-mask",
    model=corrector_model_abena,
    tokenizer=tokenizer_abena,
    device=0 if torch.cuda.is_available() else -1,
)

# Correct ASR output using Abena
# Every space in the ASR text is replaced by " [MASK] ", so a mask is inserted at
# each space (including a leading one). With several masks the pipeline returns one
# list of candidates per mask; the raw nested structure is printed as-is.
corrected_output_abena = corrector_abena(asr_output.replace(" ", " [MASK] "))
print("Corrected Output (Abena):", corrected_output_abena)
print("\n\n\n")

# Load the BERT Robako model for correction
tokenizer_robako = AutoTokenizer.from_pretrained("Ghana-NLP/robako-base-asante-twi-uncased")
corrector_model_robako = AutoModelForMaskedLM.from_pretrained("Ghana-NLP/robako-base-asante-twi-uncased")
# Use the first GPU when one is available; otherwise run on CPU (-1) instead of
# failing on a runtime without an accelerator.
corrector_robako = pipeline(
    task="fill-mask",
    model=corrector_model_robako,
    tokenizer=tokenizer_robako,
    device=0 if torch.cuda.is_available() else -1,
)

# Correct ASR output using Robako
# Earlier attempts (calling correct_asr_output per word, reading 'token_str', and
# using the BERT-style "[MASK]" token) produced errors. The working form appends
# this tokenizer's own mask token to each word and keeps the pipeline's top-ranked
# filled-in 'sequence' for that word.
robako_mask_token = tokenizer_robako.mask_token
corrected_output_robako = [
    corrector_robako(f"{text} {robako_mask_token}")[0]['sequence']
    for text in asr_output.split()
]
corrected_output_robako = " ".join(corrected_output_robako)
print("Corrected Output (Robako):", corrected_output_robako)


Random Audio Path: fisd-asanti-twi-10p/audios/AsantiTwiFm18-NNYuf3Ef-Tmp128-DWSA5F.ogg
Random Sentence: Nsu kɔm de me
ASR Output:  interviewed people with


Some weights of the model checkpoint at Ghana-NLP/abena-base-asante-twi-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Corrected Output (Abena): [[{'score': 0.058371469378471375, 'token': 12888, 'token_str': 'see', 'sequence': '[CLS] see interviewed [MASK] people [MASK] with [SEP]'}, {'score': 0.040831975638866425, 'token': 10114, 'token_str': 'to', 'sequence': '[CLS] to interviewed [MASK] people [MASK] with [SEP]'}, {'score': 0.03652362897992134, 'token': 10132, 'token_str': 'na', 'sequence': '[CLS] na interviewed [MASK] people [MASK] with [SEP]'}, {'score': 0.021701844409108162, 'token': 10428, 'token_str': 'met', 'sequence': '[CLS] met interviewed [MASK] people [MASK] with [SEP]'}, {'score': 0.02074621431529522, 'token': 10135, 'token_str': 'on', 'sequence': '[CLS] on interviewed [MASK] people [MASK] with [SEP]'}], [{'score': 0.10930045694112778, 'token': 10108, 'token_str': 'of', 'sequence': '[CLS] [MASK] interviewed of people [MASK] with [SEP]'}, {'score': 0.07340721040964127, 'token': 10114, 'token_str': 'to', 'sequence': '[CLS] [MASK] interviewed to people [MASK] with [SEP]'}, {'score': 0.042703

Some weights of the model checkpoint at Ghana-NLP/robako-base-asante-twi-uncased were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Corrected Output (Robako): interviewedt people mma with mu


# Using BERT to correct errors

## Method 1: Language Modelling
Tried using next-word prediction to correct the transcription (later figured out that this was not a meaningful way to get things done).


In [None]:
#for language modelling

import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments


# Wrap the training DataFrame (columns include 'path' and 'sentence') as a
# Hugging Face Dataset so it can be mapped and split below.
dataset = Dataset.from_pandas(train_dataset.df)
#dataset = Dataset.from_pandas(test_dataset)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("Ghana-NLP/abena-base-asante-twi-uncased")

# Tokenize the dataset
# The string literal below is an earlier, disabled version of tokenize_function
# (tokenization only, no masking or labels); it is evaluated and discarded.
'''def tokenize_function(examples):
    # Specifying max_length here ensures all sequences are the same length
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=128) # Added max_length'''

def tokenize_function(examples):
    """Tokenize a batch of sentences and apply random masking for MLM training.

    Uses the module-level ``tokenizer``. Every sequence is padded or truncated
    to 128 tokens. About 15% of the non-special tokens are replaced by the
    mask token; ``labels`` holds the original id at masked positions and -100
    (the ignore index) everywhere else.

    The special tokens are identified through the tokenizer's own
    ``pad_token_id`` / ``cls_token_id`` / ``sep_token_id`` instead of the
    hard-coded ids 0/1/2, which do not match every vocabulary.

    Args:
        examples (dict): A batch with a "sentence" entry holding a list of
            strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output with masked ``input_ids`` and an added ``labels``
        tensor of the same shape.
    """
    inputs = tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"]
    labels = input_ids.detach().clone()

    # Positions holding [PAD], [CLS] or [SEP] must never be masked.
    maskable = torch.ones_like(input_ids, dtype=torch.bool)
    for special_id in (tokenizer.pad_token_id, tokenizer.cls_token_id, tokenizer.sep_token_id):
        maskable &= input_ids != special_id

    # Draw one uniform number per token and mask where it falls below 15%.
    mask_arr = (torch.rand(input_ids.shape) < 0.15) & maskable

    # input_ids is the tensor stored in `inputs`, so this edits it in place.
    input_ids[mask_arr] = tokenizer.mask_token_id

    # Only masked positions contribute to the loss.
    labels[~mask_arr] = -100
    inputs["labels"] = labels

    return inputs

# Masking happens inside tokenize_function, which runs once here, so each example
# keeps the same masked positions for every epoch.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Hold out 20% for evaluation; no seed is passed, so the split differs between runs.
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)

# Load the model
model = BertForMaskedLM.from_pretrained("Ghana-NLP/abena-base-asante-twi-uncased")

# Define training arguments
# NOTE(review): newer transformers releases name this argument `eval_strategy`;
# confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train the model
trainer.train()

# Persist the trained weights and the tokenizer files to Drive.
trainer.save_model("/content/drive/MyDrive/ABENA_Trained2")
tokenizer.save_pretrained("/content/drive/MyDrive/ABENA_Trained2")

Map:   0%|          | 0/26332 [00:00<?, ? examples/s]

Some weights of the model checkpoint at Ghana-NLP/abena-base-asante-twi-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss
1,0.3208,0.11083
2,0.0799,0.07243
3,0.0498,0.055899


Epoch,Training Loss,Validation Loss
1,0.3208,0.11083
2,0.0799,0.07243
3,0.0498,0.055899
4,0.0358,0.054562
5,0.0279,0.062772
6,0.0271,0.057901


('/content/drive/MyDrive/ABENA_Trained2/tokenizer_config.json',
 '/content/drive/MyDrive/ABENA_Trained2/special_tokens_map.json',
 '/content/drive/MyDrive/ABENA_Trained2/vocab.txt',
 '/content/drive/MyDrive/ABENA_Trained2/added_tokens.json')

In [None]:
# Save the current trainer's model and the tokenizer to the ABENA_Trained folder,
# which the later cells load from.
trainer.save_model("/content/drive/MyDrive/ABENA_Trained")
tokenizer.save_pretrained("/content/drive/MyDrive/ABENA_Trained")

In [None]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline

# Load your fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/ABENA_Trained"  # Update with your save path

# Load the tokenizer from the configuration file
tokenizer = BertTokenizer.from_pretrained(model_path)

# Load the model; torch_dtype="auto" leaves the dtype choice to from_pretrained
model = BertForMaskedLM.from_pretrained(model_path, torch_dtype="auto")

# Create a fill-mask pipeline
# No `device` is passed; the recorded output reports that the model runs on CPU.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Test with a sentence
test_sentence = "Mɛtumi ayɛ [MASK] million cedis."
predictions = fill_mask(test_sentence)

# Print the predictions
for prediction in predictions:
    print(f"Predicted word: {prediction['token_str']}, Score: {prediction['score']:.4f}")


# Substitute the first (top-ranked) prediction back into the sentence
predicted_word = predictions[0]['token_str']
print(f"Predicted sentence: {test_sentence.replace('[MASK]', predicted_word)}")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted word: credit, Score: 0.9863
Predicted word: transfer, Score: 0.0032
Predicted word: fi, Score: 0.0022
Predicted word: 5, Score: 0.0007
Predicted word: aka, Score: 0.0006
Predicted sentence: Mɛtumi ayɛ credit million cedis.


In [None]:
import pandas as pd
import random

def mask_random_word(sentence):
    """Replace one randomly chosen word of a sentence with "[MASK]".

    The sentence is split on whitespace, so the result is re-joined with
    single spaces. A sentence with no words yields an empty string.

    Args:
        sentence (str): The input sentence.

    Returns:
        str: The sentence with exactly one word replaced by "[MASK]", or an
        empty string if the sentence contains no words.
    """
    tokens = sentence.split()
    if not tokens:
        # Nothing to mask; joining an empty list gives "".
        return " ".join(tokens)

    position = random.randint(0, len(tokens) - 1)
    masked_tokens = tokens[:position] + ["[MASK]"] + tokens[position + 1:]
    return " ".join(masked_tokens)

In [None]:
import pandas as pd # If using a CSV file
from transformers import BertTokenizer, BertForMaskedLM, pipeline

# Load fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/ABENA_Trained"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)


# Read the tab-separated test labels and add a column in which one word of each
# sentence is replaced by [MASK]. The word is chosen at random and no seed is set
# in this cell, so the masked positions change between runs.
testinglabels = "/content/drive/MyDrive/ABENA_Trained2/Testlabels.csv"
df = pd.read_csv(testinglabels, delimiter="\t", names=["Index", "path","sentence","translation"], header=0)
df["masked_sentence"] = df["sentence"].apply(mask_random_word)


# Prediction loop
# For every row, ask the model to fill the mask and print the top prediction next
# to the original sentence for manual comparison (no accuracy is computed).
#for sentence in sentences:
for index, row in df.iterrows():
    # Replace a word with [MASK] for prediction
    masked_sentence = row["masked_sentence"]
    predictions = fill_mask(masked_sentence)
    #masked_sentence = sentence.replace("target_word", "[MASK]")
    #predictions = fill_mask(masked_sentence)
    predicted_word = predictions[0]['token_str']  # Get the top prediction

    # Compare predicted word with actual word
    print(f"Original Sentence: {row['sentence']}")
    print(f"Masked Sentence: {masked_sentence}")
    print(f"Predicted Word: {predicted_word}")
    print(f"Predicted Sentence: {masked_sentence.replace('[MASK]', predicted_word)}")
    print("-" * 20)  # Separator

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Original Sentence: Fa thousand ma me
Masked Sentence: [MASK] thousand ma me
Predicted Word: to
Predicted Sentence: to thousand ma me
--------------------
Original Sentence: Tsɔrɔ phone number wei ma me: 0548992233
Masked Sentence: [MASK] phone number wei ma me: 0548992233
Predicted Word: me
Predicted Sentence: me phone number wei ma me: 0548992233
--------------------
Original Sentence: Wanma me ntosoɔ
Masked Sentence: Wanma me [MASK]
Predicted Word: bi
Predicted Sentence: Wanma me bi
--------------------
Original Sentence: Tsɔrɔ phone number wei ma me: 0278759823
Masked Sentence: Tsɔrɔ phone [MASK] wei ma me: 0278759823
Predicted Word: number
Predicted Sentence: Tsɔrɔ phone number wei ma me: 0278759823
--------------------
Original Sentence: Fa ma ne phone number wei so: 0258934896
Masked Sentence: Fa ma ne [MASK] number wei so: 0258934896
Predicted Word: number
Predicted Sentence: Fa ma ne number number wei so: 0258934896
--------------------
Original Sentence: Mepaa'kyɛw me number n

## Method 2: Text Correction
This is the proper way to correct the Whisper output, but there was insufficient data for it. It uses the error data from the fine-tuning of the Whisper model on the financial inclusion dataset.

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, EncoderDecoderModel, Trainer, TrainingArguments


from transformers import BertTokenizer, BertForMaskedLM, pipeline

# Load your fine-tuned model and tokenizer
# Rebinds the module-level `tokenizer`, `model` and `fill_mask` names to the
# checkpoint saved under ABENA_Trained.
model_path = "/content/drive/MyDrive/ABENA_Trained"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Read the hypotheses/references CSV; header=0 discards the file's own header row
# and `names` relabels the columns.
# NOTE(review): a later cell reads this same file with the column names
# "incorrect", "correct", "incorrect cleaned", "correct cleaned" and rebinds
# `dataset`, so the labels used here look copied from the test-label cell - confirm.
testinglabels = "/content/drive/MyDrive/ABENA_Trained/hypotheses_references.csv"
dataset = pd.read_csv(testinglabels, names=["Index", "path","sentence","translation"], header=0)

In [None]:
!pip install datasets transformers


# Load a pre-trained BERT model for Asante Twi
# This rebinds `tokenizer`, `model` and `fill_mask` to the published base checkpoint,
# so the training cell that follows starts from the base weights, not from the
# checkpoint saved under ABENA_Trained.
model_name = "Ghana-NLP/abena-base-asante-twi-uncased"  # Or "Ghana-NLP/robako-base-asante-twi-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Example test sentence
# Fill the single mask and print the sentence with the top-ranked candidate inserted.
test_sentence = "Mɛtumi ayɛ [MASK] million cedis."
predictions = fill_mask(test_sentence)
print(f"Predicted sentence: {test_sentence.replace('[MASK]', predictions[0]['token_str'])}")



Some weights of the model checkpoint at Ghana-NLP/abena-base-asante-twi-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted sentence: Mɛtumi ayɛ le million cedis.


In [None]:
# Load dataset
# header=0 discards the file's own header row; `names` labels the four columns.
# Only the two "cleaned" columns are used by preprocess_function.
data_path = "/content/drive/MyDrive/ABENA_Trained/hypotheses_references.csv"
df = pd.read_csv(data_path, names=["incorrect", "correct", "incorrect cleaned", "correct cleaned"], header=0)
dataset = Dataset.from_pandas(df)

# Tokenization function
def preprocess_function(examples):
    """Tokenize (incorrect, correct) sentence pairs for correction fine-tuning.

    Uses the module-level ``tokenizer``. The "incorrect cleaned" text becomes
    the model input and the "correct cleaned" text becomes the labels; both
    are padded or truncated to 128 tokens so they align position by position.
    Padded label positions are set to -100 so the loss ignores them instead
    of rewarding the model for predicting padding.

    Args:
        examples (dict): A batch with "incorrect cleaned" and
            "correct cleaned" entries, each a list of strings.

    Returns:
        The tokenized inputs with an added ``labels`` list of token-id lists.
    """
    inputs = tokenizer(examples["incorrect cleaned"], padding="max_length", truncation=True, max_length=128)
    # The same tokenizer encodes the targets; no separate target mode is needed.
    targets = tokenizer(examples["correct cleaned"], padding="max_length", truncation=True, max_length=128)

    # attention_mask is 0 exactly on padding, so use it to blank those labels.
    inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(token_ids, attention)]
        for token_ids, attention in zip(targets["input_ids"], targets["attention_mask"])
    ]
    return inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Split into train, validation, and test sets
# 80% train; the remaining 20% is halved into validation and test (10% each).
# No seed is passed, so the splits differ between runs.
from datasets import DatasetDict # Import DatasetDict here
train_testvalid = tokenized_datasets.train_test_split(test_size=0.2)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
tokenized_datasets = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})


# Training arguments
# Evaluation and checkpointing both happen once per epoch, and the best checkpoint
# is reloaded when training ends.
# NOTE(review): newer transformers releases name `evaluation_strategy` as
# `eval_strategy`; confirm which spelling the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit = 2, # Only last 2 models are saved. Older ones are deleted.
    load_best_model_at_end=True,
    save_strategy="epoch",
    report_to="none",
    logging_steps = 10
)

# Initialize the Trainer
# `model` is whatever the module-level name currently refers to; the 'test' split
# is not used in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

# Train the model
trainer.train()

# Save the fine-tuned model
trainer.save_model("/content/drive/MyDrive/ABENA_FineTuned")
tokenizer.save_pretrained("/content/drive/MyDrive/ABENA_FineTuned")

Map:   0%|          | 0/240 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,13.2598,8.824094
2,7.977,3.075538
3,3.489,1.138112
4,1.3797,0.896338
5,0.8072,0.827055


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


('/content/drive/MyDrive/ABENA_FineTuned/tokenizer_config.json',
 '/content/drive/MyDrive/ABENA_FineTuned/special_tokens_map.json',
 '/content/drive/MyDrive/ABENA_FineTuned/vocab.txt',
 '/content/drive/MyDrive/ABENA_FineTuned/added_tokens.json',
 '/content/drive/MyDrive/ABENA_FineTuned/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

# Load the fine-tuned model
# The tokenizer and model are read from the ABENA_FineTuned folder on Drive.
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/ABENA_FineTuned")
fine_tuned_model = AutoModelForMaskedLM.from_pretrained("/content/drive/MyDrive/ABENA_FineTuned")

# Fill-mask pipeline over the same model; correct_text calls the model and
# tokenizer directly rather than going through this pipeline.
corrector = pipeline("fill-mask", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

def correct_text(text, mask_prob=0.0):
    """Corrects the input text using the fine-tuned model.

    Uses the module-level ``fine_tuned_tokenizer`` and ``fine_tuned_model``.
    The text is tokenized, passed through the model once, and the
    highest-scoring token at every position is decoded back into a string.

    By default the input is fed unmasked, which matches how the correction
    model was fine-tuned (unmasked incorrect text as input, correct text as
    labels) and makes the result deterministic. The previous behaviour of
    randomly masking 15% of the tokens is available with ``mask_prob=0.15``.

    Args:
        text (str): The text to correct.
        mask_prob (float): Probability of replacing each non-special token
            with the mask token before prediction. Defaults to 0.0.

    Returns:
        str: The decoded per-position predictions, special tokens removed.
    """
    encoded = fine_tuned_tokenizer(text, return_tensors="pt", truncation=True)
    input_ids = encoded["input_ids"].clone()  # keep the tokenizer output intact

    if mask_prob > 0:
        # Never mask [CLS] / [SEP]; they frame the sequence.
        maskable = (input_ids != fine_tuned_tokenizer.cls_token_id) & (
            input_ids != fine_tuned_tokenizer.sep_token_id
        )
        chosen = (torch.rand(input_ids.shape) < mask_prob) & maskable
        input_ids[chosen] = fine_tuned_tokenizer.mask_token_id

    # Inference only: skip gradient tracking and run on the model's device.
    model_device = fine_tuned_model.device
    with torch.no_grad():
        outputs = fine_tuned_model(
            input_ids=input_ids.to(model_device),
            attention_mask=encoded["attention_mask"].to(model_device),
        )
    predictions = torch.argmax(outputs.logits, dim=-1)

    # predictions has a batch dimension of 1; decode its only row.
    corrected_text = fine_tuned_tokenizer.decode(predictions[0], skip_special_tokens=True)

    return corrected_text


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Example usage:
# Run one hand-written sentence through correct_text and print input and output
# side by side for manual inspection.
text_to_correct = "Me den de Kwoku"
corrected_text = correct_text(text_to_correct)
print(f"Original Text: {text_to_correct}")
print(f"Corrected Text: {corrected_text}")

Original Text: Me den de Kwoku
Corrected Text: . meɛ denɛ woɛ
