# Installations and Environment Setup

In [None]:
!pip install datasets

In [None]:
!pip install transformers datasets accelerate

In [None]:
!pip install evaluate

In [None]:
!pip install jiwer

In [None]:
# Installing necessary libraries
!pip install git+https://github.com/openai/whisper.git
!pip install jiwer datasets transformers evaluate torch torchaudio  accelerate
!pip install torch --upgrade --extra-index-url https://download.pytorch.org/whl/cu118


In [None]:
# Standard library imports
import os
import zipfile
import requests
import io
import re
import locale
import shutil

# Data handling and processing
import pandas as pd
import numpy as np

# Audio processing
import torchaudio
import torchaudio.transforms as T

# Transformers library
from transformers import (
    PreTrainedTokenizerFast,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    TrainingArguments,
    Trainer
)

# Datasets and utilities
from datasets import Dataset, load_dataset, Audio
from torch.utils.data import Dataset as TorchDataset, DataLoader
from tqdm.notebook import tqdm

# Evaluation and metrics
import evaluate
from evaluate import load
import jiwer
from jiwer import wer
from whisper.normalizers import EnglishTextNormalizer


# IPython display
from IPython.display import Audio, display, HTML

# PyTorch
import torch
import torch.nested


from tokenizers import ByteLevelBPETokenizer
from pathlib import Path


# Loading Twi Bible Dataset

In [None]:
# Load the Asante Twi speech corpus from the Hugging Face Hub.
# Later cells read its "train", "test" and "validation" splits and its "text" / "audio" columns.
ds = load_dataset("kojo-george/asante-twi-tts")

In [None]:
train_file_path = Path("asante_twi_tts_train.txt")
test_file_path = Path("asante_twi_tts_test.txt")
val_file_path = Path("asante_twi_tts_val.txt")

# Extracting text from the dataset and saving to files (one transcript per line).
# * Only the "text" column is read, so the audio of every row is not decoded just
#   to be discarded.
# * UTF-8 is requested explicitly so the files do not depend on the runtime locale
#   (the transcripts contain non-ASCII Twi letters).
for split_name, split_path in (
    ("train", train_file_path),
    ("test", test_file_path),
    ("validation", val_file_path),
):
    with split_path.open("w", encoding="utf-8") as f:
        for text in ds[split_name]["text"]:
            f.write(text + "\n")

train_save_path = '/content/drive/MyDrive/datasets/asante_twi_train.txt'
test_save_path = '/content/drive/MyDrive/datasets/asante_twi_test.txt'
val_save_path = '/content/drive/MyDrive/datasets/asante_twi_val.txt'

# Copy the text files to Google Drive. shutil.copy does not create missing parent
# folders, so the destination folder is created first.
# NOTE(review): Drive must already be mounted at /content/drive, otherwise these
# folders are created on the local disk instead.
for local_path, drive_path in (
    (train_file_path, train_save_path),
    (test_file_path, test_save_path),
    (val_file_path, val_save_path),
):
    os.makedirs(os.path.dirname(drive_path), exist_ok=True)
    shutil.copy(str(local_path), drive_path)

# Training Tokenizer on Dataset and converting to a type compatible with HuggingFace


In [None]:
# Initializing the tokenizer
tokenizer = ByteLevelBPETokenizer()

# Training the tokenizer on the three transcript files written above
tokenizer.train(
    files=[str(train_file_path), str(test_file_path), str(val_file_path)],
    vocab_size=12000, # 12000 because the data itself has just about 28000 rows
    min_frequency=2, # smaller minimum frequency because of the smaller size of data
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

save_path = '/content/drive/MyDrive/tokenizers/asante_twi_jw_tokenizer'
# save_model writes vocab.json and merges.txt into an existing folder, so make
# sure the folder is there on a first run.
# NOTE(review): Drive must already be mounted at /content/drive.
os.makedirs(save_path, exist_ok=True)
tokenizer.save_model(save_path)

In [None]:
# Had a lot of errors using the raw tokenizer, so it is wrapped in a PreTrainedTokenizerFast subclass per recommendation
class CustomTokenizer(PreTrainedTokenizerFast):
    """PreTrainedTokenizerFast wrapper around a Byte-Level BPE tokenizer stored on disk.

    Args:
        tokenizer_file: Despite the name, a *directory* that contains the
            ``vocab.json`` and ``merges.txt`` files of the tokenizer.
        **kwargs: Forwarded unchanged to ``PreTrainedTokenizerFast.__init__``.

    Attributes:
        tokenizer_object: The ``ByteLevelBPETokenizer`` loaded from
            ``tokenizer_file``; kept so the raw model files can be saved again.
    """

    def __init__(self, tokenizer_file, **kwargs):
        vocab_path = os.path.join(tokenizer_file, "vocab.json")
        merges_path = os.path.join(tokenizer_file, "merges.txt")
        bpe_backend = ByteLevelBPETokenizer.from_file(vocab_path, merges_path)

        # Special-token attributes that PreTrainedTokenizerFast needs to know about
        special_tokens = dict(
            bos_token="<s>",
            eos_token="</s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
        )
        super().__init__(tokenizer_object=bpe_backend, **special_tokens, **kwargs)

        # Set after the parent constructor has run, exactly like the stored backend reference it keeps.
        self.tokenizer_object = bpe_backend

# Loading the tokenizer using the custom class
tokenizer_file = "/content/drive/MyDrive/tokenizers/asante_twi_jw_tokenizer"
tokenizer = CustomTokenizer(tokenizer_file)

# Hugging Face format (tokenizer.json + config files)
save_path = '/content/drive/MyDrive/tokenizers/atjt2'
tokenizer.save_pretrained(save_path)

# Raw vocab.json / merges.txt copy; this is the folder CustomTokenizer is rebuilt from later.
newsavepath = Path("/content/drive/MyDrive/tokenizers/atjt2_fixed")
# save_model writes into an existing folder, so create it on a first run.
newsavepath.mkdir(parents=True, exist_ok=True)
tokenizer.tokenizer_object.save_model(str(newsavepath))

# Using functions from finetuning Whisper on the Financial Inclusion Data Only

In [None]:
#Loading Tokenizer
# Rebuilt from the vocab.json / merges.txt files stored in the atjt2_fixed folder.
tokenizer = CustomTokenizer("/content/drive/MyDrive/tokenizers/atjt2_fixed")

# Loading the Whisper processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny", language="ak", task="transcribe") # ak is the language code for Akan
# From here on the processor encodes/decodes text with the custom tokenizer; only the
# feature extractor still comes from the openai/whisper-tiny checkpoint.
# NOTE(review): language= / task= were given to the tokenizer that is replaced on the
# next line — presumably they no longer have any effect; confirm.
processor.tokenizer = tokenizer #setting the tokenizer to my custom tokenizer


In [None]:
def prepare_dataset(batch):
    """Turn one dataset row into Whisper training inputs.

    The function does not resample anything itself: it expects the "audio"
    column to have been cast to the target sampling rate beforehand.

    Args:
        batch: A single row with an "audio" entry (providing "array" and
            "sampling_rate") and a "text" entry holding the transcript.

    Returns:
        The same row with two added keys: "input_features" (the features
        computed by ``processor.feature_extractor``) and "labels" (the token
        ids of the transcript produced by ``processor``).
    """
    audio = batch["audio"]
    # Pass the clip's real sampling rate instead of a hard-coded 16000 so the
    # feature extractor can check it against the rate it expects, rather than
    # silently treating mis-sampled audio as 16 kHz.
    batch["input_features"] = processor.feature_extractor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features[0]
    # encoding target text to label ids
    batch["labels"] = processor(text=batch["text"]).input_ids
    return batch

# `Audio` is imported twice in the import cell: first from `datasets`, then from
# `IPython.display`. The second import wins, so the bare name refers to the
# notebook audio player, which has no `sampling_rate` argument. Import the dataset
# feature under an unambiguous alias for the cast below.
from datasets import Audio as DatasetAudio

# Declare the audio column as 16 kHz (resampling).
# Note that this rebinds `ds`; the preprocessed copy is kept under a separate name
# (`dsnew`) so the un-featurized dataset stays available.
ds = ds.cast_column("audio", DatasetAudio(sampling_rate=16000))
# Compute input_features / labels for every split and drop the original columns
# (the column list is taken from the train split).
dsnew = ds.map(prepare_dataset, remove_columns=ds.column_names["train"], num_proc=4)

In [None]:
def prepare_model_and_dataset(customprocessor):
    """Load the pretrained Whisper model and pick the datasets used for training.

    Args:
        customprocessor: The WhisperProcessor (with the custom tokenizer
            attached) that will be used for training.

    Returns:
        A 4-tuple ``(model, processor, train_dataset, test_dataset)`` where
        ``processor`` is ``customprocessor`` and the datasets are the
        "train" and "test" splits of the preprocessed ``dsnew`` dataset.
    """
    processor = customprocessor
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to(device)
    model.config.forced_decoder_ids = None  # Reseting forced decoder IDs
    model.config.suppress_tokens = []  # Reseting suppressed tokens
    # Use the tokenizer of the processor that was passed in, not the notebook-global
    # `tokenizer`, which later cells rebind to different objects.
    # NOTE(review): only the config value changes here; the embedding / output
    # layers of the checkpoint are not resized — confirm this is what is intended.
    model.config.vocab_size = processor.tokenizer.vocab_size  # Updating vocab size

    # The model needs the rows that prepare_dataset produced (input_features and
    # labels); the raw `ds` splits only hold audio and text.
    train_dataset = dsnew["train"]
    test_dataset = dsnew["test"]

    # Reported here, once the work has actually happened, rather than when the
    # function is merely defined.
    print("Data prepared and model loaded.")
    return model, processor, train_dataset, test_dataset

In [None]:
def prepare_training_args(output_dir='content/drive/MyDrive/model_with_tokenizer'):
    """Build the Seq2SeqTrainingArguments used for the Whisper fine-tuning run.

    Args:
        output_dir: Folder for checkpoints.
            NOTE(review): the default has no leading "/" and is therefore
            relative to the current working directory — confirm that is intended.

    Returns:
        A configured ``Seq2SeqTrainingArguments`` instance.
    """
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,

        # Learning rate scheduling and regularization
        learning_rate=1e-4,  # Small learning rate to prevent rapid overfitting
        weight_decay=0.01,   # L2 regularization to penalize large weights
        lr_scheduler_type="linear",  # Gradual learning rate reduction
        warmup_steps=500,    # Gradual learning rate increase initially

        # Model saving and evaluation
        load_best_model_at_end=True,
        metric_for_best_model="wer",  # Using Word Error Rate for model selection
        # WER is an error rate, so lower is better. This has to be stated
        # explicitly, otherwise best-model selection and early stopping would
        # favour the checkpoint with the highest WER.
        greater_is_better=False,

        gradient_accumulation_steps=2,
        eval_strategy="epoch",  # same spelling as the other TrainingArguments in this notebook
        save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
        predict_with_generate=True,  # evaluate on generated token ids so WER/CER can be computed
        generation_max_length=225,
        # Mixed precision needs a GPU; the model loader falls back to CPU, so do the same here.
        fp16=torch.cuda.is_available(),
        report_to=[]
    )
    return training_args

In [None]:
def data_collator(batch, pad_token_id=None):
    """Collate preprocessed rows into one padded training batch.

    Args:
        batch: Sequence of mappings with "input_features" (2-D, frames on the
            last axis) and "labels" (1-D token ids). Values may be tensors or
            plain nested lists, which is what a ``datasets.Dataset`` yields
            when no tensor format has been set.
        pad_token_id: Id used to pad the labels before masking. Defaults to
            ``processor.tokenizer.pad_token_id``.

    Returns:
        dict with "input_features" (every item padded/truncated to 3000
        frames and stacked) and "labels" (padded to the longest sequence in
        the batch, with every pad position set to -100).
    """
    required_length = 3000  # Fixed length required by Whisper

    if pad_token_id is None:
        pad_token_id = processor.tokenizer.pad_token_id

    # Pad or truncate input features
    fixed_features = []
    for item in batch:
        features = torch.as_tensor(item["input_features"], dtype=torch.float32)
        missing = required_length - features.shape[-1]
        if missing > 0:
            # Pad to the right with zeros
            features = torch.nn.functional.pad(features, (0, missing), mode="constant", value=0)
        fixed_features.append(features[..., :required_length])  # Truncate if longer than 3000
    input_features = torch.stack(fixed_features)

    # Pad labels to the maximum sequence length in the batch
    labels = torch.nn.utils.rnn.pad_sequence(
        [torch.as_tensor(item["labels"], dtype=torch.long) for item in batch],
        batch_first=True,
        padding_value=pad_token_id
    )
    labels[labels == pad_token_id] = -100  # Mask padding tokens for loss calculation

    return {
        "input_features": input_features,  # Padded to required length
        "labels": labels
    }


In [None]:
def compute_metrics(pred):
    """Compute word and character error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 at masked positions).

    Returns:
        dict with "wer" and "cer" as fractions (not percentages).
    """
    pad_id = processor.tokenizer.pad_token_id

    predictions = pred.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a real token id, so it has to be
    # swapped for the pad token before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)

    decoded_preds = processor.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Normalizing text
    # NOTE(review): this is Whisper's English normalizer applied to Twi text —
    # confirm it does not distort the transcripts.
    normalizer = EnglishTextNormalizer()
    decoded_preds_clean = [normalizer(text) for text in decoded_preds]
    decoded_labels_clean = [normalizer(text) for text in decoded_labels]

    # jiwer takes the reference first and the hypothesis second.
    # The locals are deliberately not called `wer` so they do not shadow the imported function.
    wer_score = jiwer.wer(decoded_labels_clean, decoded_preds_clean)
    cer_score = jiwer.cer(decoded_labels_clean, decoded_preds_clean)

    # Print the results (optional)
    print(f"WER: {wer_score * 100:.2f} %")
    print(f"CER: {cer_score * 100:.2f} %")

    return {"wer": wer_score, "cer": cer_score}

In [None]:
def train_whisper_model():
    """Fine-tune Whisper with the custom processor and save the result.

    Returns:
        ``(model, processor)``: the trained model and the processor it was
        trained with.
    """
    # The result is bound to `whisper_processor`, not `processor`: assigning to
    # `processor` here would make that name local to the function and the
    # argument below would fail with UnboundLocalError.
    model, whisper_processor, train_dataset, test_dataset = prepare_model_and_dataset(processor)

    # Freezing every parameter of the base model ...
    for param in model.base_model.parameters():
        param.requires_grad = False

    # ... then unfreezing the last two encoder layers for fine-tuning.
    # NOTE(review): the decoder stays frozen although the tokenizer was replaced — confirm.
    for param in model.base_model.encoder.layers[-2:].parameters():
        param.requires_grad = True

    training_args = prepare_training_args()

    def _features_and_labels(example):
        """Return (input_features, labels) from a dict row or a (features, labels) pair."""
        if isinstance(example, dict):
            return example["input_features"], example["labels"]
        return example[0], example[1]

    # Custom data collator for batching to pad appropriately
    def collate_fn(batch):
        # Filter out None values
        batch = [b for b in batch if b is not None]
        if len(batch) == 0:
            # Handling the case where all items in the batch are None
            return {}

        pairs = [_features_and_labels(b) for b in batch]
        # Rows coming from a datasets.Dataset are plain lists, hence as_tensor.
        input_features = torch.stack(
            [torch.as_tensor(features, dtype=torch.float32) for features, _ in pairs]
        )
        # Pad directly with -100 so padded positions are ignored by the loss.
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.as_tensor(label_ids, dtype=torch.long) for _, label_ids in pairs],
            batch_first=True,
            padding_value=-100
        )
        return {
            "input_features": input_features,
            "labels": labels
        }

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        processing_class=whisper_processor
    )

    # Adding early stopping callback --adding it earlier in the training arguments was causing errors
    trainer.add_callback(EarlyStoppingCallback(early_stopping_patience=3))

    trainer.train()

    # Save model
    # NOTE(review): no leading "/" — the path is relative to the working directory; confirm.
    trainer.save_model('content/drive/MyDrive/asomsem')

    return model, whisper_processor

In [None]:
# Execute training
trained_model, trained_processor = train_whisper_model()
print("Model training completed!")

# Saving the model and tokenizer to Google Drive
# NOTE(review): both paths have no leading "/", so they resolve relative to the
# current working directory — confirm that this actually points at the mounted Drive.
# train_whisper_model() has already called trainer.save_model() on the same model folder.
model_save_path = 'content/drive/MyDrive/asomsem'  # Path for model
tokenizer_save_path = 'content/drive/MyDrive/asomsem_tokenizer'  # Path for tokenizer

trained_model.save_pretrained(model_save_path)  # Save model
trained_processor.save_pretrained(tokenizer_save_path)  # Save the processor returned by training

print(f"Model saved to: {model_save_path}")
print(f"Tokenizer saved to: {tokenizer_save_path}")

# Finetuning on the financial inclusion dataset

In [None]:

import os
import zipfile
import requests
import io
import pandas as pd
import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForMaskedLM, Trainer, TrainingArguments ,DataCollatorForLanguageModeling
from datasets import Dataset
from torch.utils.data import Dataset as TorchDataset

In [None]:
class AsantiTwiDataset(TorchDataset):
    """Torch dataset over a zipped audio + transcript corpus.

    On construction the archive is downloaded and extracted into the current
    directory (unless the CSV is already there), the CSV is cleaned into
    ``cleaned_<name>`` and loaded with pandas.

    Args:
        zip_url: URL of the zip archive.
        csv_filename: Path of the transcript CSV inside the extracted archive.
        audio_base_path: Folder that the (cleaned) audio paths are relative to.
        tokenizer: Object with an ``encode(text, add_special_tokens=False)``
            method. Optional when only ``texts`` is needed, required for
            item access.

    Attributes:
        df: Cleaned CSV with the columns renamed to "path" and "sentence".
        texts: List of the values of the "sentence" column.
    """

    def __init__(self, zip_url, csv_filename, audio_base_path, tokenizer=None):
        self.tokenizer = tokenizer
        # Download and extract dataset. Skipped when the CSV is already on disk so
        # that creating the dataset a second time does not download the archive again.
        if not os.path.exists(csv_filename):
            response = requests.get(zip_url)
            response.raise_for_status()
            with zipfile.ZipFile(io.BytesIO(response.content), 'r') as zip_ref:
                zip_ref.extractall('.')

        # Clean the CSV file
        cleaned_csv_filename = f"cleaned_{os.path.basename(csv_filename)}"
        self._clean_csv(csv_filename, cleaned_csv_filename)

        # Load and preprocess data
        self.df = pd.read_csv(cleaned_csv_filename).rename(
            columns={"Audio Filepath": "path", "Transcription": "sentence"}
        )
        self.texts = self.df["sentence"].tolist()
        self.audio_base_path = audio_base_path

    def _clean_csv(self, input_path, output_path):
        """Write a comma-separated copy of ``input_path`` with bare audio file names.

        Tabs become commas, the two known audio folder prefixes are removed and
        every line whose field count differs from the header's is dropped.
        Both files are handled as UTF-8. An empty input produces an empty output.
        """
        with open(input_path, "r", encoding="utf-8") as infile:
            lines = infile.readlines()

        # Replace tabs with commas and clean paths
        clean_lines = [
            line.replace("\t", ",")
                .replace("lacuna-audios-train/asanti-twi/audios/", "")
                .replace("lacuna-audios-test/asanti-twi/audios/", "")
            for line in lines
        ]

        # Filter rows with the correct number of fields (the header defines the count).
        # Rows whose transcript itself contains a comma are dropped by this filter.
        if clean_lines:
            expected_fields = clean_lines[0].count(",") + 1
            valid_lines = [line for line in clean_lines if line.count(",") + 1 == expected_fields]
        else:
            valid_lines = []

        # Write cleaned content to a new file
        with open(output_path, "w", encoding="utf-8") as outfile:
            outfile.writelines(valid_lines)

    def __len__(self):
        """Number of transcripts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{"input_features": mel spectrogram, "labels": token ids}`` for row ``idx``.

        Raises:
            ValueError: If the dataset was created without a tokenizer.
        """
        # Fail before any audio is loaded, with a message that names the cause.
        if self.tokenizer is None:
            raise ValueError(
                "AsantiTwiDataset was created without a tokenizer; "
                "pass tokenizer=... to build labels."
            )

        text = self.texts[idx]
        audio_path = os.path.join(self.audio_base_path, self.df.iloc[idx]['path'])

        # Load audio and bring it to 16 kHz
        waveform, sr = torchaudio.load(audio_path, normalize=True)
        if sr != 16000:
            resampler = T.Resample(orig_freq=sr, new_freq=16000)
            waveform = resampler(waveform)

        # Feature extraction (Mel Spectrogram with 80 Mel frequency bins)
        # NOTE(review): this is torchaudio's mel spectrogram with its default settings;
        # whether it matches the features a Whisper model expects is not established
        # here — confirm before feeding it to the model.
        mel_spectrogram = T.MelSpectrogram(n_mels=80)(waveform)

        # Remove the channel dimension if it's 1
        mel_spectrogram = mel_spectrogram.squeeze(0)

        # Tokenize text to get labels. Some tokenizers return the ids directly,
        # others an encoding object that carries them in `.ids`.
        encoded = self.tokenizer.encode(text, add_special_tokens=False)
        labels = getattr(encoded, "ids", encoded)

        return {"input_features": mel_spectrogram, "labels": labels}


In [None]:
zip_url = "https://fisd-dataset.s3.amazonaws.com/fisd-asanti-twi-90p.zip"  # Training dataset URL
csv_filename = "fisd-asanti-twi-90p/data.csv"
audio_base_path = "fisd-asanti-twi-90p/audios"
# The tokenizer was saved under the absolute /content/drive/... path, so it has to be
# read back from the same absolute path (the leading "/" was missing).
tokenizer_dir = "/content/drive/MyDrive/tokenizers/asante_twi_jw_tokenizer"
# NOTE(review): these two stay relative because the training cells save the model to
# the same relative location — confirm where the model is really meant to live.
model_dir = "content/drive/MyDrive/asomsem"
output_dir = "content/drive/MyDrive/asomsem_financial"

# Step 1: Prepare tokenizer
# No tokenizer is passed to the dataset here: only its transcripts are used below.
financial_train_dataset = AsantiTwiDataset(zip_url, csv_filename, audio_base_path)
tokenizer = WhisperTokenizer.from_pretrained(tokenizer_dir)

financial_inclusion_texts = financial_train_dataset.texts

In [None]:
from tokenizers import ByteLevelBPETokenizer

def fine_tune_existing_tokenizer(texts, tokenizer_dir):
    """Train the tokenizer stored in ``tokenizer_dir`` on ``texts`` and save it back.

    NOTE(review): ``train_from_iterator`` presumably learns a fresh vocabulary
    from ``texts`` rather than extending the loaded one, which would change the
    token ids — confirm before using the result with a model trained on the
    previous tokenizer. The files in ``tokenizer_dir`` are overwritten.

    Args:
        texts: Iterable of transcripts. Entries that are not non-empty strings
            (for example NaN from an empty CSV cell) are skipped.
        tokenizer_dir: Folder holding ``vocab.json`` and ``merges.txt``.

    Returns:
        The trained ``ByteLevelBPETokenizer``.
    """
    # Load the existing tokenizer from the directory
    tokenizer = ByteLevelBPETokenizer.from_file(
        os.path.join(tokenizer_dir, "vocab.json"),
        os.path.join(tokenizer_dir, "merges.txt")
    )

    # The trainer only accepts strings; pandas yields float NaN for empty cells.
    usable_texts = [text for text in texts if isinstance(text, str) and text.strip()]

    # Train on the new texts
    tokenizer.train_from_iterator(
        usable_texts,
        vocab_size=tokenizer.get_vocab_size(),  # Keep the original vocabulary size
        min_frequency=2,     # Adjust as needed
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    # Save the tokenizer (overwrites vocab.json / merges.txt in tokenizer_dir)
    tokenizer.save_model(tokenizer_dir)

    return tokenizer

#financial_inclusion_texts = financial_train_dataset # Done earlier
# Rebinds the notebook-global `tokenizer` to the raw ByteLevelBPETokenizer returned above.
tokenizer = fine_tune_existing_tokenizer(financial_inclusion_texts, tokenizer_dir)

In [None]:
# Prepare and split dataset
def prepare_and_split_dataset(zip_url, csv_filename, audio_base_path, tokenizer):
    """Tokenize the corpus transcripts and split them 80/20.

    Args:
        zip_url, csv_filename, audio_base_path: Forwarded to ``AsantiTwiDataset``.
        tokenizer: Called as ``tokenizer(list_of_texts, truncation=True,
            padding="max_length", max_length=512)``.
            NOTE(review): that requires a callable tokenizer — confirm the
            object passed in supports being called this way.

    Returns:
        ``(train_dataset, validation_dataset)`` as Hugging Face datasets;
        the split uses seed 42.
    """
    corpus = AsantiTwiDataset(zip_url, csv_filename, audio_base_path, tokenizer=tokenizer)

    # Text-only Hugging Face dataset built from the transcripts
    text_dataset = Dataset.from_dict({"text": corpus.texts})

    def _encode(rows):
        # Every transcript is truncated / padded to exactly 512 tokens
        return tokenizer(rows["text"], truncation=True, padding="max_length", max_length=512)

    encoded_dataset = text_dataset.map(_encode, batched=True, remove_columns=["text"])

    # 20 % of the rows are held out for validation
    splits = encoded_dataset.train_test_split(test_size=0.2, seed=42)
    return splits["train"], splits["test"]

In [None]:
'''from tokenizers import ByteLevelBPETokenizer

def fine_tune_existing_tokenizer(texts, tokenizer_dir):
    # Load the existing tokenizer from the directory
    tokenizer = ByteLevelBPETokenizer.from_file(
        os.path.join(tokenizer_dir, "vocab.json"),
        os.path.join(tokenizer_dir, "merges.txt")
    )

    # Fine-tune the tokenizer on the new texts
    tokenizer.train_from_iterator(
        texts,
        vocab_size=tokenizer.get_vocab_size(),  # Keep the original vocabulary size
        min_frequency=2,     # Adjust as needed
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )

    # Save the fine-tuned tokenizer
    tokenizer.save_model(tokenizer_dir)

    return tokenizer

# Fine-tune your existing tokenizer
tokenizer = fine_tune_existing_tokenizer(all_texts, tokenizer_dir)  # Use your current texts and tokenizer directory'''

# Functions I am not using at the moment

In [None]:
#from datasets import load_dataset, Audio
#from transformers import WhisperProcessor, WhisperForConditionalGeneration, TrainingArguments, Trainer
#In case the above fails because  of the multiprocessing again
# Load the tokenizer

'''from tokenizers import ByteLevelBPETokenizer
tokenizer_vocab_path = "/content/drive/MyDrive/tokenizers/asante_twi_jw_tokenizer/vocab.json"
tokenizer_merges_path = "/content/drive/MyDrive/tokenizers/asante_twi_jw_tokenizer/merges.txt"

def prepare_dataset(batch, tokenizer_vocab_path=tokenizer_vocab_path, tokenizer_merges_path=tokenizer_merges_path):
    try:
        # Re-initialize tokenizer and processor within the function
        tokenizer = ByteLevelBPETokenizer.from_file(tokenizer_vocab_path, tokenizer_merges_path)
        processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="ak", task="transcribe") # ak is the language code for Akan
        processor.tokenizer = tokenizer

        # load and resample audio data from 48 to 16kHz
        audio = batch["audio"]
        # compute log-Mel input features from input audio array
        batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
        # encode target text to label ids
        batch["labels"] = processor.tokenizer.encode(batch["text"]).ids
        return batch
    except Exception as e:
        print(f"Error processing batch: {batch}")  # Print the problematic batch
        print(f"Error: {e}")
        raise e

ds = ds.cast_column("audio", Audio(sampling_rate=16000))
ds = ds.map(prepare_dataset, remove_columns=ds.column_names["train"], num_proc=1, fn_kwargs={'tokenizer_vocab_path': tokenizer_vocab_path, 'tokenizer_merges_path': tokenizer_merges_path})
# Switch back to num_proc=4 once the error is resolved.'''

In [None]:
'''from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, padding="longest", return_tensors="pt")

# Adjust data_collator's __call__ if needed for Whisper:
def custom_data_collator_call(self, features):
    input_features = [{"input_features": feature["input_features"]} for feature in features]
    #label_features = [{"input_ids": feature["labels"]} for feature in features]  # Change 'labels' to 'input_ids'
    input_features = self.tokenizer.pad(input_features, padding=self.padding, return_tensors=self.return_tensors)

    # Process labels (as input_ids)
    labels = [{"input_ids": feature["labels"]} for feature in features]
    labels = self.tokenizer.pad(labels, padding=self.padding, max_length=None, pad_to_multiple_of=None, return_tensors=self.return_tensors)

    # Combine padded features into a batch
    batch = {
        "input_features": input_features["input_features"],
        "labels": labels["input_ids"],
        "attention_mask": labels["attention_mask"]
    }

    return batch

data_collator.__call__ = custom_data_collator_call.__get__(data_collator, DataCollatorWithPadding)'''
'''# The change is here. We pad the label_features with the tokenizer and pass the input_features as is.
    # We also include the attention mask in label_features to make sure it is included in the output.
    label_features = self.tokenizer.pad(label_features, padding=self.padding, return_tensors=self.return_tensors)

    # Combine padded features into a batch
    batch = {
        "input_features": torch.tensor([feature["input_features"] for feature in features]),
        "labels": label_features["input_ids"],
        "attention_mask": label_features["attention_mask"]  # add attention mask from label_features
    }

    return batch

data_collator.__call__ = custom_data_collator_call.__get__(data_collator, DataCollatorWithPadding)'''


In [None]:
# 1. Define Training Arguments
# Step-based configuration for the plain `Trainer` built in the next cell
# (separate from the Seq2SeqTrainingArguments returned by prepare_training_args).
# NOTE(review): output_dir has no leading "/" and is therefore relative to the
# current working directory — confirm.
training_args = TrainingArguments(
    output_dir="content/drive/MyDrive/asomsem",  # Change to your desired output directory
    per_device_train_batch_size=16,  # Adjust batch size based on your resources
    gradient_accumulation_steps=1,  # Increase if you have memory constraints
    gradient_checkpointing=True,  # Enable gradient checkpointing for memory efficiency
    fp16=True,  # Enable mixed precision training if your hardware supports it
    eval_strategy="steps",
    eval_steps=1000,  # Evaluate every 1000 steps
    learning_rate=1e-5,  # Adjust learning rate as needed
    weight_decay=0.0,
    warmup_steps=500,
    save_steps=1000,  # Save checkpoints every 1000 steps
    logging_dir="./logs",  # Directory for storing logs
    num_train_epochs=3, # Adjust the number of training epochs
    push_to_hub=False, # Set to True if you want to push your model to Hugging Face Hub
    report_to="none"
)

In [None]:
# 4. Define Trainer and Train
# NOTE(review): `model` is not assigned at notebook level anywhere in the visible
# cells (it only exists as a local inside functions), so this cell presumably
# fails on a fresh kernel — confirm which model is meant here.
# `data_collator` is the function defined earlier in the notebook; the
# DataCollatorWithPadding variant only exists inside a string literal and is never executed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dsnew["train"],  # preprocessed splits produced by ds.map(prepare_dataset, ...)
    eval_dataset=dsnew["validation"],
    data_collator=data_collator
    #report_to = "none"
)

trainer.train()

# 5. Save the Model
trainer.save_model("content/drive/MyDrive/asomsem")  # Save the trained model
