## mounting google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## clean cuda

In [2]:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()
torch.cuda.empty_cache()


## install

In [3]:
!pip install jiwer
!pip install openpyxl

Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading jiwer-3.1.0-py3-none-any.whl (22 kB)
Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.1.0 rapidfuzz-3.12.2


## imports

In [4]:
import pandas as pd
from transformers import MT5Tokenizer, MT5ForConditionalGeneration
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset,DataLoader
from torch.optim import Adam
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
from torch.nn.functional import pad as pad_tokens

## determining environment


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Define paths

In [6]:
FILE_PATH='/content/drive/MyDrive/ASR_English_Trainining_Data.xlsx'

## Reading Dataset

In [7]:
df = pd.read_excel(FILE_PATH)
print(df.columns)
df

Index(['ASR Output', 'Correct Sentence'], dtype='object')


Unnamed: 0,ASR Output,Correct Sentence
0,professor floyd pointed out that mass lesions ...,Professor Floyd pointed out that mass lesions...
1,the norsemen considered the rainbow as a bridg...,The Norsemen considered the rainbow as a brid...
2,she can scoop these things into three red bags...,She can scoop these things into three red bag...
3,ilay is one of five whisky distilling localities,Islay is one of five whisky distilling locali...
4,messangers from molly and molly,Messages from Molly and Mollie
...,...,...
2092,the annual bulletin of the comparative law bur...,The Annual Bulletin of the Comparative Law Bu...
2093,flights from vaclave havell eirport prague to ...,Flights from Václav Havel Airport Prague to O...
2094,the wide yellow band is wide since red and gre...,The wide yellow band is wide since red and gr...
2095,are you ready to find out who'll be the musica...,Are you ready to find out who will be the mus...


## temporary splitting dataset for try out with 50 rows

In [8]:
# df = df[0:50]
df

Unnamed: 0,ASR Output,Correct Sentence
0,professor floyd pointed out that mass lesions ...,Professor Floyd pointed out that mass lesions...
1,the norsemen considered the rainbow as a bridg...,The Norsemen considered the rainbow as a brid...
2,she can scoop these things into three red bags...,She can scoop these things into three red bag...
3,ilay is one of five whisky distilling localities,Islay is one of five whisky distilling locali...
4,messangers from molly and molly,Messages from Molly and Mollie
...,...,...
2092,the annual bulletin of the comparative law bur...,The Annual Bulletin of the Comparative Law Bu...
2093,flights from vaclave havell eirport prague to ...,Flights from Václav Havel Airport Prague to O...
2094,the wide yellow band is wide since red and gre...,The wide yellow band is wide since red and gr...
2095,are you ready to find out who'll be the musica...,Are you ready to find out who will be the mus...


## Seperating input output

In [9]:
X,y = df['ASR Output'],df['Correct Sentence']
X=X.tolist()
y=y.tolist()
print("X",X)
print("y",y)



### Preprocessing data
* After above tryout, found that some white spacing and all issues, so we need to trim the data first


In [10]:
# trim each element of X and y
X = [sentence.lower().strip() for sentence in X]
y = [sentence.lower().strip() for sentence in y]
print("X",X)
print("y",y)



### Train & Test split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Load model & tokenizer (mbart)

In [12]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
print(model.to(device))
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")

tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "en_XX"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

### tokenizing some sentences to analyze what is the ouptut and how tokenized input output looks like

In [13]:
sentence = X[0]
print("Sentence : ",sentence)
tokenized_sentence = tokenizer(sentence)
print("Tokenized Sentence : ",tokenized_sentence)

Sentence :  professor floyd pointed out that mass lesions are a possible cause for epileptic siigures
Tokenized Sentence :  {'input_ids': [250004, 16030, 21917, 11073, 6275, 297, 1810, 450, 46889, 199, 17514, 621, 10, 7722, 22304, 100, 25277, 133, 112569, 78, 872, 30891, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


### Tokenize Data

In [14]:
def calculate_max_token_length(X, y, tokenizer):
    # Tokenize all input and target sentences without padding or truncation
    input_lengths = [len(tokenizer.encode(sentence)) for sentence in X]
    target_lengths = [len(tokenizer.encode(sentence)) for sentence in y]

    # Find the maximum length
    max_input_length = max(input_lengths)
    max_target_length = max(target_lengths)

    # Use the larger of the two as the max token length
    max_token_length = max(max_input_length, max_target_length)

    return max_token_length

# Calculate max token length
max_token_length = calculate_max_token_length(X_train + X_test, y_train + y_test, tokenizer)
print(f"Max token length: {max_token_length}")

Max token length: 51


In [15]:
def tokenize_data(X, y, tokenizer, max_length=max_token_length):
    X = [f"{sentence}" for sentence in X]
    inputs = tokenizer(X, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    targets = tokenizer(y, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    return inputs['input_ids'], targets['input_ids']

# Tokenize training and testing data
X_train_tokenized, y_train_tokenized = tokenize_data(X_train, y_train, tokenizer)
X_test_tokenized, y_test_tokenized = tokenize_data(X_test, y_test, tokenizer)

In [16]:
print(X_train[0])
print(y_train[0])
print(X_train_tokenized[0])
print(y_train_tokenized[0])
print(X_test_tokenized[0])
print(y_test_tokenized[0])

such transformations of men into wolves in pagan colt were associated with the devil from the early mediaeval perspective
such transformations of men into wolves in pagan cult were associated with the devil from the early medieval perspective
tensor([250004,   6044, 167201,      7,    111,    453,   3934,  13924,   3132,
            23,   2070,     66,   3365,     18,   3542, 137272,    678,     70,
             8,   5115,   1295,     70,  39395,   2450,     13,   1405,  80280,
             2,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1])
tensor([250004,   6044, 167201,      7,    111,    453,   3934,  13924,   3132,
            23,   2070,     66,  58984,   3542, 137272,    678,     70,      8,
          5115,   1295,     70,  39395,  92264,  80280,      2,      1,      1,
             1,      1,      1,      1,      1,      1,     

## Loading model

In [17]:
import os


In [18]:
from transformers import MBartForConditionalGeneration
import torch

# Load the model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
model = model.to(device)

print("Model and tokenizer loaded successfully!")

Model and tokenizer loaded successfully!


In [19]:
def correct_asr(text, model, tokenizer, device):
    input_text = f"{text}"  #prefix for T5 to understand the task
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    print(inputs)

    # Generate corrected text
    with torch.no_grad():
        generated_ids = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)

    # Decode the generated text
    corrected_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return corrected_text.lower()

# Example inputs
input_texts = [
    "in this how far away is the moon",
    "happy mayday"
]

# Perform inference on example inputs
for text in input_texts:
    corrected_text = correct_asr(text, model, tokenizer, device)
    print(f"Input: {text}")
    print(f"Corrected: {corrected_text}")
    print("-" * 50)

{'input_ids': tensor([[250004,     23,    903,   3642,   2060,  16065,     83,     70,      6,
          59533,      2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
Input: in this how far away is the moon
Corrected: in this how far away is the moon
--------------------------------------------------
{'input_ids': tensor([[250004,  17723,   1543,   5636,      2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1]], device='cuda:0')}
Input: happy mayday
Corrected: happy mayday - happy mayday
--------------------------------------------------


## Dataset

In [20]:
class ASRDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return (self.inputs[idx]), (self.targets[idx])

train_dataset = ASRDataset(X_train_tokenized,y_train_tokenized)
test_dataset = ASRDataset(X_test_tokenized,y_test_tokenized)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)

print("ith item of dataset : ",train_dataset[3])
print("length of dataset : ",len(train_dataset))


ith item of dataset :  (tensor([250004,   9742,    164,    621,  25225,    214,  63335,   1831,     23,
          2446, 147643,  80583,      7,    756,   6602,  68807,      2,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1]), tensor([250004, 135474,      7,    621,  25225,    214,  63335,   1831,     23,
          2446, 147643,  80583,      7,    756,   6602,  10932,      2,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1]))
length of dataset :  1677


In [21]:
num_epochs = 3
learning_rate = 5e-5

# Optimizer
optimizer = Adam(model.parameters(), lr=learning_rate)

# Loss function
# criterion = nn.MSELoss()
criterion = nn.CrossEntropyLoss()
# import torch.nn.functional as F
# criterion = F.mse_loss

optimizer
criterion

CrossEntropyLoss()

### code to get accuracy on test data

In [22]:
!pip install jiwer
import numpy as np
from jiwer import wer, cer

def train(model, dataloader, optimizer, device, tokenizer):
    model.train()
    total_loss = 0
    total_tokens = 0
    matched_tokens = 0
    total_wer = 0
    total_cer = 0
    total_sentences = 0
    incorrect_sentences = 0

    for batch_idx, batch in enumerate(dataloader):
        optimizer.zero_grad()

        input_ids = batch[0].to(device)
        attention_mask = (input_ids != tokenizer.pad_token_id).int().to(device)
        labels = batch[1].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Generate model's output
        with torch.no_grad():
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_token_length)

        # Decode input, target, and generated output
        input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
        target_text = tokenizer.decode(labels[0], skip_special_tokens=True)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Convert target and generated text to lowercase
        target_text = target_text.lower()
        generated_text = generated_text.lower()

        # Print tokenized input and output for debugging
        # if batch_idx % 10 == 0:
            # print(f"Tokenized Input IDs: {input_ids[0]}")
            # print(f"Tokenized Target IDs: {labels[0]}")
            # print(f"Generated Token IDs: {generated_ids[0]}")

        # Calculate token-level accuracy
        predicted_tokens = generated_ids[0]
        expected_tokens = labels[0]

        # Find the minimum length of the two tensors
        min_length = min(len(predicted_tokens), len(expected_tokens))

        # Truncate both tensors to the minimum length
        predicted_tokens = predicted_tokens[:min_length]
        expected_tokens = expected_tokens[:min_length]

        curr_total_tokens = len(expected_tokens)
        curr_matched_tokens = (predicted_tokens == expected_tokens).sum().item()

        total_tokens += curr_total_tokens
        matched_tokens += curr_matched_tokens

        # Calculate WER and CER
        curr_wer = wer(target_text, generated_text)
        curr_cer = cer(target_text, generated_text)

        total_wer += curr_wer
        total_cer += curr_cer

        # Calculate SER
        if generated_text != target_text:
            incorrect_sentences += 1
        total_sentences += 1

        # Print input, target, and generated output every few batches (e.g., every 10 batches)
        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}:")
            print(f"Input: {input_text}")
            print(f"Target: {target_text}")
            print(f"Generated: {generated_text}")
            print(f"WER: {curr_wer:.4f}, CER: {curr_cer:.4f}")
            print("-" * 50)

    # Calculate overall metrics
    accuracy = (matched_tokens / total_tokens) * 100.0
    avg_wer = (total_wer / total_sentences) * 100.0
    avg_cer = (total_cer / total_sentences) * 100.0
    ser = (incorrect_sentences / total_sentences) * 100.0

    print(f"Training Metrics:")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average WER: {avg_wer:.2f}%")
    print(f"Average CER: {avg_cer:.2f}%")
    print(f"SER: {ser:.2f}%")
    print("=" * 50)

    return total_loss / len(dataloader)

# Training for a few epochs
for epoch in range(3):
    train_loss = train(model, train_loader, optimizer, device, tokenizer)
    print(f"Epoch {epoch + 1}, Loss: {train_loss}")

Batch 0:
Input: there are two cities with nonstock flights from port mauseby
Target: there are two cities with nonstop flights from port moresby
Generated: there are two cities with nonstock flights from port mause
WER: 0.2000, CER: 0.1186
--------------------------------------------------
Batch 10:
Input: mandella chose sabatage because it was the least harmful action did not involve killing and offered the best hope for racial reconciliation afterwards
Target: mandela chose sabotage because it was the least harmful action did not involve killing and offered the best hope for racial reconciliation afterwards
Generated: the best hope for racial reconciliation afterwards
WER: 0.6957, CER: 0.6644
--------------------------------------------------
Batch 20:
Input: others have tried to explain the phenomenon physically
Target: others have tried to explain the phenomenon physically
Generated: bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə bölmə böl

## Save Model

In [23]:
import os
import torch

# Define the directory to save the model
save_directory = "/content/drive/MyDrive/ASR_T5_Model"

# Create the directory if it doesn't exist
os.makedirs(save_directory, exist_ok=True)

model_path = os.path.join(save_directory, "mbart_asr_model_english.pth")
torch.save(model.state_dict(), model_path)

# Save the tokenizer
tokenizer_path = os.path.join(save_directory, "mbart_asr_model_english")
tokenizer.save_pretrained(tokenizer_path)

print(f"Model saved  {model_path}")
print(f"Tokenizer saved at {tokenizer_path}")


Model saved  /content/drive/MyDrive/ASR_T5_Model/mbart_asr_model_english.pth
Tokenizer saved at /content/drive/MyDrive/ASR_T5_Model/mbart_asr_model_english


## Trained on Custom Dataset:

In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torch.nn as nn
from jiwer import wer, cer

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Define paths
MODEL_PATH = "/content/drive/MyDrive/ASR_T5_Model/mbart_asr_model_hindi.pth"
TOKENIZER_PATH = "/content/drive/MyDrive/ASR_T5_Model/mbart_asr_model_hindi"
NEW_DATASET_PATH = "/content/drive/MyDrive/datasets/asr_correction_dataset_2_cleaned.csv"

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the saved model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_PATH)

model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")  # Use the base model or your custom model
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

model.to(device)
model.eval()

# Load the new dataset
df = pd.read_csv(NEW_DATASET_PATH)
print(df.head())

# Preprocess the dataset
X = df['predicted_transcript'].tolist()
y = df['actual_sentence'].tolist()

# Trim and lowercase the data
X = [sentence.lower().strip() for sentence in X]
y = [sentence.lower().strip() for sentence in y]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the data
def calculate_max_token_length(X, y, tokenizer):
    input_lengths = [len(tokenizer.encode(sentence)) for sentence in X]
    target_lengths = [len(tokenizer.encode(sentence)) for sentence in y]
    max_token_length = max(max(input_lengths), max(target_lengths))
    return max_token_length

max_token_length = calculate_max_token_length(X_train + X_test, y_train + y_test, tokenizer)
print(f"Max token length: {max_token_length}")

def tokenize_data(X, y, tokenizer, max_length=max_token_length):
    inputs = tokenizer(X, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    targets = tokenizer(y, padding="max_length", max_length=max_length, truncation=True, return_tensors="pt")
    return inputs['input_ids'], targets['input_ids']

X_train_tokenized, y_train_tokenized = tokenize_data(X_train, y_train, tokenizer)
X_test_tokenized, y_test_tokenized = tokenize_data(X_test, y_test, tokenizer)

# Dataset class
class ASRDataset(Dataset):
    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

train_dataset = ASRDataset(X_train_tokenized, y_train_tokenized)
test_dataset = ASRDataset(X_test_tokenized, y_test_tokenized)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

def train(model, dataloader, optimizer, device, tokenizer):
    model.train()
    total_loss = 0
    total_tokens = 0
    matched_tokens = 0
    total_wer = 0
    total_cer = 0
    total_sentences = 0
    incorrect_sentences = 0

    for batch_idx, batch in enumerate(dataloader):
        optimizer.zero_grad()

        input_ids = batch[0].to(device)
        attention_mask = (input_ids != tokenizer.pad_token_id).int().to(device)
        labels = batch[1].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Generate model's output
        with torch.no_grad():
            generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_token_length)

        # Decode input, target, and generated output
        input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
        target_text = tokenizer.decode(labels[0], skip_special_tokens=True)
        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        # Convert target and generated text to lowercase
        target_text = target_text.lower()
        generated_text = generated_text.lower()

        # Print tokenized input and output for debugging
        if batch_idx % 10 == 0:
            print(f"Tokenized Input IDs: {input_ids[0]}")
            print(f"Tokenized Target IDs: {labels[0]}")
            print(f"Generated Token IDs: {generated_ids[0]}")

        # Calculate token-level accuracy
        predicted_tokens = generated_ids[0]
        expected_tokens = labels[0]

        # Find the minimum length of the two tensors
        min_length = min(len(predicted_tokens), len(expected_tokens))

        # Truncate both tensors to the minimum length
        predicted_tokens = predicted_tokens[:min_length]
        expected_tokens = expected_tokens[:min_length]

        curr_total_tokens = len(expected_tokens)
        curr_matched_tokens = (predicted_tokens == expected_tokens).sum().item()

        total_tokens += curr_total_tokens
        matched_tokens += curr_matched_tokens

        # Calculate WER and CER
        curr_wer = wer(target_text, generated_text)
        curr_cer = cer(target_text, generated_text)

        total_wer += curr_wer
        total_cer += curr_cer

        # Calculate SER
        if generated_text != target_text:
            incorrect_sentences += 1
        total_sentences += 1

        # Print input, target, and generated output every few batches (e.g., every 10 batches)
        if batch_idx % 10 == 0:
            print(f"Batch {batch_idx}:")
            print(f"Input: {input_text}")
            print(f"Target: {target_text}")
            print(f"Generated: {generated_text}")
            print(f"WER: {curr_wer:.4f}, CER: {curr_cer:.4f}")
            print("-" * 50)

    # Calculate overall metrics
    accuracy = (matched_tokens / total_tokens) * 100.0
    avg_wer = (total_wer / total_sentences) * 100.0
    avg_cer = (total_cer / total_sentences) * 100.0
    ser = (incorrect_sentences / total_sentences) * 100.0

    print(f"Training Metrics:")
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average WER: {avg_wer:.2f}%")
    print(f"Average CER: {avg_cer:.2f}%")
    print(f"SER: {ser:.2f}%")
    print("=" * 50)

    return total_loss / len(dataloader)

# Training for a few epochs
for epoch in range(3):
    train_loss = train(model, train_loader, optimizer, device, tokenizer)
    print(f"Epoch {epoch + 1}, Loss: {train_loss}")

# Save the fine-tuned model
fine_tuned_model_path = "/content/drive/MyDrive/ASR_T5_Model/flan_t5_asr_model_fine_tuned.pth"
torch.save(model.state_dict(), fine_tuned_model_path)
print(f"Fine-tuned model saved to {fine_tuned_model_path}")

# Evaluate Code

In [24]:
input_file_path = "/content/drive/MyDrive/ASR_EVAL_DATA/english_eval.txt"

EVAL_FILE_PATH = input_file_path
f = open(EVAL_FILE_PATH, "r")
eval_content = f.read()
X_eval = eval_content.split('\n')
print(len(X_eval))
print(max([len(x) for x in X_eval]))


1001
370


In [26]:
def split_sentence(sentence, max_words):
  splitted_sentences = []
  for i in range(0, len(sentence.split()), max_words):
    curr_sentence = ' '.join(sentence.split()[i:i+max_words])
    splitted_sentences.append(curr_sentence)
  return splitted_sentences


In [27]:
X_eval_splitted = []
for sentence in X_eval:
  X_eval_splitted.append(split_sentence(sentence,10))
X_eval_splitted

[["OR FOR ELON WHO'S BEN TALKING ABOUT OPTIMISTS THE ROBOT",
  'POTENTIALY THE INTELIGENCE OF THE ROBOT AND THEN YOU HAV'],
 ['WHAT CAN WE SAY ABOUT OPEN WEIGHTS TO HELP US',
  'UNDERSTAND WHAT THE WEIGHTS ARE ABLE TO DO YES IN',
  "TERMS OF STEALING PEOPLE'S DATA YES SO THESE WEIGHTS THAT",
  'YOU CAN DOWNWARD FROM HUGING FACE OR OTHER PLATFORMS ARE'],
 ['SHIPS AS YOU HAVE MORE SPECIALIZATION REQUIRED AND THE COST',
  'OF FABS CONTINUES TO GROW YOU NED SOMEONE WHO IS',
  'LAZER FOCUSED ON BUILDING THE BEST PROCES TECHNOLOGY AND MAKING',
  'IT AS FLEXIBLE AS POSIBLE I THINK HE COUN SAY',
  'IT SIMPLY IT IS THE COST PER FAB GOES'],
 ['STICK INTEMOUSLY FOR WINDOWS'],
 ['I THINK A LOT OF THE A I INDUSTRYS GOING',
  'THROUGH THIS CHALENGE OF COMUNICATIONS RIGHT NOW WHERE OPEN A',
  'I MAKES FUN OF THEIR OWN NAMING SCHEMES THEY HAVE',
  'G P T FORO THEY'],
 ["VEN HUNDRED WATS AND THAT'S JUST PERG P AND THE",
  "THERE'S AL THE ASOCIATED STUF AROUND IT SO THAS ONCE",
  "YOU COUNT AL THAT IT'S

In [28]:
def infer(model, sentence):
  tokenized_sentence = tokenizer(sentence, return_tensors="pt")
  input_ids=(tokenized_sentence['input_ids']).to(device)
  attention_mask=(tokenized_sentence['attention_mask']).to(device)

  output = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_length=512,
      num_beams=5,
      early_stopping=True,
  )

  # Decode the output tokens to text
  decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
  return decoded_output

In [29]:
def pre(sentence):
  return sentence.lower().strip()
def post(sentence):
  return sentence.upper()

predicted_outputs = []
index = 0
for sentence_batch in X_eval_splitted:
  index += 1
  curr_batch = []
  for sentence in sentence_batch:
    output_text = infer(model,pre(sentence))
    print("   ",output_text)
    cleaned_text = output_text.replace("<|endoftext|>", "")
    cleaned_text = post(cleaned_text)
    curr_batch.append(cleaned_text)
  predicted_output = ' '.join(curr_batch)
  predicted_outputs.append(predicted_output)
  print(index, " / ", len(X_eval_splitted))
  # print(predicted_output)

predicted_outputs

    or for elyon who's ben talking about optimists the robot
    potentially the brain is powered
1  /  1001
    what can we say about open weights to help us
    understand what the weights are able to do it
    there are terms of steal people's data yes so these weights that
    you can down from huging face or other platforms are
2  /  1001
    ships are more specialization required and the cost
    of fabs continues to grow you as a zombie
    lather focus on building the best process technology and making
    it is flexible as posible i think he coun say
    it is the cost per hour
3  /  1001
    stick intemously for windows
4  /  1001
    i think a lot of the awe industry is going
    through this troubleshoots of communications right now where open a
    i makes fun of their own naming schemes they have
    gp foru they
5  /  1001
    set out hundred wats and that's just perg app and the
    there's al the fourth around it so a sign
    you count al that it is roughly like twelv

["OR FOR ELYON WHO'S BEN TALKING ABOUT OPTIMISTS THE ROBOT POTENTIALLY THE BRAIN IS POWERED",
 "WHAT CAN WE SAY ABOUT OPEN WEIGHTS TO HELP US UNDERSTAND WHAT THE WEIGHTS ARE ABLE TO DO IT THERE ARE TERMS OF STEAL PEOPLE'S DATA YES SO THESE WEIGHTS THAT YOU CAN DOWN FROM HUGING FACE OR OTHER PLATFORMS ARE",
 'SHIPS ARE MORE SPECIALIZATION REQUIRED AND THE COST OF FABS CONTINUES TO GROW YOU AS A ZOMBIE LATHER FOCUS ON BUILDING THE BEST PROCESS TECHNOLOGY AND MAKING IT IS FLEXIBLE AS POSIBLE I THINK HE COUN SAY IT IS THE COST PER HOUR',
 'STICK INTEMOUSLY FOR WINDOWS',
 'I THINK A LOT OF THE AWE INDUSTRY IS GOING THROUGH THIS TROUBLESHOOTS OF COMMUNICATIONS RIGHT NOW WHERE OPEN A I MAKES FUN OF THEIR OWN NAMING SCHEMES THEY HAVE GP FORU THEY',
 "SET OUT HUNDRED WATS AND THAT'S JUST PERG APP AND THE THERE'S AL THE FOURTH AROUND IT SO A SIGN YOU COUNT AL THAT IT IS ROUGHLY LIKE TWELVE HUNDRED TO FOURTEEN HUNDRED",
 "WREKING YOU KNOW A LOT OF LIKE HAAS NETWORK GREAT YOU KNOW NOTHING ABOUT NA

In [30]:
# join prediected_outputs with '\n'
print(predicted_outputs)
english_out = '\n'.join(predicted_outputs)
english_out

["OR FOR ELYON WHO'S BEN TALKING ABOUT OPTIMISTS THE ROBOT POTENTIALLY THE BRAIN IS POWERED", "WHAT CAN WE SAY ABOUT OPEN WEIGHTS TO HELP US UNDERSTAND WHAT THE WEIGHTS ARE ABLE TO DO IT THERE ARE TERMS OF STEAL PEOPLE'S DATA YES SO THESE WEIGHTS THAT YOU CAN DOWN FROM HUGING FACE OR OTHER PLATFORMS ARE", 'SHIPS ARE MORE SPECIALIZATION REQUIRED AND THE COST OF FABS CONTINUES TO GROW YOU AS A ZOMBIE LATHER FOCUS ON BUILDING THE BEST PROCESS TECHNOLOGY AND MAKING IT IS FLEXIBLE AS POSIBLE I THINK HE COUN SAY IT IS THE COST PER HOUR', 'STICK INTEMOUSLY FOR WINDOWS', 'I THINK A LOT OF THE AWE INDUSTRY IS GOING THROUGH THIS TROUBLESHOOTS OF COMMUNICATIONS RIGHT NOW WHERE OPEN A I MAKES FUN OF THEIR OWN NAMING SCHEMES THEY HAVE GP FORU THEY', "SET OUT HUNDRED WATS AND THAT'S JUST PERG APP AND THE THERE'S AL THE FOURTH AROUND IT SO A SIGN YOU COUNT AL THAT IT IS ROUGHLY LIKE TWELVE HUNDRED TO FOURTEEN HUNDRED", "WREKING YOU KNOW A LOT OF LIKE HAAS NETWORK GREAT YOU KNOW NOTHING ABOUT NAE NETW

"OR FOR ELYON WHO'S BEN TALKING ABOUT OPTIMISTS THE ROBOT POTENTIALLY THE BRAIN IS POWERED\nWHAT CAN WE SAY ABOUT OPEN WEIGHTS TO HELP US UNDERSTAND WHAT THE WEIGHTS ARE ABLE TO DO IT THERE ARE TERMS OF STEAL PEOPLE'S DATA YES SO THESE WEIGHTS THAT YOU CAN DOWN FROM HUGING FACE OR OTHER PLATFORMS ARE\nSHIPS ARE MORE SPECIALIZATION REQUIRED AND THE COST OF FABS CONTINUES TO GROW YOU AS A ZOMBIE LATHER FOCUS ON BUILDING THE BEST PROCESS TECHNOLOGY AND MAKING IT IS FLEXIBLE AS POSIBLE I THINK HE COUN SAY IT IS THE COST PER HOUR\nSTICK INTEMOUSLY FOR WINDOWS\nI THINK A LOT OF THE AWE INDUSTRY IS GOING THROUGH THIS TROUBLESHOOTS OF COMMUNICATIONS RIGHT NOW WHERE OPEN A I MAKES FUN OF THEIR OWN NAMING SCHEMES THEY HAVE GP FORU THEY\nSET OUT HUNDRED WATS AND THAT'S JUST PERG APP AND THE THERE'S AL THE FOURTH AROUND IT SO A SIGN YOU COUNT AL THAT IT IS ROUGHLY LIKE TWELVE HUNDRED TO FOURTEEN HUNDRED\nWREKING YOU KNOW A LOT OF LIKE HAAS NETWORK GREAT YOU KNOW NOTHING ABOUT NAE NETWORKS MENTIONI

In [34]:
OUT_FILE_PATH = '/content/drive/MyDrive/ASR_EVAL_DATA/english_mbart.txt'
f = open(OUT_FILE_PATH, "w")
f.write(english_out)
f.close()