## Adding the dependencies

In [None]:
!pip install datasets
!pip install evaluate
!pip install jiwer

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/542.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m419.8/542.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import time
%matplotlib inline

For creating the dataset used in training the Whisper model, we took reference from the Hugging Face article at https://huggingface.co/blog/fine-tune-whisper. We would like to extend our gratitude to Sanchit Gandhi for this amazing article, as it was immensely helpful in creating the dataset for the model.

## Importing WhisperFeatureExtractor and WhisperTokenizer from the whisper-base model

WhisperFeatureExtractor returns a tensor of shape (80, 3000) for audio samples.

WhisperTokenizer tokenizes the prompt text; it is based on the byte-pair encoding (BPE) tokenizer used in GPT-2.

In [None]:
## Load the pretrained feature extractor and tokenizer
from transformers import WhisperFeatureExtractor, WhisperTokenizer

# Turns raw audio arrays into the input features the model consumes.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")

# Tokenizer configured for English transcription targets.
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-base",
    language="English",
    task="transcribe",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Mounting Google Drive (run this cell only on Google Colab)

In [None]:
# Colab-only: expose Google Drive under /content/drive so that the
# /content/drive/MyDrive/... dataset paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading the Whisper Processor

In [None]:
## Load the combined WhisperProcessor
from transformers import WhisperProcessor

# Same checkpoint and language/task settings as the tokenizer loaded earlier.
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-base",
    language='English',
    task="transcribe",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Loading the training data into RAM

In [None]:
# Root folder (on the mounted Drive) that holds every generated dataset artifact.
data_created_dir = "/content/drive/MyDrive/Dataset/created_data/"

# Alternative inputs, left disabled:
# data = pd.read_csv(data_created_dir + "dataset_common_final.csv")
# audio = np.load(data_created_dir + "samples_array_final.npz")

# Build the path from data_created_dir so relocating the dataset means editing one line.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you created yourself.
data = pd.read_pickle(data_created_dir + "data_cleaned.pkl")

In [None]:
# Preview the first rows to check the loaded columns.
data.head()

Unnamed: 0,sex,subject,control,promptFileName,prompt,sample_rate,samples,micType
5,0,F01,0,./Dataset/F/F01/Session1/prompts/0006.txt,stick,16000,"[0.0024414062, -0.004058838, -0.001373291, -0....",0
7,0,F01,0,./Dataset/F/F01/Session1/prompts/0008.txt,except in the winter when the ooze or snow or ...,16000,"[-0.0007324219, -0.0013427734, -0.0026245117, ...",0
8,0,F01,0,./Dataset/F/F01/Session1/prompts/0009.txt,pat,16000,"[0.00030517578, -0.0026855469, 0.00018310547, ...",0
9,0,F01,0,./Dataset/F/F01/Session1/prompts/0010.txt,up,16000,"[0.0024719238, 0.00021362305, 0.0010070801, -0...",0
10,0,F01,0,./Dataset/F/F01/Session1/prompts/0011.txt,meat,16000,"[-0.00076293945, -0.0008544922, -0.00024414062...",0


## Preprocessing steps

In [None]:
# Lookup table mapping a lowercase contraction to its expanded form.
# replaceContractions() looks up each whitespace-separated token of a prompt here.
contractions = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how does",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it would",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there would",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        # NOTE(review): the next three keys are padded with spaces, but
        # replaceContractions() looks up tokens produced by str.split(), which never
        # contain spaces, so these entries cannot match as written -- confirm whether
        # the bare tokens "u" / "ur" / "n" were intended.
        " u ": " you ",
        " ur ": " your ",
        " n ": " and ",
        "you're": "you are",
        "we're": "we are",
        "we'll": "we will",
        "you'd": "you would",
        "weren't": "were not",
        "we've": "we have",
        "you'll": "you will",
        "we'd": "we would"
}

def replaceContractions(x):
    if x is None:
        return x
    x = x.split()
    for i in range(len(x)):
        if x[i] in contractions:
            x[i] = contractions[x[i]]
    x = " ".join(x)
    return x

def replaceSquareBrackers(x):
    """Discard prompts that contain a square bracket.

    Args:
        x: Prompt text. Any value that is not a ``str`` (``None``, or the float
            ``NaN`` pandas uses for missing values) is returned unchanged instead
            of raising ``AttributeError``.

    Returns:
        ``None`` when ``x`` is a string containing ``'['`` or ``']'``; otherwise
        ``x`` unchanged. Rows set to ``None`` here can be removed later with
        ``dropna``.
    """
    if not isinstance(x, str):
        return x
    if '[' in x or ']' in x:
        return None
    return x

def replaceImagesLinks(x):
    """Discard prompts that contain a forward slash.

    Judging by the name, a slash is taken to mean the prompt is an image path or
    link rather than text to be read aloud.

    Args:
        x: Prompt text. Any value that is not a ``str`` (``None``, or the float
            ``NaN`` pandas uses for missing values) is returned unchanged instead
            of raising ``AttributeError``.

    Returns:
        ``None`` when ``x`` is a string containing ``'/'``; otherwise ``x``
        unchanged.
    """
    if not isinstance(x, str):
        return x
    if '/' in x:
        return None
    return x

In [None]:
# Literal (non-regex) replacements applied, in order, before lowercasing.
punctuation_replacements = [
    ('\n', ''),    # newline
    ('\x1b', ''),  # ESC control character (note: this is not a tab, which would be '\t')
    ('!', ''),
    (';', ''),
    (',', ''),
    ('-', ' '),    # a hyphen becomes a space so hyphenated words split into two tokens
    ('.', ''),
    ('?', ''),
]

prompts = data["prompt"]
for old, new in punctuation_replacements:
    # regex=False everywhere: these are plain characters, and '.' / '?' are regex metacharacters.
    prompts = prompts.str.replace(old, new, regex=False)
prompts = prompts.str.lower()

# Expand contractions BEFORE stripping "'s": otherwise "it's", "he's", "let's", ...
# are already reduced to "it", "he", "let" and their table entries can never match.
prompts = prompts.apply(replaceContractions)
prompts = prompts.str.replace("'s", '', regex=False)      # remaining possessive / clitic 's

# Prompts containing square brackets become None (those rows are dropped later).
prompts = prompts.apply(replaceSquareBrackers)
prompts = prompts.str.replace("'", '', regex=False)
# Prompts containing '/' become None (those rows are dropped later).
prompts = prompts.apply(replaceImagesLinks)
prompts = prompts.str.replace("13th", "thirteenth", regex=False)
prompts = prompts.str.replace('"', '', regex=False)
data["prompt"] = prompts

# Sanity check: list every distinct character left in the cleaned prompts.
# Only real strings are joined, so discarded prompts (None or NaN) are skipped.
text = " ".join(x for x in data["prompt"].tolist() if isinstance(x, str))
chars = sorted(set(text))
print(chars)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
# Drop every row that has a missing value in any column; this removes the
# prompts that the cleaning functions replaced with None.
data = data.dropna()

In [None]:
# Keep only the rows whose `control` flag is 0, i.e. the audio samples of dysarthric patients
is_dysarthric = data["control"] == 0
data = data.loc[is_dysarthric]

## Convert the DataFrames to Hugging Face `Dataset` objects

In [None]:
from datasets import Dataset
import pandas as pd
from datasets import Audio
import gc
from sklearn.model_selection import train_test_split

In [None]:
# Fixed seed so the train/validation/test membership is identical on every run.
SPLIT_SEED = 42

# 70% of the rows go to training; the remaining 30% is split again 70/30 into
# validation and test (about 21% and 9% of all rows).
# NOTE(review): rows are split at random, so recordings from the same `subject`
# can land in more than one split -- confirm that this is intended.
train_data, tmp_data = train_test_split(data, test_size=0.3, random_state=SPLIT_SEED)
val_data, test_data = train_test_split(tmp_data, test_size=0.3, random_state=SPLIT_SEED)
del tmp_data

In [None]:
# Show (rows, columns) of each split to check the split proportions.
train_data.shape, val_data.shape, test_data.shape

((5345, 8), (1603, 8), (688, 8))

## Save the processed data to load them while training

In [None]:
import os

# DataFrame.to_pickle does not create missing parent folders, so make sure the
# output directory exists before writing (a fresh Drive would otherwise fail here).
final_dataset_dir = data_created_dir + "final_dataset/"
os.makedirs(final_dataset_dir, exist_ok=True)

train_data.to_pickle(final_dataset_dir + "train_data_complete.pkl")
val_data.to_pickle(final_dataset_dir + "val_data_complete.pkl")
test_data.to_pickle(final_dataset_dir + "test_data_complete.pkl")

In [None]:
# Reload the three splits from the pickles under final_dataset/, so the notebook
# can be resumed from this cell without redoing the cleaning and the split.
train_data, val_data, test_data = (
    pd.read_pickle(data_created_dir + f"final_dataset/{split}_data_complete.pkl")
    for split in ("train", "val", "test")
)

In [None]:
# Release the full DataFrame; the cells below only use the three splits.
# Re-running this cell without re-creating `data` raises NameError.
del data

In [None]:
# Only the audio samples and their transcript are carried forward.
columns = ['samples', 'prompt']
train_df, val_df, test_df = (
    frame[columns] for frame in (train_data, val_data, test_data)
)

# The full-width split frames are no longer needed.
del train_data, val_data, test_data

In [None]:
## Convert the pandas DataFrames to `datasets.Dataset` objects.
# Each DataFrame is deleted right after its conversion so that both copies of a
# split are not kept alive longer than necessary.
# NOTE(review): these frames keep the row labels of the original table; check
# whether from_pandas carries that index along as an extra column and whether
# preserve_index=False is wanted -- confirm against the training notebook.
train_dataset = Dataset.from_pandas(train_df)
del train_df

val_dataset = Dataset.from_pandas(val_df)
del val_df

test_dataset = Dataset.from_pandas(test_df)
del test_df

In [None]:
def prepare_dataset(examples):
    """Turn one raw row into model-ready features and labels.

    Used with ``Dataset.map`` in this notebook (called without batching). The
    row is modified in place and returned.

    Args:
        examples: Mapping with a ``"samples"`` entry (the audio array) and a
            ``"prompt"`` entry (the transcript text).

    Returns:
        The same mapping with ``"input_features"`` (first item of the feature
        extractor's ``input_features`` for the audio, at a sampling rate of
        16000) and ``"labels"`` (tokenizer ``input_ids`` of the transcript)
        added, and ``"samples"`` / ``"prompt"`` removed.
    """
    waveform, transcript = examples["samples"], examples["prompt"]

    # compute log-Mel input features from the audio array
    extracted = feature_extractor(waveform, sampling_rate=16000)
    examples["input_features"] = extracted.input_features[0]

    # encode the target text to label ids
    examples["labels"] = tokenizer(transcript).input_ids

    # the raw columns are not stored in the processed dataset
    del examples["samples"], examples["prompt"]
    return examples

In [None]:
# Process one split at a time: apply prepare_dataset to every row in a single
# process (num_proc=1), write the result under final_dataset/, then delete it
# before starting the next split so only one processed split is held at once.
train_dataset = train_dataset.map(prepare_dataset, num_proc=1)
train_dataset.save_to_disk(data_created_dir + "final_dataset/train_dataset_complete")
del train_dataset

val_dataset = val_dataset.map(prepare_dataset, num_proc=1)
val_dataset.save_to_disk(data_created_dir + "final_dataset/val_dataset_complete")
del val_dataset

test_dataset = test_dataset.map(prepare_dataset, num_proc=1)
test_dataset.save_to_disk(data_created_dir + "final_dataset/test_dataset_complete")
del test_dataset

# Disabled on purpose: the three names are deleted above, so these prints would
# raise NameError if re-enabled here (print before the matching `del` instead).
# print(train_dataset)
# print(val_dataset)
# print(test_dataset)

Map:   0%|          | 0/5345 [00:00<?, ? examples/s]