In [4]:
import warnings
warnings.filterwarnings('ignore')

from datasets import Dataset
import pandas as pd
import numpy as np

from tqdm import tqdm
import torch

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-en-hi"
# Name of the pretrained seq2seq checkpoint handed to `from_pretrained` below and
# reused by later cells; the commented-out line above is an alternative checkpoint.
MODEL_CHECKPOINT = "google/flan-t5-small"

# Load the tokenizer and the seq2seq model for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
text_to_translate = "Hello, how are you today?"

def translate(text, **generate_kwargs):
    """Run `text` through the notebook-level `model` and decode the result.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: Input string to feed to the model.
        **generate_kwargs: Optional extra keyword arguments forwarded
            unchanged to `model.generate` (for example `max_new_tokens`).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # Keep the whole encoding (not just `input_ids`) so the attention mask
    # reaches `generate`, and move it to the model's current device so the
    # call still works after the model has been moved to a GPU.
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    output = model.generate(**encoded, **generate_kwargs)
    translated_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return translated_text

print(translate(text_to_translate))

In [4]:
# Load the transcript table; the path is relative to the current working directory.
df = pd.read_csv('data/audios/train.csv')
# Per-column summary statistics, shown as the cell output.
df.describe()

Unnamed: 0,file,og_transcription,ms_asr_transcription
count,4729,4729,4729
unique,4729,705,2120
top,./data/audios/test/1249120_43453425_58166571.wav,When I stand up too quickly I start to feel di...,No speech could be recognized
freq,1,26,29


In [5]:
# Discard every row that has a missing value in any column, then preview the result.
df = df.dropna()
df.head()

Unnamed: 0,file,og_transcription,ms_asr_transcription
0,./data/audios/test/1249120_43453425_58166571.wav,When I remember her I feel down,"When I remember her, I feel."
1,./data/audios/test/1249120_43719934_43347848.wav,When I carry heavy things I feel like breaking...,When I carry heavy things I feel like breaking...
2,./data/audios/test/1249120_43719934_53187202.wav,there is too much pain when i move my arm,There is so much pain when I move my arm.
3,./data/audios/test/1249120_31349958_55816195.wav,My son had his lip pierced and it is swollen a...,My flip Pierce into this one is inside on his ...
4,./data/audios/test/1249120_43719934_82524191.wav,My muscles in my lower back are aching,My muscles in my lower back are aching.


In [6]:
# Column whose text is fed to the model as input (see the tokenization cell).
SOURCE = 'ms_asr_transcription'

# Column whose text is used as the target / label sequence.
TARGET = 'og_transcription'

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# shuffled split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df[SOURCE],
    df[TARGET],
    test_size=0.10,
    shuffle=True,
    random_state=100,
)

In [8]:
def prep_data_for_model_fine_tuning(source_lang: list, target_lang: list,
                                    source_key: str = None,
                                    target_key: str = None) -> dict:
    """Pair source and target texts into a dict of per-example records.

    Args:
        source_lang: Source-side texts. Any iterable works (the notebook
            passes the `.values` arrays of the split Series).
        target_lang: Target-side texts, aligned element-wise with
            `source_lang`.
        source_key: Key under which each source text is stored. Defaults to
            the notebook-level `SOURCE` constant.
        target_key: Key under which each target text is stored. Defaults to
            the notebook-level `TARGET` constant.

    Returns:
        A dict with the single key 'data', whose value is a list of
        `{source_key: source_text, target_key: target_text}` dicts. If the
        two inputs differ in length, pairing stops at the shorter one
        (`zip` semantics).
    """
    # Fall back to the notebook constants at call time, so calls that pass
    # only the two text collections behave exactly as before.
    if source_key is None:
        source_key = SOURCE
    if target_key is None:
        target_key = TARGET

    return {
        'data': [
            {source_key: sr_text, target_key: tr_text}
            for sr_text, tr_text in zip(source_lang, target_lang)
        ]
    }

In [9]:
# Convert each split into the {'data': [record, ...]} structure; `.values`
# hands the underlying arrays of the split Series to the helper.
training_data = prep_data_for_model_fine_tuning(x_train.values, y_train.values)
testing_data = prep_data_for_model_fine_tuning(x_test.values, y_test.values)

In [10]:
# Peek at the first five training records.
training_data['data'][:5]

[{'ms_asr_transcription': 'I feel like the around.',
  'og_transcription': 'I feel like the world goes round and round'},
 {'ms_asr_transcription': "Fell's skull is cracked like nuts.",
  'og_transcription': 'fell skull is cracked like nuts'},
 {'ms_asr_transcription': 'Severe pain in the upper left side of chest and may have pain to back.',
  'og_transcription': 'severe pain in the upper left side of chest and may have pain to back'},
 {'ms_asr_transcription': 'When I get up I see my skin vague.',
  'og_transcription': 'When i get up i see my skin vague'},
 {'ms_asr_transcription': 'When I eat I feel my stomach hurts.',
  'og_transcription': 'When i eat i feel my stomach hurts'}]

In [11]:
MAX_INPUT_LENGTH = 128

def generate_model_ready_dataset(dataset: list, source: str, target: str,
                                 model_checkpoint: str,
                                 tokenizer: "AutoTokenizer",
                                 max_length: int = MAX_INPUT_LENGTH) -> list:
    """Tokenize every record of `dataset` into seq2seq training features.

    Args:
        dataset: Records (dicts) that each contain the `source` and `target`
            keys.
        source: Key of the input text inside each record.
        target: Key of the target text inside each record.
        model_checkpoint: Not used by this function; kept in the signature so
            existing calls keep working.
        tokenizer: Tokenizer applied to both the input and the target text.
        max_length: Maximum number of tokens kept for the input and for the
            target (longer sequences are truncated). Defaults to
            `MAX_INPUT_LENGTH`.

    Returns:
        A list with one tokenizer output per record, each extended with
        'data' (the original record) and 'labels' (the target token ids).
    """
    preped_data = []

    for row in dataset:
        model_inputs = tokenizer(row[source], max_length=max_length,
                                 truncation=True, padding=True)

        # Keep the raw record next to its tokenized features.
        model_inputs['data'] = row

        # `text_target=` tokenizes the text as a target sequence; it replaces
        # the deprecated `tokenizer.as_target_tokenizer()` context manager.
        labels = tokenizer(text_target=row[target], max_length=max_length,
                           truncation=True, padding=True)
        model_inputs['labels'] = labels['input_ids']

        preped_data.append(model_inputs)

    return preped_data

In [None]:
# Tokenize the training records into model-ready features.
train_data = generate_model_ready_dataset(
    dataset=training_data['data'],
    source=SOURCE,
    target=TARGET,
    model_checkpoint=MODEL_CHECKPOINT,
    tokenizer=tokenizer,
)

# Same preparation for the held-out records.
test_data = generate_model_ready_dataset(
    dataset=testing_data['data'],
    source=SOURCE,
    target=TARGET,
    model_checkpoint=MODEL_CHECKPOINT,
    tokenizer=tokenizer,
)

In [13]:
# train_data
# test_data

In [None]:
# Collect the tokenized training records into a DataFrame and print its column summary.
train_df = pd.DataFrame.from_records(train_data)
train_df.info()

In [None]:
# Collect the tokenized test records into a DataFrame and print its column summary.
test_df = pd.DataFrame.from_records(test_data)
test_df.info()

In [16]:
# Convert both DataFrames into `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Show the training dataset summary as the cell output.
train_dataset

Dataset({
    features: ['attention_mask', 'data', 'input_ids', 'labels'],
    num_rows: 4256
})

In [17]:
# Load the pretrained checkpoint again, rebinding `model` for fine-tuning.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Suffix describing the fine-tuning task; becomes part of the output name.
MODEL_INTENT = 'asr_correction'

# Drop everything up to the last "/" of the checkpoint name so the output
# directory is a single path component. The inner quotes are double quotes
# because reusing the outer quote character inside an f-string replacement
# field is a SyntaxError before Python 3.12.
trained_model_name = f'{MODEL_CHECKPOINT.split("/")[-1]}-finetuned-{MODEL_INTENT}'
model_args = Seq2SeqTrainingArguments(
    trained_model_name,
    # evaluation_strategy='epoch',
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [18]:
# Use the first CUDA device when PyTorch reports one as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Build the trainer from the model (moved to the selected device), the
# training arguments, the tokenized training set, the collator and the tokenizer.
trainer = Seq2SeqTrainer(
    model=model.to(device),
    args=model_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()

In [20]:
# Save the fine-tuned model through the trainer.
# NOTE(review): this path is built from the full MODEL_CHECKPOINT (including its
# "google/" prefix), so it differs from `trained_model_name`, the directory passed
# to Seq2SeqTrainingArguments — confirm that a separate location is intended.
trainer.save_model(f"{MODEL_CHECKPOINT}-finetuned-{MODEL_INTENT}")