## **Multimodal Speech Accuracy and Emotion Analyzer (MSAEA)**
A modular AI pipeline that simultaneously transcribes speech and detects speaker sentiment, designed to provide granular feedback for intent analysis.



In [None]:
# Mount Google Drive so files under /content/drive are available to later cells.
# force_remount is passed so the mount is redone when the cell is re-run.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
rootDir = '/content/drive/MyDrive/MSAEA';

In [None]:
!pip install -U transformers



In [None]:
from transformers import pipeline

In [None]:
!pip install evaluate jiwer



In [None]:
reference = "Hello I am DB";
prediction = "Hello IMDB";

In [None]:
import evaluate
werCalc = evaluate.load("wer");

In [None]:
wer = werCalc.compute(references = [reference], predictions = [prediction]);
wer

0.75

In [None]:
!pip install -U bitsandbytes accelerate



In [None]:
from transformers import AutoModelForSpeechSeq2Seq, BitsAndBytesConfig

In [None]:
bnbQuantConfig = BitsAndBytesConfig(load_in_8bit = True);

In [None]:
# Load the Whisper-tiny checkpoint, applying the bitsandbytes quantization
# settings held in bnbQuantConfig.
asr = AutoModelForSpeechSeq2Seq.from_pretrained(
    'openai/whisper-tiny',
    quantization_config=bnbQuantConfig,
)

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [None]:
!pip install peft



In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training

In [None]:
asr = prepare_model_for_kbit_training(asr);

In [None]:
# LoRA adapter settings: rank 8, alpha 32, attached to the modules named
# "q_proj" and "v_proj".
loraConfig = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
)

In [None]:
from peft import get_peft_model

In [None]:
peftAsr = get_peft_model(asr, loraConfig);



In [None]:
peftAsr.print_trainable_parameters();

trainable params: 147,456 || all params: 37,908,096 || trainable%: 0.3890


In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset, Audio
from transformers import AutoProcessor

In [None]:
dummySet = load_dataset('hf-internal-testing/librispeech_asr_dummy', split = 'validation');

README.md:   0%|          | 0.00/520 [00:00<?, ?B/s]

clean/validation-00000-of-00001.parquet:   0%|          | 0.00/9.19M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/73 [00:00<?, ? examples/s]

In [None]:
processor = AutoProcessor.from_pretrained('openai/whisper-tiny');

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [None]:
dummySet.features['audio'].sampling_rate

16000

In [None]:
!apt-get update && apt-get install -y ffmpeg
!pip install -U "datasets[audio]" torchcodec

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://cli.github.com/packages stable InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading packag

In [None]:
dummySet[0]['audio']['array']

array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,
       0.0010376 ], dtype=float32)

In [None]:
# Run the processor on the first clip to inspect the features it produces
# (returned as PyTorch tensors because of return_tensors = 'pt').
# NOTE(review): the name `input` shadows Python's built-in input(); the cells
# that follow read this variable, so renaming it needs a coordinated change.
input = processor(dummySet[0]['audio']['array'], sampling_rate = 16000, return_tensors = 'pt');

In [None]:
input

{'input_features': tensor([[[ 1.1933e-01, -9.4576e-02, -1.0978e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [ 4.9347e-04, -8.9271e-02, -6.7290e-02,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-1.5326e-01, -2.0804e-01, -2.2227e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         ...,
         [-8.0603e-01, -8.0603e-01, -7.9997e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -7.7211e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01],
         [-8.0603e-01, -8.0603e-01, -8.0603e-01,  ..., -8.0603e-01,
          -8.0603e-01, -8.0603e-01]]])}

In [None]:
input['input_features'].shape

torch.Size([1, 80, 3000])

In [None]:
def prepare_dataset(row):
    """Turn one raw dataset row into model-ready features and label ids.

    Args:
        row: A dataset row with an ``audio`` entry (exposing ``array`` and
            ``sampling_rate``) and a ``text`` transcript.

    Returns:
        The same row with two keys added: ``input_features`` (the processor
        output for this clip with the batch dimension removed) and ``labels``
        (the token ids the processor produces for the transcript).
    """
    audio = row["audio"]

    # Pass the clip's own sampling rate rather than assuming 16 kHz, so the
    # processor is told the truth about the audio and can reject a mismatch
    # instead of silently treating every clip as 16 kHz.
    features = processor(
        audio["array"],
        sampling_rate = audio["sampling_rate"],
        return_tensors = 'pt',
    )

    # The processor returns a batch of one; keep just that single example.
    row["input_features"] = features['input_features'][0]
    row["labels"] = processor(text = row['text'])['input_ids']

    return row

In [None]:
dummySet.features

{'file': Value('string'),
 'audio': Audio(sampling_rate=16000, decode=True, num_channels=None, stream_index=None),
 'text': Value('string'),
 'speaker_id': Value('int64'),
 'chapter_id': Value('int64'),
 'id': Value('string')}

In [None]:
# Run prepare_dataset over every row. The raw columns listed in
# remove_columns are dropped, leaving only what prepare_dataset added.
dummySet = dummySet.map(
    prepare_dataset,
    remove_columns=[
        'audio',
        'text',
        'file',
        'speaker_id',
        'chapter_id',
        'id',
    ],
)

Map:   0%|          | 0/73 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# dataCollator = DataCollatorForSeq2Seq(tokenizer = processor.tokenizer,
#                                       model = peftAsr);
# The stock DataCollatorForSeq2Seq above does not work for this task:
# it is designed for text-to-text models and assumes that both inputs and outputs are token ids.
# Given audio features (input_features), it passes them to the text tokenizer for padding.
# The tokenizer cannot pad audio features, so collation fails with an error that names 'input_features'.

import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech-to-text examples into one padded batch.

    Audio features are padded with the processor's feature extractor and the
    label ids with its tokenizer. Padded label positions are replaced by -100
    so they are ignored by the loss. If every label row starts with the
    decoder start token, that column is removed: the model prepends the start
    token itself when it derives decoder inputs from ``labels``, so keeping it
    would make the decoder see the start token twice.
    """

    # Processor exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Id of the decoder start token to strip from the labels. When None, the id
    # of "<|startoftranscript|>" is looked up in the processor's tokenizer.
    decoder_start_token_id: Optional[int] = None

    def _resolve_start_token_id(self) -> Optional[int]:
        """Return the start-token id to strip, or None if it cannot be determined."""
        if self.decoder_start_token_id is not None:
            return self.decoder_start_token_id
        lookup = getattr(self.processor.tokenizer, "convert_tokens_to_ids", None)
        if lookup is None:
            return None
        return lookup("<|startoftranscript|>")

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of examples.

        Args:
            features: Examples, each with ``input_features`` and ``labels``.

        Returns:
            The padded feature batch with an added ``labels`` tensor.
        """
        # 1. Handle Audio Inputs (input_features)
        # Audio and text need different padding, so they are split up first.
        input_features = [{"input_features": feature["input_features"]} for feature in features]

        # Use the feature extractor to pad the audio (if needed)
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # 2. Handle Text Labels (labels)
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Use the tokenizer to pad the text labels
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Mask padding with -100 so the model ignores it in the loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # Drop the leading start token only when *every* row has it; otherwise
        # the rows would end up misaligned with each other.
        start_id = self._resolve_start_token_id()
        if (
            start_id is not None
            and labels.shape[1] > 0
            and bool((labels[:, 0] == start_id).all())
        ):
            labels = labels[:, 1:]

        # 3. Combine them
        batch["labels"] = labels

        return batch

dataCollator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

In [None]:
# Training configuration for the fine-tuning run.
# Evaluation and logging are tied to epochs. With eval_strategy = 'steps' and
# no eval_steps, the interval falls back to logging_steps (500 by default),
# which this short run (12 optimizer steps in total) never reaches, so no
# validation metrics or training loss were reported during training.
trainArgs = Seq2SeqTrainingArguments(
    output_dir = f'{rootDir}/whisperTinyFineTuned',
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 2,
    learning_rate = 1e-5,
    eval_strategy = 'epoch',
    logging_strategy = 'epoch',
    predict_with_generate = True
)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore the reported metrics, reproducible across runs.
split = dummySet.train_test_split(test_size = 0.2, seed = 42);

In [None]:
trainSet = split['train'];
testSet = split['test'];

In [None]:
dataCollator([trainSet[0], trainSet[1]])

You're using a WhisperTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_features': tensor([[[ 0.0318,  0.0552, -0.0907,  ..., -0.6842, -0.6842, -0.6842],
         [-0.0510,  0.0531, -0.0699,  ..., -0.6842, -0.6842, -0.6842],
         [-0.2554, -0.0615, -0.2631,  ..., -0.6842, -0.6842, -0.6842],
         ...,
         [-0.3923, -0.2903, -0.3415,  ..., -0.6842, -0.6842, -0.6842],
         [-0.4345, -0.3876, -0.4162,  ..., -0.6842, -0.6842, -0.6842],
         [-0.4286, -0.4003, -0.4541,  ..., -0.6842, -0.6842, -0.6842]],

        [[ 0.3198,  0.2728,  0.0152,  ..., -0.7283, -0.7283, -0.7283],
         [ 0.2352,  0.2563,  0.3428,  ..., -0.7283, -0.7283, -0.7283],
         [ 0.1674,  0.2605,  0.3514,  ..., -0.7283, -0.7283, -0.7283],
         ...,
         [-0.5256, -0.4830, -0.4489,  ..., -0.7283, -0.7283, -0.7283],
         [-0.4409, -0.3987, -0.3475,  ..., -0.7283, -0.7283, -0.7283],
         [-0.3888, -0.3489, -0.4050,  ..., -0.7283, -0.7283, -0.7283]]]), 'labels': tensor([[50258, 50363,  3927,  6205, 35538,    53, 15167,  3447, 10940,  8229,
       

In [None]:
trainSet

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 58
})

In [None]:
testSet

Dataset({
    features: ['input_features', 'labels'],
    num_rows: 15
})

In [None]:
import re

def metrics(pred):
  """Compute the word error rate, in percent, for a set of predictions.

  Args:
    pred: Object exposing ``predictions`` (predicted token ids) and
      ``label_ids`` (reference token ids, where -100 marks ignored positions).

  Returns:
    A dict ``{"wer": value}`` where value is 100 times the score returned by
    ``werCalc.compute`` on the normalized texts.
  """
  import numpy as np  # local import: numpy is not imported at notebook level

  padId = processor.tokenizer.pad_token_id

  predIds = pred.predictions
  # Some models return a tuple of outputs; the token ids come first.
  if isinstance(predIds, tuple):
    predIds = predIds[0]
  predIds = np.asarray(predIds)
  labelIds = np.asarray(pred.label_ids)

  # -100 is not a real token id, so replace it with the pad id before decoding.
  # np.where builds new arrays, so the caller's arrays are left untouched.
  # Predictions get the same treatment in case they were padded with -100.
  predIds = np.where(predIds == -100, padId, predIds)
  labelIds = np.where(labelIds == -100, padId, labelIds)

  predText = processor.tokenizer.batch_decode(predIds, skip_special_tokens = True)
  refText = processor.tokenizer.batch_decode(labelIds, skip_special_tokens = True)

  # NORMALIZE: upper-case, then keep only A-Z and whitespace so that casing
  # and punctuation differences are not counted as word errors.
  predText = [re.sub(r"[^A-Z\s]", "", s.upper()) for s in predText]
  refText = [re.sub(r"[^A-Z\s]", "", s.upper()) for s in refText]

  werPercent = 100 * werCalc.compute(predictions = predText, references = refText)

  return {"wer": werPercent}

In [None]:
# Assemble the trainer from the pieces built in earlier cells: the LoRA-wrapped
# model, the training arguments, the train/test splits, the speech collator
# and the WER metric function.
trainer = Seq2SeqTrainer(
    model=peftAsr,
    args=trainArgs,
    train_dataset=trainSet,
    eval_dataset=testSet,
    data_collator=dataCollator,
    compute_metrics=metrics,
    processing_class=processor.feature_extractor,
)

In [None]:
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


TrainOutput(global_step=12, training_loss=3.7653586069742837, metrics={'train_runtime': 30.3647, 'train_samples_per_second': 5.73, 'train_steps_per_second': 0.395, 'total_flos': 4320632586240000.0, 'train_loss': 3.7653586069742837, 'epoch': 3.0})

In [None]:
trainer.evaluate()



{'eval_loss': 3.6602745056152344,
 'eval_wer': 13.94422310756972,
 'eval_runtime': 5.3735,
 'eval_samples_per_second': 2.791,
 'eval_steps_per_second': 0.372,
 'epoch': 3.0}

In [None]:
import torch

# 1. Pick the first sample from the test set
sample = testSet[0]

# 2. Prepare the input
# Convert the stored list to a tensor, add a leading batch dimension, and move
# it to whichever device the model is on (instead of hard-coding "cuda").
input_features = torch.tensor(sample["input_features"]).unsqueeze(0).to(peftAsr.device)

# 3. Generate predictions
# We use .generate() to let the model predict the token ids
pred_ids = peftAsr.generate(input_features=input_features)

# 4. Decode back to text
pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]

# Decode the reference transcript. Rows taken straight from the dataset hold
# plain token ids (the -100 masking is applied only inside the data collator),
# so the replacement below is just a safety net before decoding.
label_ids = sample["labels"]
label_ids = [l if l != -100 else processor.tokenizer.pad_token_id for l in label_ids]
ref_text = processor.decode(label_ids, skip_special_tokens=True)

print(f"Reference:  {ref_text}")
print(f"Prediction: {pred_text}")



Reference:  THE METAL FOREST IS IN THE GREAT DOMED CAVERN THE LARGEST IN ALL OUR DOMINIONS REPLIED KALIKO
Prediction:  The Middle Forest is in the Great Dome to Cavern, the largest and all our dominions, replied Calico.


In [None]:
from huggingface_hub import login, whoami

# 1. Login (Paste your 'Write' token when asked)
login()

# 2. Pick a name for your model on the hub
my_model_name = "whisper-tiny-msaea-finetuned"

# Hub repositories live under an account namespace ("<user>/<name>").
# Resolve the logged-in user so the pushes and the printed link all refer to
# the same fully-qualified repository; the bare model name alone does not
# form a valid model URL.
repo_id = f"{whoami()['name']}/{my_model_name}"

# 3. Push the Model AND the Processor (very important!)
peftAsr.push_to_hub(repo_id)
processor.push_to_hub(repo_id)

print(f"🎉 Model saved to: https://huggingface.co/{repo_id}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors: 100%|##########|  596kB /  596kB            

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


🎉 Model saved to: https://huggingface.co/whisper-tiny-msaea-finetuned
