In [1]:
import streamlit as st
import torch
import torch.nn as nn
import librosa
import numpy as np
import joblib
from transformers import DistilBertTokenizer, DistilBertModel
import whisper

In [2]:
# Pretrained components are read from local files, resolved relative to the notebook's
# working directory. joblib.load unpickles its input, so only load classifier files
# that come from a trusted source.
distilbert_model = DistilBertModel.from_pretrained('./distilbert_model')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('./distilbert_tokenizer')

# './ffnn_model.pth' is intentionally not loaded in this cell: the file holds a
# state_dict rather than a module, so it is loaded into an MFCC_Network instance in the
# cell that defines that class. Binding the raw state_dict to `mfcc_model` here only
# made the name briefly refer to a dict and read the file twice.

svm_model = joblib.load('ML models/SVM_Classifier.joblib')
rf_model = joblib.load('ML models/Random_Forest_Classifier.joblib')
dt_model = joblib.load('ML models/Decision_Tree_Classifier.joblib')
# NOTE(review): unlike the three classifiers above, this path has no 'ML models/'
# prefix -- confirm where the Naive Bayes file actually lives.
nb_model = joblib.load('Naive_Bayes_Classifier.joblib')
whisper_model = whisper.load_model("base")

FileNotFoundError: [Errno 2] No such file or directory: './SVM_Classifier.joblib'

In [4]:
class MFCC_Network(nn.Module):
    """Three-layer feed-forward network applied to an MFCC feature vector.

    Layout: Linear(input_dim, 64) -> ReLU -> Dropout(0.25)
            -> Linear(64, 32) -> ReLU -> Dropout(0.25) -> Linear(32, 1).

    The layer attribute names (fc1, fc2, fc3) are the keys of the state_dict, so they
    must stay as they are for saved checkpoints to load.

    Args:
        input_dim: Number of features in each input vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Trainable layers.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=1)
        # Parameter-free modules shared by both hidden layers.
        self.dropout = nn.Dropout(p=0.25)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of feature vectors to one raw output value per row.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with last dimension 1; no final activation is applied.
        """
        # Both hidden layers share the same pattern: linear -> ReLU -> dropout.
        for hidden in (self.fc1, self.fc2):
            x = self.dropout(self.relu(hidden(x)))
        return self.fc3(x)
    
# Size of the network's input vector. The feature-extraction cell averages each MFCC
# coefficient over time, so this must equal the n_mfcc requested from librosa there.
input_dim = 100
mfcc_model = MFCC_Network(input_dim)
# map_location="cpu" lets a checkpoint that was saved from a GPU load on a CPU-only
# machine; the rest of the pipeline runs on CPU (outputs are converted with .numpy()).
# load_state_dict copies the values into the existing CPU parameters either way.
mfcc_model.load_state_dict(torch.load('./ffnn_model.pth', map_location="cpu"))

<All keys matched successfully>

In [5]:
# Audio clip that every later cell classifies.
# NOTE(review): this is an absolute, machine-specific Windows path -- point it at a
# local copy of the clip before re-running the notebook on another machine.
audio_path = "D:\\Time Series EDA\\Clipped_Audio\\WomenWhoStutter\\2\\WomenWhoStutter_2_6.wav"

In [6]:
# Load the clip at its native sampling rate (sr=None turns off librosa's resampling).
y, sr = librosa.load(audio_path, sr=None)

# One MFCC coefficient per network input: n_mfcc is tied to input_dim instead of
# repeating the literal 100, so the two cannot drift apart.
mfcc_features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=input_dim)
# Average over the frame axis to get a fixed-length vector whatever the clip duration.
mfcc_features = np.mean(mfcc_features, axis=1)
assert mfcc_features.shape == (input_dim,), "MFCC vector does not match the network input size"

# Add a leading batch dimension of 1 for the network.
mfcc_features_tensor = torch.tensor(mfcc_features, dtype=torch.float32).unsqueeze(0)

# eval() switches dropout off so the forward pass is deterministic.
mfcc_model.eval()

# Despite the name, this is the network's final output (fc3 maps to a single unit),
# i.e. one value per input row, not a hidden-layer embedding.
with torch.no_grad():
    mfcc_embeddings = mfcc_model(mfcc_features_tensor).numpy()

In [7]:
# Transcribe the clip with the Whisper model loaded earlier. Only the 'text' entry of
# the returned result is used by the following cells.
result = whisper_model.transcribe(audio_path)
transcript = result['text']

In [8]:
# Tokenize the transcript into PyTorch tensors, truncating over-long input.
inputs = distilbert_tokenizer(
    transcript,
    return_tensors="pt",
    truncation=True,
    padding=True,
)

# Mean-pool the per-token hidden states over the sequence axis to get one vector for
# the whole transcript, then hand it over as a NumPy array.
with torch.no_grad():
    token_states = distilbert_model(**inputs).last_hidden_state
    text_embeddings = token_states.mean(dim=1).numpy()

In [9]:
# Fuse both modalities into a single feature row: the pooled text vector first, then
# the audio network's output, joined along the feature axis.
combined_embeddings = np.concatenate(
    (text_embeddings, mfcc_embeddings),
    axis=1,
)

In [10]:
# Maps the integer label returned by a classifier to a human-readable name.
classes = {
    0: 'Non-Stutter',
    1: 'Stutter',
}

In [11]:
# Random-forest verdict on the fused feature row: predict() yields one label per row,
# and the single row here is translated to its readable class name.
prediction = rf_model.predict(combined_embeddings)
predicted_label = classes[prediction[0]]
print(f"Prediction: {predicted_label}")

Prediction: Non-Stutter


In [12]:
# SVM verdict on the fused feature row: predict() yields one label per row, and the
# single row here is translated to its readable class name.
prediction = svm_model.predict(combined_embeddings)
predicted_label = classes[prediction[0]]
print(f"Prediction: {predicted_label}")

Prediction: Stutter


In [13]:
# Decision-tree verdict on the fused feature row: predict() yields one label per row,
# and the single row here is translated to its readable class name.
prediction = dt_model.predict(combined_embeddings)
predicted_label = classes[prediction[0]]
print(f"Prediction: {predicted_label}")

Prediction: Non-Stutter
