In [43]:
# load the 'prononciation' files
# generate the df with mfcc, vector, row mean
# train all models from additional_features file
# collect 5 test samples
# augment to 5*7 = 35
# predict 

### Pronounciation files

In [44]:
import os

files = []
for filename in os.listdir("augmented_audio/"):
    if "pronounciation" in filename.lower():
        files.append("augmented_audio/" + filename)

files

['augmented_audio/Abhishek10Pronounciation.wavoutput_augmented.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_compressed.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_cropped.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_noisy.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_pitch_shifted.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_speed_changed.wav',
 'augmented_audio/Abhishek10Pronounciation.wavoutput_stretched.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_augmented.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_compressed.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_cropped.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_noisy.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_pitch_shifted.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_speed_changed.wav',
 'augmented_audio/Abhishek1Pronounciation.wavoutput_stretched.wav',
 'augmented_audio/Abhishek2Pronounc

### Creating df of Pronounciation files - MFCC, Vector

In [45]:
import os
import librosa
import pandas as pd
import re
import numpy as np
from scipy.io.wavfile import read
from sklearn import preprocessing
import python_speech_features as mfcc

def extract_mfcc(file_path, n_mfcc=25):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfccs.flatten()

# Function to extract features (MFCC and delta coefficients)
def extract_features(audio, rate):
    mfcc_feature = mfcc.mfcc(audio, rate, 0.025, 0.01, 20, nfft=1200, appendEnergy=True)
    mfcc_feature = preprocessing.scale(mfcc_feature)
    delta = calculate_delta(mfcc_feature)
    combined = np.hstack((mfcc_feature, delta))
    return combined.flatten()

# Function to calculate delta coefficients
def calculate_delta(array):
    rows, cols = array.shape
    deltas = np.zeros((rows, 20))
    n = 2
    for i in range(rows):
        index = []
        j = 1
        while j <= n:
            if i - j < 0:
                first = 0
            else:
                first = i - j
            if i + j > rows - 1:
                second = rows - 1
            else:
                second = i + j
            index.append((second, first))
            j += 1
        deltas[i] = (array[index[0][0]] - array[index[0][1]] + (2 * (array[index[1][0]] - array[index[1][1]]))) / 10
    return deltas

# Specify the folder path
folder_path = "augmented_audio/"

# List all files in the folder
files = os.listdir(folder_path)

# Create a dictionary to store data for each word
word_data = {}

# Traverse through each file
for file_name in files:
    if "pronounciation" in file_name.lower():
        if file_name.lower().endswith(".wav"):
            # Parse the file name to extract information
            pattern = r'([A-Za-z]+)\d+([A-Za-z]+)'
            # Use re.match to find the pattern in the file name
            match = re.match(pattern, file_name)
            if match:
                # Extract the name and word from the matched groups
                name = match.group(1)
                word = match.group(2)
            
            # Check if the word is already in the dictionary
            if word not in word_data:
                word_data[word] = {'Name': [], 'MFCC': [], 'Vector': []}

            # Load the original audio and extract MFCC
            input_file_path = os.path.join(folder_path, file_name)
            mfccs = extract_mfcc(input_file_path)
            
            # Load the original audio
            sr, audio = read(os.path.join(folder_path, file_name))
            
            # Extract features (MFCC and delta coefficients)
            features = extract_features(audio, sr)
            
            # Add data to the dictionary
            word_data[word]['Name'].append(name)
            word_data[word]['MFCC'].append(mfccs)
            word_data[word]['Vector'].append(features)  # Add the vector values

# Create DataFrames for each word
word_dfs = {}
for word, data in word_data.items():
    df = pd.DataFrame(data)
    word_dfs[word] = df

# Display DataFrames for each word
for word, df in word_dfs.items():
    print(f"\nWord: {word}")
    print(df)
df

KeyboardInterrupt: 

### Training all models on Vector

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

df_vector = df.copy()

# Assuming df is your DataFrame with 'Vector' and 'Label' columns
X = df_vector['MFCC'].values
y = df_vector['Name']

# Define a custom padding function
def pad_sequences_with_mean(sequences, max_length):
    padded_sequences = np.zeros((len(sequences), max_length))
    
    for i, seq in enumerate(sequences):
        seq_len = len(seq)
        if seq_len > 0:
            mean_value = np.mean(seq)
            padded_sequences[i, :seq_len] = seq
            padded_sequences[i, seq_len:] = mean_value
    
    return padded_sequences

# Find the maximum length of sequences
max_length = max(len(seq) for seq in X)

# Pad the sequences with the mean value
X_padded = pad_sequences_with_mean(X, max_length)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=22)

# Initialize various classifiers and create a dictionary to store trained models
trained_models = {}

classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'MLP Classifier': MLPClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Bagging Classifier': BaggingClassifier(),
    'Extra Trees Classifier': ExtraTreesClassifier(),
    'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis()
}

# Train and evaluate each classifier
for name, classifier in classifiers.items():
    print(f"\nTraining and evaluating {name}...")
    
    # Fit the model on the training data
    classifier.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = classifier.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    
    # Save the trained model in the dictionary
    trained_models[name] = classifier
    
    # Print the evaluation metrics
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    print("Classification Report:")
    print(classification_rep)



Training and evaluating Logistic Regression...
Accuracy: 1.00
Confusion Matrix:
[[12  0  0]
 [ 0 15  0]
 [ 0  0 15]]
Classification Report:
              precision    recall  f1-score   support

    Abhishek       1.00      1.00      1.00        12
        Arun       1.00      1.00      1.00        15
    Sunamdha       1.00      1.00      1.00        15

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42


Training and evaluating Decision Tree...
Accuracy: 0.88
Confusion Matrix:
[[10  1  1]
 [ 2 13  0]
 [ 1  0 14]]
Classification Report:
              precision    recall  f1-score   support

    Abhishek       0.77      0.83      0.80        12
        Arun       0.93      0.87      0.90        15
    Sunamdha       0.93      0.93      0.93        15

    accuracy                           0.88        42
   macro avg       0.88      0.88      0.88        42
weighted avg    



Accuracy: 0.60
Confusion Matrix:
[[12  0  0]
 [ 3 11  1]
 [13  0  2]]
Classification Report:
              precision    recall  f1-score   support

    Abhishek       0.43      1.00      0.60        12
        Arun       1.00      0.73      0.85        15
    Sunamdha       0.67      0.13      0.22        15

    accuracy                           0.60        42
   macro avg       0.70      0.62      0.56        42
weighted avg       0.72      0.60      0.55        42


Training and evaluating Bagging Classifier...
Accuracy: 0.95
Confusion Matrix:
[[11  1  0]
 [ 1 14  0]
 [ 0  0 15]]
Classification Report:
              precision    recall  f1-score   support

    Abhishek       0.92      0.92      0.92        12
        Arun       0.93      0.93      0.93        15
    Sunamdha       1.00      1.00      1.00        15

    accuracy                           0.95        42
   macro avg       0.95      0.95      0.95        42
weighted avg       0.95      0.95      0.95        42


Trai



In [None]:
max_length

4325

In [None]:
trained_models

{'Logistic Regression': LogisticRegression(),
 'Decision Tree': DecisionTreeClassifier(),
 'Random Forest': RandomForestClassifier(),
 'Gradient Boosting': GradientBoostingClassifier(),
 'Support Vector Machine': SVC(),
 'K-Nearest Neighbors': KNeighborsClassifier(),
 'Naive Bayes': GaussianNB(),
 'MLP Classifier': MLPClassifier(),
 'AdaBoost Classifier': AdaBoostClassifier(),
 'Bagging Classifier': BaggingClassifier(),
 'Extra Trees Classifier': ExtraTreesClassifier(),
 'Quadratic Discriminant Analysis': QuadraticDiscriminantAnalysis()}

### Collect test sample files

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

name = "Sunamdha"
freq = 44100
duration = 2

words = ['Pronounciation'] 

for i in range(10):
    file_name = "audio_files/test/pronounciationTest/" + name + str(i+1) + word + '.wav'
    print("Recording file " + file_name)
    print(f'{word} - {i+1}')
    recording = sd.rec(int(duration * freq), samplerate=freq, channels=1)
    sd.wait()
    write(file_name, freq, recording)
    print("Recorded file " + file_name)

### Predict live audio clip with all models

In [60]:
import os
import librosa
import pandas as pd
import re
import numpy as np
from scipy.io.wavfile import read
from sklearn import preprocessing
import python_speech_features as mfcc

def extract_mfcc(file_path, n_mfcc=25):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfccs.flatten()

# Function to extract features (MFCC and delta coefficients)
def extract_features(audio, rate):
    mfcc_feature = mfcc.mfcc(audio, rate, 0.025, 0.01, 20, nfft=1200, appendEnergy=True)
    mfcc_feature = preprocessing.scale(mfcc_feature)
    delta = calculate_delta(mfcc_feature)
    combined = np.hstack((mfcc_feature, delta))
    return combined.flatten()

# Function to calculate delta coefficients
def calculate_delta(array):
    rows, cols = array.shape
    deltas = np.zeros((rows, 20))
    n = 2
    for i in range(rows):
        index = []
        j = 1
        while j <= n:
            if i - j < 0:
                first = 0
            else:
                first = i - j
            if i + j > rows - 1:
                second = rows - 1
            else:
                second = i + j
            index.append((second, first))
            j += 1
        deltas[i] = (array[index[0][0]] - array[index[0][1]] + (2 * (array[index[1][0]] - array[index[1][1]]))) / 10
    return deltas

def create_df_test_vec(folder_path):
    files = os.listdir(folder_path)
    # vector_data = {'Vector': []}
    mfcc_data = {'MFCC': []}

    for file_name in files:
        if file_name.lower().endswith(".wav"):
            # MFCC Dataset
            input_file_path = os.path.join(folder_path, file_name)
            mfccs = extract_mfcc(input_file_path)
            mfcc_data['MFCC'].append(mfccs)

            # Vector Dataset
            # Load the original audio
            # sr, audio = read(os.path.join(folder_path, file_name))
            # # Extract features (MFCC and delta coefficients)
            # features = extract_features(audio, sr)
            # # Add vector to the dictionary
            # vector_data['Vector'].append(features)  # Add the vector values

    # Create DataFrame
    df_test_mfcc = pd.DataFrame(mfcc_data)

    return df_test_mfcc

# Specify the folder path
folder_path = "audio_files/test/pronounciationTest/"

# Create DataFrame
df_test_mfcc = create_df_test_vec(folder_path)

In [None]:
df_test_mfcc['MFCC'][0].shape

(4325,)

In [None]:
X_test_vec = df_test_mfcc['MFCC'].values

max_length_test = max(len(seq) for seq in X_test_vec)
max_length_test

4325

In [None]:
type(X_test_vec)

numpy.ndarray

In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Assuming df_test_vec is your DataFrame with the 'Vector' column
X_test_vec = df_test_mfcc['MFCC'].values

# Find the maximum length of sequences for the test data
max_length_test = max(len(seq) for seq in X_test_vec)

# Pad or truncate the test sequences to match the training data's max_length
X_test_padded = pad_sequences_with_mean(X_test_vec, max_length_test)

# Initialize a dictionary to store predictions
predictions = {}

# Make predictions using the trained models
for name, model in trained_models.items():
    # Ensure the test_vector has the same shape as the training data
    # For example, you may need to pad or reshape it
    # test_vector_padded = pad_sequences_with_mean(X_test_vec, max_length_test)
    
    # Make the prediction
    y_pred = model.predict(X_test_padded)
    
    # Store the predictions in the dictionary
    predictions[name] = y_pred

# Create a DataFrame from the predictions
df_predictions = pd.DataFrame(predictions)

# Display the predicted usernames
df_predictions

Unnamed: 0,Logistic Regression,Decision Tree,Random Forest,Gradient Boosting,Support Vector Machine,K-Nearest Neighbors,Naive Bayes,MLP Classifier,AdaBoost Classifier,Bagging Classifier,Extra Trees Classifier,Quadratic Discriminant Analysis
0,Abhishek,Arun,Arun,Arun,Abhishek,Abhishek,Arun,Abhishek,Arun,Arun,Arun,Abhishek
1,Abhishek,Arun,Arun,Arun,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Arun,Arun,Abhishek
2,Abhishek,Sunamdha,Sunamdha,Abhishek,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Sunamdha,Arun,Abhishek
3,Abhishek,Arun,Arun,Arun,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Arun,Abhishek,Abhishek
4,Abhishek,Arun,Arun,Arun,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Arun,Abhishek,Abhishek
5,Abhishek,Arun,Arun,Arun,Abhishek,Abhishek,Arun,Abhishek,Arun,Arun,Arun,Abhishek
6,Abhishek,Arun,Arun,Arun,Abhishek,Abhishek,Arun,Abhishek,Arun,Arun,Arun,Abhishek
7,Abhishek,Arun,Arun,Arun,Abhishek,Abhishek,Arun,Abhishek,Arun,Arun,Arun,Sunamdha
8,Abhishek,Arun,Sunamdha,Abhishek,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Arun,Abhishek,Abhishek
9,Arun,Arun,Sunamdha,Arun,Sunamdha,Sunamdha,Arun,Abhishek,Arun,Arun,Abhishek,Abhishek


Recording file audio_files/test/pronounciationTest/Sunamdha1Pronounciation.wav
Pronounciation - 1
Recorded file audio_files/test/pronounciationTest/Sunamdha1Pronounciation.wav
Recording file audio_files/test/pronounciationTest/Sunamdha2Pronounciation.wav
Pronounciation - 2
Recorded file audio_files/test/pronounciationTest/Sunamdha2Pronounciation.wav
Recording file audio_files/test/pronounciationTest/Sunamdha3Pronounciation.wav
Pronounciation - 3
Recorded file audio_files/test/pronounciationTest/Sunamdha3Pronounciation.wav
Recording file audio_files/test/pronounciationTest/Sunamdha4Pronounciation.wav
Pronounciation - 4
Recorded file audio_files/test/pronounciationTest/Sunamdha4Pronounciation.wav
Recording file audio_files/test/pronounciationTest/Sunamdha5Pronounciation.wav
Pronounciation - 5
Recorded file audio_files/test/pronounciationTest/Sunamdha5Pronounciation.wav
Recording file audio_files/test/pronounciationTest/Sunamdha6Pronounciation.wav
Pronounciation - 6
Recorded file audio_fi