In [17]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import ast

# Directory containing audio files
directory_path = "audio_files"

# List of words
words = ['Environment', 'Archives', 'Pronounciation', 'Hour', 'Wednesday', 'Violence', 'Tomb', 'Suite', 'Iron', 'Reciept', 'Chores']

# Initialize a dictionary to store trained models
trained_models = {}

# Define a list of classifiers
classifiers = {
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', C=1),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

# Train models for each word
for word in words:
    # Load the dataframe for the current word
    df = pd.read_csv("df/" + f"{word}_dataframe.csv")

    # Convert the string representation of MFCC values back to list of lists
    df["MFCC"] = df["MFCC"].apply(ast.literal_eval)

    # Flatten the MFCC values
    df["MFCC"] = df["MFCC"].apply(lambda x: [item for sublist in x for item in sublist])

    # Extract features (MFCC values)
    X = np.array(df["MFCC"].tolist())
    y = df["Name"]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"\nTraining models for word '{word}':")
    
    highest_accuracy = 0.0
    best_model_name = None
    best_model = None
    
    # Train and evaluate each model
    for model_name, classifier in classifiers.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f"Model: {model_name} - Accuracy: {accuracy}")

        # Check if the current model has the highest accuracy
        if accuracy > highest_accuracy:
            highest_accuracy = accuracy
            best_model_name = model_name
            best_model = classifier

    # Save the best model
    if best_model is not None:
        model_filename = f"models/{word}_{best_model_name}_model.joblib"
        joblib.dump(best_model, model_filename)
        trained_models[word] = model_filename
        print(f"Best model saved to '{model_filename}' with highest accuracy: {highest_accuracy}\n")

# Save the dictionary of trained models
models_info_filename = "models/trained_models_info.joblib"
joblib.dump(trained_models, models_info_filename)
print(f"\nTrained models information saved to '{models_info_filename}'")


Training models for word 'Environment':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 1.0
Model: KNN - Accuracy: 1.0
Model: LogisticRegression - Accuracy: 1.0
Best model saved to 'models/Environment_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Archives':
Model: RandomForest - Accuracy: 0.6666666666666666
Model: SVM - Accuracy: 0.6666666666666666
Model: KNN - Accuracy: 0.8333333333333334
Model: LogisticRegression - Accuracy: 0.6666666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best model saved to 'models/Archives_KNN_model.joblib' with highest accuracy: 0.8333333333333334


Training models for word 'Pronounciation':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 0.6666666666666666
Model: KNN - Accuracy: 0.6666666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: LogisticRegression - Accuracy: 0.8333333333333334
Best model saved to 'models/Pronounciation_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Hour':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 0.8333333333333334
Model: KNN - Accuracy: 0.5
Model: LogisticRegression - Accuracy: 0.8333333333333334
Best model saved to 'models/Hour_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Wednesday':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 1.0
Model: KNN - Accuracy: 0.6666666666666666


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: LogisticRegression - Accuracy: 1.0
Best model saved to 'models/Wednesday_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Violence':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 1.0
Model: KNN - Accuracy: 0.6666666666666666
Model: LogisticRegression - Accuracy: 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Best model saved to 'models/Violence_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Tomb':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 0.8333333333333334
Model: KNN - Accuracy: 0.8333333333333334
Model: LogisticRegression - Accuracy: 0.8333333333333334
Best model saved to 'models/Tomb_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Suite':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 0.8333333333333334
Model: KNN - Accuracy: 0.6666666666666666
Model: LogisticRegression - Accuracy: 0.8333333333333334
Best model saved to 'models/Suite_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Iron':
Model: RandomForest - Accuracy: 1.0
Model: SVM - Accuracy: 1.0
Model: KNN - Accuracy: 0.8333333333333334
Model: LogisticRegression - Accuracy: 1.0
Best model saved to 'models/Iron_RandomForest_model.joblib' with highest accuracy: 1.0


Training models for word 'Reciept':
Mo