<a href="https://colab.research.google.com/github/DavidCachiaEnriquez/ict3909/blob/main/supervisedModelTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime (force_remount=True re-mounts even
# if a previous mount is still present).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
# Make the project folder the working directory: the dataset and result paths
# used later in this notebook ("Datasets/...", "SupervisedResults/...") are relative.
%cd /content/drive/MyDrive/Final Year Project/

Mounted at /content/drive
/content/drive/MyDrive/Final Year Project


In [None]:
# Dataset Name: base name (without the ".csv" extension) of the file that is
# loaded from the "Datasets/" folder.
datasetName = "RecolaLabelledFull"

In [None]:
import pandas as pd

# Load the labelled dataset (path is relative to the current working directory)
dataset = pd.read_csv(f"Datasets/{datasetName}.csv")

# Feature subsets per modality, selected by column-name pattern.
# NOTE(review): in each pattern "^" anchors only the first alternative; the
# remaining names match anywhere in a column name (DataFrame.filter uses
# re.search). Confirm that this is the intended selection.
audioFeatures = dataset.filter(axis=1, regex="^ComPar|audio_speech")
visualFeatures = dataset.filter(axis=1, regex="^VIDEO|Face_detection")
physiologyFeatures = dataset.filter(axis=1, regex="^ECG|EDA")
allFeatures = dataset.filter(
    axis=1,
    regex="^ComPar|audio_speech|VIDEO|Face_detection|ECG|EDA",
)

# Class labels for the two prediction targets
targetArousal, targetValence = dataset["classLabelArousal"], dataset["classLabelValence"]

# Number of cross-validation folds
folds = 9

# Group id of every row (its participant), used for group-wise splitting
groups = dataset["Participant"].tolist()

In [None]:
# Parallel lists describing the eight experiment configurations
# (four feature sets x two targets); entry i of each list belongs together.

# Result file names: "<Modality>-<Target>.csv", all arousal runs first
filesGroup = [modality + "-" + target + ".csv"
              for target in ("Arousal", "Valence")
              for modality in ("Audio", "Visual", "Physiology", "All")]

# Feature sets, repeated once per target
featuresGroup = [audioFeatures, visualFeatures, physiologyFeatures, allFeatures] * 2

# Targets, one entry per feature set
targetsGroup = [targetArousal] * 4 + [targetValence] * 4

# Binary Logistic Regression

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def binaryLogisticRegression(feature, label, folds, groups, fileName, folder):
  """Cross-validate a logistic-regression classifier and write the results to disk.

  The rows are split with GroupKFold using `groups`, and a fresh liblinear
  LogisticRegression is fitted on the training part of every fold.

  Args:
    feature: DataFrame of input features (rows are selected with `.iloc`).
    label: Series of class labels, aligned row-for-row with `feature`.
    folds: number of GroupKFold splits.
    groups: per-row group identifiers passed to `GroupKFold.split`.
    fileName: name of the per-fold CSV file; the summary text file uses the
      same name with its last four characters (".csv") replaced by ".txt".
    folder: sub-folder of "SupervisedResults/" that receives the results.

  Writes (under "SupervisedResults/<folder>/BinaryLogisticRegression/"):
    * <fileName>: one row per fold with training and testing accuracy.
    * the ".txt" companion: mean accuracies and the element-wise mean of the
      per-fold confusion matrices.
  """
  # Fix the class order up front so every fold's confusion matrix has the same
  # shape, even when a test fold happens to contain only one of the classes
  # (otherwise the matrices cannot be averaged).
  classLabels = np.unique(label)

  # Evaluation metrics collected for each fold
  foldTrainAccuracies = []
  foldTestAccuracies = []
  foldConfMatrices = []

  group_kfold = GroupKFold(n_splits=folds)

  for train_index, test_index in group_kfold.split(feature, label, groups):
    inputTrain, inputTest = feature.iloc[train_index], feature.iloc[test_index]
    targetTrain, targetTest = label.iloc[train_index], label.iloc[test_index]

    model = LogisticRegression(solver='liblinear', max_iter=100000)
    model.fit(inputTrain, targetTrain)

    # Training accuracy
    foldTrainAccuracies.append(accuracy_score(targetTrain, model.predict(inputTrain)))

    # Testing accuracy and confusion matrix
    y_test_pred = model.predict(inputTest)
    foldTestAccuracies.append(accuracy_score(targetTest, y_test_pred))
    foldConfMatrices.append(confusion_matrix(targetTest, y_test_pred, labels=classLabels))

  outputDir = "SupervisedResults/" + folder + "/BinaryLogisticRegression/"

  # Per-fold results, built in one go (the frame was previously never
  # initialised, so appending to it raised UnboundLocalError).
  csvFile = pd.DataFrame({
      "fold": list(range(1, len(foldTrainAccuracies) + 1)),
      "trainingAccuracy": foldTrainAccuracies,
      "testingAccuracy": foldTestAccuracies,
  })
  csvFile.to_csv(outputDir + fileName, index=False)

  # Mean values and averaged confusion matrix go to a text file next to the CSV
  txtFile = outputDir + fileName[:-4] + ".txt"
  MTrainA = "\nMean Train Accuracy: " + str(np.mean(foldTrainAccuracies))
  MTestA = "\nMean Test Accuracy: " + str(np.mean(foldTestAccuracies))
  CM = "\nConfusion Matrices:\n" + str(np.mean(foldConfMatrices, axis=0))

  # The context manager closes the file; no explicit close is needed.
  with open(txtFile, "w") as file:
    file.write(MTrainA)
    file.write(MTestA)
    file.write(CM)

In [None]:
import time

def runBLR(features, labels, folds, groups, fileNames, datasetName):
  """Run binaryLogisticRegression once per entry of `fileNames`, timing each run.

  `features`, `labels` and `fileNames` are parallel sequences: entry i of each
  is passed to one call. `folds` and `groups` are forwarded unchanged, and
  `datasetName` is forwarded as the results folder. The file name and the
  elapsed wall-clock time in seconds are printed for every run.
  """
  for idx, currentFile in enumerate(fileNames):
    startedAt = time.time()
    print("File name: " + currentFile)
    binaryLogisticRegression(features[idx], labels[idx], folds, groups, currentFile, datasetName)
    elapsed = time.time() - startedAt
    print("Time taken: " + str(elapsed) + "\n")

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

def randomForest(feature, label, folds, groups, fileName, folder):
    """Cross-validate a random-forest classifier and write the results to disk.

    The rows are split with GroupKFold using `groups`, and a fresh
    RandomForestClassifier (50 trees, random_state=42) is fitted on the
    training part of every fold.

    Args:
        feature: DataFrame of input features (rows are selected with `.iloc`).
        label: Series of class labels, aligned row-for-row with `feature`.
        folds: number of GroupKFold splits.
        groups: per-row group identifiers passed to `GroupKFold.split`.
        fileName: name of the per-fold CSV file; the summary text file uses
            the same name with its last four characters (".csv") replaced by
            ".txt".
        folder: sub-folder of "SupervisedResults/" that receives the results.

    Writes (under "SupervisedResults/<folder>/RandomForest/"):
        * <fileName>: one row per fold with training and testing accuracy.
        * the ".txt" companion: mean accuracies and the element-wise mean of
          the per-fold confusion matrices.
    """
    # Fix the class order up front so every fold's confusion matrix has the
    # same shape, even when a test fold happens to contain only one of the
    # classes (otherwise the matrices cannot be averaged).
    classLabels = np.unique(label)

    # Evaluation metrics collected for each fold
    foldTrainAccuracies = []
    foldTestAccuracies = []
    foldConfMatrices = []

    group_kfold = GroupKFold(n_splits=folds)

    for train_index, test_index in group_kfold.split(feature, label, groups):
        inputTrain, inputTest = feature.iloc[train_index], feature.iloc[test_index]
        targetTrain, targetTest = label.iloc[train_index], label.iloc[test_index]

        model = RandomForestClassifier(n_estimators=50, random_state=42)
        model.fit(inputTrain, targetTrain)

        # Training accuracy
        foldTrainAccuracies.append(accuracy_score(targetTrain, model.predict(inputTrain)))

        # Testing accuracy and confusion matrix
        y_test_pred = model.predict(inputTest)
        foldTestAccuracies.append(accuracy_score(targetTest, y_test_pred))
        foldConfMatrices.append(confusion_matrix(targetTest, y_test_pred, labels=classLabels))

    outputDir = "SupervisedResults/" + folder + "/RandomForest/"

    # Per-fold results, built in one go instead of concatenating row by row
    csvFile = pd.DataFrame({
        "fold": list(range(1, len(foldTrainAccuracies) + 1)),
        "trainingAccuracy": foldTrainAccuracies,
        "testingAccuracy": foldTestAccuracies,
    })
    csvFile.to_csv(outputDir + fileName, index=False)

    # Mean values and averaged confusion matrix go to a text file next to the CSV
    txtFile = outputDir + fileName[:-4] + ".txt"
    MTrainA = "\nMean Train Accuracy: " + str(np.mean(foldTrainAccuracies))
    MTestA = "\nMean Test Accuracy: " + str(np.mean(foldTestAccuracies))
    CM = "\nConfusion Matrices:\n" + str(np.mean(foldConfMatrices, axis=0))

    # The context manager closes the file; no explicit close is needed.
    with open(txtFile, "w") as file:
        file.write(MTrainA)
        file.write(MTestA)
        file.write(CM)

In [None]:
def runRF(features, labels, folds, groups, fileNames, datasetName):
  """Run randomForest once per entry of `fileNames`, timing each run.

  `features`, `labels` and `fileNames` are parallel sequences: entry i of each
  is passed to one call. `folds` and `groups` are forwarded unchanged, and
  `datasetName` is forwarded as the results folder. The file name and the
  elapsed wall-clock time in seconds are printed for every run.
  """
  for idx, currentFile in enumerate(fileNames):
    startedAt = time.time()
    print("File name: " + currentFile)
    randomForest(features[idx], labels[idx], folds, groups, currentFile, datasetName)
    elapsed = time.time() - startedAt
    print("Time taken: " + str(elapsed) + "\n")

# Neural Network  
  

In [None]:
from sklearn.neural_network import MLPClassifier

def neuralNetwork(feature, label, folds, groups, fileName, folder):
    """Cross-validate an MLP classifier and write the results to disk.

    The rows are split with GroupKFold using `groups`, and a fresh
    MLPClassifier (one hidden layer of 32 units, max_iter=10000,
    random_state=42) is fitted on the training part of every fold.

    Args:
        feature: DataFrame of input features (rows are selected with `.iloc`).
        label: Series of class labels, aligned row-for-row with `feature`.
        folds: number of GroupKFold splits.
        groups: per-row group identifiers passed to `GroupKFold.split`.
        fileName: name of the per-fold CSV file; the summary text file uses
            the same name with its last four characters (".csv") replaced by
            ".txt".
        folder: sub-folder of "SupervisedResults/" that receives the results.

    Writes (under "SupervisedResults/<folder>/NeuralNetwork/"):
        * <fileName>: one row per fold with training and testing accuracy.
        * the ".txt" companion: mean accuracies and the element-wise mean of
          the per-fold confusion matrices.
    """
    # Fix the class order up front so every fold's confusion matrix has the
    # same shape, even when a test fold happens to contain only one of the
    # classes (otherwise the matrices cannot be averaged).
    classLabels = np.unique(label)

    # Evaluation metrics collected for each fold
    foldTrainAccuracies = []
    foldTestAccuracies = []
    foldConfMatrices = []

    group_kfold = GroupKFold(n_splits=folds)

    for train_index, test_index in group_kfold.split(feature, label, groups):
        inputTrain, inputTest = feature.iloc[train_index], feature.iloc[test_index]
        targetTrain, targetTest = label.iloc[train_index], label.iloc[test_index]

        model = MLPClassifier(hidden_layer_sizes=(32,), max_iter=10000, random_state=42)
        model.fit(inputTrain, targetTrain)

        # Training accuracy
        foldTrainAccuracies.append(accuracy_score(targetTrain, model.predict(inputTrain)))

        # Testing accuracy and confusion matrix
        y_test_pred = model.predict(inputTest)
        foldTestAccuracies.append(accuracy_score(targetTest, y_test_pred))
        foldConfMatrices.append(confusion_matrix(targetTest, y_test_pred, labels=classLabels))

    outputDir = "SupervisedResults/" + folder + "/NeuralNetwork/"

    # Per-fold results, built in one go instead of concatenating row by row
    csvFile = pd.DataFrame({
        "fold": list(range(1, len(foldTrainAccuracies) + 1)),
        "trainingAccuracy": foldTrainAccuracies,
        "testingAccuracy": foldTestAccuracies,
    })
    csvFile.to_csv(outputDir + fileName, index=False)

    # Mean values and averaged confusion matrix go to a text file next to the CSV
    txtFile = outputDir + fileName[:-4] + ".txt"
    MTrainA = "\nMean Train Accuracy: " + str(np.mean(foldTrainAccuracies))
    MTestA = "\nMean Test Accuracy: " + str(np.mean(foldTestAccuracies))
    CM = "\nConfusion Matrices:\n" + str(np.mean(foldConfMatrices, axis=0))

    # The context manager closes the file; no explicit close is needed.
    with open(txtFile, "w") as file:
        file.write(MTrainA)
        file.write(MTestA)
        file.write(CM)

In [None]:
def runNN(features, labels, folds, groups, fileNames, datasetName):
  """Run neuralNetwork once per entry of `fileNames`, timing each run.

  `features`, `labels` and `fileNames` are parallel sequences: entry i of each
  is passed to one call. `folds` and `groups` are forwarded unchanged, and
  `datasetName` is forwarded as the results folder. The file name and the
  elapsed wall-clock time in seconds are printed for every run.
  """
  for idx, currentFile in enumerate(fileNames):
    startedAt = time.time()
    print("File name: " + currentFile)
    neuralNetwork(features[idx], labels[idx], folds, groups, currentFile, datasetName)
    elapsed = time.time() - startedAt
    print("Time taken: " + str(elapsed) + "\n")

---

In [None]:
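# Uncomment a line to run it. The run* helpers expect parallel lists, so a
# single configuration is passed as one-element slices ([:1]), not as [0].
# runBLR(featuresGroup, targetsGroup, folds, groups, filesGroup, datasetName)
# runRF(featuresGroup[:1], targetsGroup[:1], folds, groups, filesGroup[:1], datasetName)
# runNN(featuresGroup[:1], targetsGroup[:1], folds, groups, filesGroup[:1], datasetName)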