In [2]:
import os

import pandas as pd
import scipy
import scipy.stats

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
def calculateResults(frame):
    """Compute accuracy and F1 score for every row's stringified confusion matrix.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a "confusionMatrix" column whose values are the text form
        of a 2x2 matrix. Entries may be separated by whitespace and/or commas,
        e.g. "[[5 1] [2 2]]" or "[[5, 1], [2, 2]]".

    Returns
    -------
    dict
        {"Accuracy": [...], "F1 Score": [...]} with one value per row, in row
        order. Any metric whose denominator is zero is reported as 0.

    Raises
    ------
    ValueError
        If a cell holds fewer than four entries or an entry is not numeric.
    """
    # "[" and "," become separators, "]" is simply removed.
    separators = {ord("["): " ", ord("]"): None, ord(","): " "}

    results = {"Accuracy": [], "F1 Score": []}
    for matrixString in frame["confusionMatrix"]:
        cells = matrixString.translate(separators).split()
        if len(cells) < 4:
            raise ValueError(
                "confusionMatrix needs four entries, got: " + repr(matrixString)
            )

        # Cells are read in row-major order as TP, FP, FN, TN.
        # NOTE(review): scikit-learn's confusion_matrix lays a binary matrix out
        # as [[TN, FP], [FN, TP]]. If the CSVs were produced that way, accuracy
        # is unaffected but F1 is computed for the other class — confirm.
        TP, FP, FN, TN = (float(cell) for cell in cells[:4])

        total = TP + FP + FN + TN
        accuracy = (TP + TN)/total if total != 0 else 0
        precision = TP/(TP + FP) if (TP + FP) != 0 else 0
        recall = TP/(TP + FN) if (TP + FN) != 0 else 0
        f1Score = 2 * ((precision*recall)/(precision+recall)) if (precision+recall) != 0 else 0

        results["Accuracy"].append(accuracy)
        results["F1 Score"].append(f1Score)

    return results

## RECOLA

In [4]:
# Build the RECOLA table of t-test p-values: every supervised baseline
# (model x input x target) is compared with every semi-supervised
# configuration (method x labelled count) on per-fold accuracy and F1 score.

# Different Parameters
inputs = ["Audio", "Visual", "Phys", "All"]
targets = ["Arousal", "Valence"]
baseEstimator = ["BLR", "RF", "NN"]
labelledCount = ['4', "8", "12"]

supervisedModels = ["Binary Logistic Regression", "Random Forest", "Neural Network"]
semiSupervisedModels = ["Co-Training", "Tri-Training", "SSGMM", "Assemble", "SemiBoost"]

# supervisedModels and baseEstimator are index-aligned: full name -> file-name code.
baseCodeByModel = dict(zip(supervisedModels, baseEstimator))
# Columns dropped from every per-fold frame after loading; only "confusionMatrix" is consumed.
unusedColumns = ["trainAccuracy", "trainPrecision", "testAccuracy", "testPrecision"]

significanceRows = []
for model in supervisedModels:
    baseCode = baseCodeByModel[model]

    # The folder only depends on the model, so list it once per model.
    folderLocation = "Datasets/RECOLA/Supervised Models/" + model + "/Per Fold Results"
    listOfFiles = os.listdir(folderLocation)

    for modality in inputs:
        for target in targets:
            # Supervised
            supervisedTest = baseCode + "_" + modality + "_" + target

            # Substring match on the file name; the last match in listing order wins.
            fileName = None
            for file in listOfFiles:
                if modality in file and target in file:
                    fileName = file
            if fileName is None:
                raise FileNotFoundError(
                    "No supervised result for " + supervisedTest + " in " + folderLocation
                )

            superFrame = pd.read_csv(folderLocation + "/" + fileName).drop(columns=unusedColumns)
            supervisedResults = calculateResults(superFrame)

            # Semi-Supervised
            for semiModel in semiSupervisedModels:
                # The Assemble / neural-network combination is skipped.
                if baseCode == "NN" and semiModel == "Assemble":
                    continue

                semiFolderLocation = "Datasets/RECOLA/Semi-Supervised Models/" + semiModel + "/Per Fold Results"
                listOfSemiFiles = os.listdir(semiFolderLocation)

                for count in labelledCount:
                    semiSupervisedTest = semiModel + "_" + count

                    # NOTE(review): matching is by substring, so a count such as
                    # "4" also matches any file name containing that digit
                    # elsewhere — confirm the file names are unambiguous.
                    semiFileName = None
                    for file in listOfSemiFiles:
                        if not (modality in file and target in file and count in file):
                            continue
                        # SSGMM files are matched without a base-estimator code.
                        if baseCode in file or semiModel == "SSGMM":
                            semiFileName = file
                    if semiFileName is None:
                        raise FileNotFoundError(
                            "No semi-supervised result for " + supervisedTest + " - "
                            + semiSupervisedTest + " in " + semiFolderLocation
                        )

                    semiSuperFrame = pd.read_csv(semiFolderLocation + "/" + semiFileName).drop(columns=unusedColumns)
                    semiSupervisedResults = calculateResults(semiSuperFrame)

                    # Independent two-sample t-test over the per-row metric values.
                    accuracyPValue = scipy.stats.ttest_ind(supervisedResults["Accuracy"], semiSupervisedResults["Accuracy"]).pvalue
                    f1PValue = scipy.stats.ttest_ind(supervisedResults["F1 Score"], semiSupervisedResults["F1 Score"]).pvalue

                    # NOTE(review): the flag is 1 when p > 0.05, i.e. when the
                    # two result sets are NOT significantly different at the 5%
                    # level — confirm this is the intended meaning of the
                    # "Significance" columns.
                    significanceRows.append({
                        "Test Name": supervisedTest + " - " + semiSupervisedTest,
                        "Accuracy": accuracyPValue,
                        "F1 Score": f1PValue,
                        "Significance: Accuracy": 1 if accuracyPValue > 0.05 else 0,
                        "Significance: F1 Score": 1 if f1PValue > 0.05 else 0,
                    })

finalFrame = pd.DataFrame(significanceRows)
os.makedirs("Results", exist_ok=True)  # to_csv does not create missing directories
finalFrame.to_csv("Results/RECOLA_Significance.csv", index=False)

## AGAIN

In [5]:
# Build the AGAIN table of t-test p-values: every supervised baseline
# (model x game) is compared with every semi-supervised configuration
# (method x labelled count) on per-fold accuracy and F1 score.

# Different Parameters
games = ["TopDown", "Shootout", "Heist!"]
baseEstimator = ["BLR", "RF", "NN"]
labelledCount = ['4', "8", "12"]

supervisedModels = ["Binary Logistic Regression", "Random Forest", "Neural Network"]
semiSupervisedModels = ["Co-Training", "Tri-Training", "SSGMM", "Assemble", "SemiBoost"]

# supervisedModels and baseEstimator are index-aligned: full name -> file-name code.
baseCodeByModel = dict(zip(supervisedModels, baseEstimator))
# Columns dropped from every per-fold frame after loading; only "confusionMatrix" is consumed.
unusedColumns = ["trainAccuracy", "trainPrecision", "testAccuracy", "testPrecision"]

significanceRows = []
for model in supervisedModels:
    baseCode = baseCodeByModel[model]

    for game in games:
        # Supervised
        supervisedTest = baseCode + "_" + game

        folderLocation = "Datasets/AGAIN/Supervised Models/" + game + "/" + model + "/Per Fold Results"
        listOfFiles = os.listdir(folderLocation)

        # Substring match on the file name; the last match in listing order wins.
        fileName = None
        for file in listOfFiles:
            if game in file:
                fileName = file
        if fileName is None:
            raise FileNotFoundError(
                "No supervised result for " + supervisedTest + " in " + folderLocation
            )

        superFrame = pd.read_csv(folderLocation + "/" + fileName).drop(columns=unusedColumns)
        supervisedResults = calculateResults(superFrame)

        # Semi-Supervised
        for semiModel in semiSupervisedModels:
            # The Assemble / neural-network combination is skipped.
            if baseCode == "NN" and semiModel == "Assemble":
                continue

            # The folder does not depend on the labelled count, so list it once.
            semiFolderLocation = "Datasets/AGAIN/Semi-Supervised Models/" + game + "/" + semiModel + "/Per Fold Results"
            listOfSemiFiles = os.listdir(semiFolderLocation)

            for count in labelledCount:
                semiSupervisedTest = semiModel + "_" + count

                # NOTE(review): matching is by substring, so a count such as
                # "4" also matches any file name containing that digit
                # elsewhere — confirm the file names are unambiguous.
                semiFileName = None
                for file in listOfSemiFiles:
                    if not (game in file and count in file):
                        continue
                    # SSGMM files are matched without a base-estimator code.
                    if baseCode in file or semiModel == "SSGMM":
                        semiFileName = file
                if semiFileName is None:
                    raise FileNotFoundError(
                        "No semi-supervised result for " + supervisedTest + " - "
                        + semiSupervisedTest + " in " + semiFolderLocation
                    )

                semiSuperFrame = pd.read_csv(semiFolderLocation + "/" + semiFileName).drop(columns=unusedColumns)
                semiSupervisedResults = calculateResults(semiSuperFrame)

                # Independent two-sample t-test over the per-row metric values.
                accuracyPValue = scipy.stats.ttest_ind(supervisedResults["Accuracy"], semiSupervisedResults["Accuracy"]).pvalue
                f1PValue = scipy.stats.ttest_ind(supervisedResults["F1 Score"], semiSupervisedResults["F1 Score"]).pvalue

                # NOTE(review): the flag is 1 when p > 0.05, i.e. when the two
                # result sets are NOT significantly different at the 5% level —
                # confirm this is the intended meaning of the "Significance"
                # columns.
                significanceRows.append({
                    "Test Name": supervisedTest + " - " + semiSupervisedTest,
                    "Accuracy": accuracyPValue,
                    "F1 Score": f1PValue,
                    "Significance: Accuracy": 1 if accuracyPValue > 0.05 else 0,
                    "Significance: F1 Score": 1 if f1PValue > 0.05 else 0,
                })

finalFrame = pd.DataFrame(significanceRows)
os.makedirs("Results", exist_ok=True)  # to_csv does not create missing directories
finalFrame.to_csv("Results/AGAIN_Significance.csv", index=False)

## Evaluate

In [6]:
def countChecker(list):
    """Return (number of entries equal to 1, total number of entries).

    The parameter keeps its original name, which shadows the builtin ``list``
    inside this function.
    """
    totalLength = len(list)
    passCounter = sum(1 for entry in list if entry == 1)
    return passCounter, totalLength

In [33]:
def countCalculator(nameList, binaryList):
    """Tally flagged rows overall and per semi-supervised method.

    A row belongs to a method when the method's name occurs as a substring of
    the row's entry in ``nameList``; a row is flagged when its entry in
    ``binaryList`` equals 1.

    Returns a 12-tuple:
        (totalLength, overallCount,
         <flagged count for Co-Training, Tri-Training, SSGMM, Assemble, SemiBoost>,
         <row total for Co-Training, Tri-Training, SSGMM, Assemble, SemiBoost>)
    """
    methodNames = ("Co-Training", "Tri-Training", "SSGMM", "Assemble", "SemiBoost")
    flaggedPerMethod = dict.fromkeys(methodNames, 0)
    rowsPerMethod = dict.fromkeys(methodNames, 0)
    overallCount = 0

    for position, flag in enumerate(binaryList):
        testName = nameList[position]
        isFlagged = flag == 1
        if isFlagged:
            overallCount += 1

        for method in methodNames:
            if method in testName:
                rowsPerMethod[method] += 1
                if isFlagged:
                    flaggedPerMethod[method] += 1

    return (
        len(binaryList),
        overallCount,
        *(flaggedPerMethod[method] for method in methodNames),
        *(rowsPerMethod[method] for method in methodNames),
    )


In [44]:
# Print, for each dataset, how many comparisons have a "Significance: Accuracy"
# flag of 1, overall and per semi-supervised method.
# NOTE(review): the cells that write these CSVs set the flag to 1 when the
# t-test p-value is > 0.05, so a high ratio here means "not significantly
# different from the supervised baseline" — confirm the wording of the labels.
significanceReports = [
    ("R E C O L A: Statistical Significance Evaluation", "Results/RECOLA_Significance.csv"),
    ("\n\nA G A I N: Statistical Significance Evaluation", "Results/AGAIN_Significance.csv"),
]

for heading, csvLocation in significanceReports:
    print(heading); frame = pd.read_csv(csvLocation)

    (totalLength, overallCount,
     CoTrainingCount, TriTrainingCount, SSGMMCount, AssembleCount, SemiBoostCount,
     CoTrainingTotal, TriTrainingTotal, SSGMMTotal, AssembleTotal, SemiBoostTotal) = countCalculator(
        list(frame["Test Name"]), list(frame["Significance: Accuracy"]))

    reportLines = [
        ("Overall", overallCount, totalLength),
        ("Co-Training", CoTrainingCount, CoTrainingTotal),
        ("Tri-Training", TriTrainingCount, TriTrainingTotal),
        ("SSGMM", SSGMMCount, SSGMMTotal),
        ("Assemble", AssembleCount, AssembleTotal),
        ("SemiBoost", SemiBoostCount, SemiBoostTotal),
    ]
    for label, passed, total in reportLines:
        # A group with no rows would otherwise raise ZeroDivisionError.
        percentage = round((passed/total)*100, 2) if total != 0 else 0.0
        print(label + " Significance: " + str(passed) + "/" + str(total) + " (" + str(percentage) + "%)")

R E C O L A: Statistical Significance Evaluation
Overall Significance: 271/336 (80.65%)
Co-Training Significance: 68/72 (94.44%)
Tri-Training Significance: 71/72 (98.61%)
SSGMM Significance: 38/72 (52.78%)
Assemble Significance: 43/48 (89.58%)
SemiBoost Significance: 51/72 (70.83%)


A G A I N: Statistical Significance Evaluation
Overall Significance: 80/126 (63.49%)
Co-Training Significance: 22/27 (81.48%)
Tri-Training Significance: 26/27 (96.3%)
SSGMM Significance: 0/27 (0.0%)
Assemble Significance: 18/18 (100.0%)
SemiBoost Significance: 14/27 (51.85%)
