In [205]:
import sklearn
import statistics
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_predict, GroupKFold




In [206]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [207]:
# Leave-one-group-out splitter; `cv` is the name read by modelPredictions().
cv = logo = LeaveOneGroupOut()

In [208]:
def getDataset(path):
    """Load the CSV at `path` and return it as a pandas DataFrame."""
    return pd.read_csv(path)


In [209]:
def getX(df_weights, start=21, stop=None):
    """Return the feature columns of `df_weights` as a NumPy array.

    The feature block is selected positionally as ``df_weights.iloc[:, start:stop]``.
    The defaults (``start=21, stop=None``) keep the previous behaviour of taking
    every column from position 21 onwards (verbal + acoustic features).

    Column windows previously toggled by editing this function:
      * verbal only:        ``start=109``
      * acoustic only:      ``start=21, stop=109``
      * verbal + acoustic:  ``start=21`` (default)

    Parameters
    ----------
    df_weights : pandas.DataFrame
        Full dataset as returned by getDataset().
    start : int, optional
        Position of the first feature column (inclusive).
    stop : int or None, optional
        Position one past the last feature column; None means "to the end".

    Returns
    -------
    numpy.ndarray
        2-D array with one row per DataFrame row and one column per selected feature.
    """
    return df_weights.iloc[:, start:stop].to_numpy()

In [210]:
def getUtteranceID(df_weights):
    """Return the 'utteranceID' column of `df_weights` as a flat 1-D NumPy array."""
    id_column = df_weights[['utteranceID']]
    return id_column.to_numpy().reshape(-1)

In [211]:
def getGroups(df_weights):
    """Return the 'Group' column of `df_weights` as a flat 1-D NumPy array."""
    group_column = df_weights[['Group']]
    return group_column.to_numpy().reshape(-1)


In [212]:
def getGroupIndices(groups):
    """Map each distinct group label to the list of row positions carrying it.

    Keys appear in the order produced by np.unique(groups); each value is the
    list of indices i (ascending) for which groups[i] equals that key.
    """
    positions_by_group = {label: [] for label in np.unique(groups)}
    for row in range(len(groups)):
        positions_by_group[groups[row]].append(row)
    return positions_by_group

In [204]:
# Paths to the feature CSVs that are handed to getDataset() via the model-scoring helpers.
# NOTE(review): bert_features and prosodic_features point at the same file, so runs
# labelled "BERT" and "Prosodic" read identical data -- confirm this is intended.
bert_features = r"WhisperFin/Whisper_Prosodic/Whisper_Final_Prosodic.csv"
prosodic_features = r"WhisperFin/Whisper_Prosodic/Whisper_Final_Prosodic.csv"
# NOTE(review): bp_features is commented out (absolute, machine-specific path), yet the
# compareThreeModels(...) call at the end of the notebook still references it; that call
# will raise NameError until bp_features is defined.
# bp_features = r"C:\Users\Bbykitty\OneDrive - Colostate\Research\iSAT-CSU\bert-opensmile_cps.csv"

In [213]:
def getBinaryCVScores(clf, X, y, groups, num_classes):
    """Cross-validate a one-vs-rest model and report per-group AUROC for each class.

    `clf` is wrapped in a sigmoid CalibratedClassifierCV (cv=5) and then in a
    OneVsRestClassifier. Out-of-fold probabilities are produced with a 5-split
    GroupKFold over `groups`, after which an AUROC is computed separately for
    every (group, class) pair.

    Parameters
    ----------
    clf : estimator
        Base scikit-learn classifier.
    X : array-like
        Feature matrix, one row per sample.
    y : numpy.ndarray
        2-D label array indexed as ``y[rows, class_id]``.
    groups : array-like
        Group label for each row; used both for the CV split and for the
        per-group scoring.
    num_classes : int
        Number of label columns to score (class ids ``0 .. num_classes - 1``).

    Returns
    -------
    roc_auc_ovr : dict
        ``{group: {class_id: auroc}}``. A pair whose AUROC could not be computed
        (roc_auc_score raised ValueError) is stored as NaN rather than 0, so a
        genuine AUROC of 0.0 is not mistaken for a failure.
    roc_avg : dict
        ``{class_id: mean AUROC over the groups with a valid score}``; 0 when no
        group produced a valid score.
    roc_sd : dict
        ``{class_id: sample standard deviation of those AUROCs}``; 0 when fewer
        than two valid scores exist.
    """
    group_kfold = GroupKFold(n_splits=5)
    calibrated_clf = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
    ovr_clf = OneVsRestClassifier(calibrated_clf)

    # Out-of-fold class probabilities for every row
    y_scores = cross_val_predict(ovr_clf, X, y, groups=groups, cv=group_kfold, method='predict_proba', n_jobs=-1)

    roc_auc_ovr = {}
    roc_avg = {}
    roc_sd = {}
    group_indices = getGroupIndices(groups)
    for groupname, row_ids in group_indices.items():
        roc_auc_ovr[groupname] = {}
        for class_id in range(num_classes):
            # Restrict scores and labels to this group's rows for the current class
            prob_tmp = y_scores[row_ids, class_id]
            true_max_tmp = y[row_ids, class_id]
            try:
                roc_auc_ovr[groupname][class_id] = roc_auc_score(true_max_tmp, prob_tmp)
            except ValueError as e:
                print(f"Issue with class {class_id}: {e}")
                # NaN marks "no score"; using 0 here would collide with a real AUROC of 0.0
                roc_auc_ovr[groupname][class_id] = np.nan
    for class_id in range(num_classes):
        # Keep every valid score (including 0.0); drop only the NaN placeholders
        class_aurocs = [
            per_class[class_id]
            for per_class in roc_auc_ovr.values()
            if not np.isnan(per_class[class_id])
        ]
        roc_sd[class_id] = statistics.stdev(class_aurocs) if len(class_aurocs) > 1 else 0
        roc_avg[class_id] = sum(class_aurocs) / len(class_aurocs) if class_aurocs else 0
        print(f"Class {class_id} Aurocs: {class_aurocs}")
        print(f"Average Class {class_id}: {roc_avg[class_id]}")

    return roc_auc_ovr, roc_avg, roc_sd


In [214]:
def getYBinary(df_weights):
    """Collapse the 19 fine-grained label columns into three binary facet labels.

    The label block is ``df_weights.iloc[:, 2:21]``. Each output row is
    ``[const, neg, maintain]``, where a facet is 1 if any of its label columns
    equals 1 in that row and 0 otherwise.

    Facet membership is decided by column-name prefix ('CPS_CONST_', 'CPS_NEG_',
    'CPS_MAINTAIN_') rather than by position, because the order of the NEG and
    MAINTAIN columns differs between the CSV files this notebook has been run
    on; slicing by position silently swaps the two facets for some files. If
    the label columns do not carry all three prefixes, the original positional
    layout (0:5 const, 5:12 neg, 12: maintain) is used as a fallback.

    Parameters
    ----------
    df_weights : pandas.DataFrame
        Full dataset as returned by getDataset().

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_rows, 3) with columns [const, neg, maintain].
    """
    label_df = df_weights.iloc[:, 2:21]
    print(label_df.columns)
    label_values = label_df.to_numpy()
    column_names = [str(name) for name in label_df.columns]

    # Output column order is fixed: const, neg, maintain
    facet_prefixes = ('CPS_CONST_', 'CPS_NEG_', 'CPS_MAINTAIN_')
    facet_columns = [
        [pos for pos, name in enumerate(column_names) if name.startswith(prefix)]
        for prefix in facet_prefixes
    ]
    if not all(facet_columns):
        # Prefixes unavailable: fall back to the legacy positional layout
        all_positions = list(range(label_values.shape[1]))
        facet_columns = [all_positions[:5], all_positions[5:12], all_positions[12:]]

    y = np.column_stack([
        (label_values[:, positions] == 1).any(axis=1).astype(int)
        for positions in facet_columns
    ])
    print(y.shape)
    return y

In [215]:
def getBinaryModelScores(clf, path, num_classes):
    """Load the dataset at `path`, score `clf` with getBinaryCVScores, and print the results.

    Returns the (per-group AUROCs, per-class mean, per-class standard deviation)
    triple produced by getBinaryCVScores.
    """
    frame = getDataset(path)
    labels = getYBinary(frame)
    features = getX(frame)
    group_ids = getGroups(frame)

    per_group, mean_by_class, sd_by_class = getBinaryCVScores(
        clf, features, labels, group_ids, num_classes
    )

    print("Group Results: ", per_group)
    print("Standard Deviation: ", sd_by_class)
    return per_group, mean_by_class, sd_by_class

In [216]:
# Linear SVM; getBinaryCVScores wraps it in a calibrated one-vs-rest model.
p_bi_svc = LinearSVC(
    C=1.0,
    max_iter=1000,
    random_state=1,
)

# Score the three facet labels on the file referenced by bert_features.
p_bi_svc_folds, p_bi_svc_avg, p_bi_svc_sd = getBinaryModelScores(
    clf=p_bi_svc,
    path=bert_features,
    num_classes=3,
)

Index(['CPS_CONST_EstablishesCG_Confirms',
       'CPS_CONST_EstablishesCG_Interrupts',
       'CPS_CONST_SharesU_CorrectSolutions',
       'CPS_CONST_SharesU_IncorrectSolutions', 'CPS_CONST_SharesU_Situation',
       'CPS_MAINTAIN_FulfillsR_Apologizes',
       'CPS_MAINTAIN_FulfillsR_InitiatesOffTopic',
       'CPS_MAINTAIN_FulfillsR_JoinsOffTopic',
       'CPS_MAINTAIN_FulfillsR_Support', 'CPS_MAINTAIN_Initiative_Compliments',
       'CPS_MAINTAIN_Initiative_Criticizes',
       'CPS_MAINTAIN_Initiative_Suggestions', 'CPS_NEG_MonitorsE_GivingUp',
       'CPS_NEG_MonitorsE_Results', 'CPS_NEG_MonitorsE_Save',
       'CPS_NEG_MonitorsE_Strategizes', 'CPS_NEG_Responds_QuestionsOthers',
       'CPS_NEG_Responds_Reasons', 'CPS_NEG_Responds_Responds'],
      dtype='object')
(2932, 3)
Class 0 Aurocs: [0.5733450740702373, 0.5526315789473684, 0.5735042735042735, 0.5754583254583254, 0.5889787664307382, 0.5598783299808664, 0.5753569539925967, 0.5535991531404375, 0.623969703720205, 0.5845588235294

In [32]:
# Gradient-boosted trees, scored on the three facet labels of the prosodic feature file.
p_bi_gb = GradientBoostingClassifier(
    n_estimators=132,
    learning_rate=0.1,
    max_depth=3,
    max_features='sqrt',
    random_state=1,
)
p_bi_gb_folds, p_bi_gb_avg, p_bi_gb_sd = getBinaryModelScores(
    clf=p_bi_gb,
    path=prosodic_features,
    num_classes=3,
)

Index(['CPS_CONST_SharesU_Situation', 'CPS_CONST_SharesU_CorrectSolutions',
       'CPS_CONST_SharesU_IncorrectSolutions',
       'CPS_CONST_EstablishesCG_Confirms',
       'CPS_CONST_EstablishesCG_Interrupts', 'CPS_NEG_Responds_Reasons',
       'CPS_NEG_Responds_QuestionsOthers', 'CPS_NEG_Responds_Responds',
       'CPS_NEG_MonitorsE_Results', 'CPS_NEG_MonitorsE_Strategizes',
       'CPS_NEG_MonitorsE_Save', 'CPS_NEG_MonitorsE_GivingUp',
       'CPS_MAINTAIN_Initiative_Suggestions',
       'CPS_MAINTAIN_Initiative_Compliments',
       'CPS_MAINTAIN_Initiative_Criticizes', 'CPS_MAINTAIN_FulfillsR_Support',
       'CPS_MAINTAIN_FulfillsR_Apologizes',
       'CPS_MAINTAIN_FulfillsR_InitiatesOffTopic',
       'CPS_MAINTAIN_FulfillsR_JoinsOffTopic'],
      dtype='object')
(1822, 3)
Class 0 Aurocs: [0.8295855379188712, 0.7764057750759877, 0.798901098901099, 0.8542857142857142, 0.8696330991412959, 0.8774995496306972, 0.7668985471558729, 0.8213095699341341, 0.8392857142857143, 0.8937833714721

In [13]:
# NOTE(review): the parameter dict recorded below differs from the arguments actually
# passed to RandomForestClassifier -- confirm which configuration is intended.
#{'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 164, 'type': 'randomforest'}
p_bi_rf = RandomForestClassifier(
    n_estimators=132,
    criterion="gini",
    max_features='sqrt',
    random_state=1,
)
p_bi_rf_folds, p_bi_rf_avg, p_bi_rf_sd = getBinaryModelScores(
    clf=p_bi_rf,
    path=prosodic_features,
    num_classes=3,
)

Index(['CPS_CONST_SharesU_Situation', 'CPS_CONST_SharesU_CorrectSolutions',
       'CPS_CONST_SharesU_IncorrectSolutions',
       'CPS_CONST_EstablishesCG_Confirms',
       'CPS_CONST_EstablishesCG_Interrupts', 'CPS_NEG_Responds_Reasons',
       'CPS_NEG_Responds_QuestionsOthers', 'CPS_NEG_Responds_Responds',
       'CPS_NEG_MonitorsE_Results', 'CPS_NEG_MonitorsE_Strategizes',
       'CPS_NEG_MonitorsE_Save', 'CPS_NEG_MonitorsE_GivingUp',
       'CPS_MAINTAIN_Initiative_Suggestions',
       'CPS_MAINTAIN_Initiative_Compliments',
       'CPS_MAINTAIN_Initiative_Criticizes', 'CPS_MAINTAIN_FulfillsR_Support',
       'CPS_MAINTAIN_FulfillsR_Apologizes',
       'CPS_MAINTAIN_FulfillsR_InitiatesOffTopic',
       'CPS_MAINTAIN_FulfillsR_JoinsOffTopic'],
      dtype='object')
(1294, 3)
Class 0 Aurocs: [0.784688995215311, 0.6769495412844037, 0.7301240608072689, 0.8034534534534534, 0.8153142589118199, 0.8095538720538721, 0.7614274604683314, 0.8665624999999999]
Average Class 0: 0.7810092677743075

In [None]:
# NOTE(review): the parameter dict recorded below differs from the arguments actually
# passed to AdaBoostClassifier -- confirm which configuration is intended.
#{'algorithm': 'SAMME', 'learning_rate': 0.26326530612244897, 'n_estimators': 52}
# random_state=1 matches the seed used by every other model cell, so repeated runs are reproducible.
p_bi_ab = AdaBoostClassifier(n_estimators=85, learning_rate=1.0, algorithm="SAMME", random_state=1)
p_bi_ab_folds, p_bi_ab_avg, p_bi_ab_sd = getBinaryModelScores(p_bi_ab, prosodic_features, 3)

In [None]:
def modelPredictions(clf, features):
    """Return cross-validated predictions of `clf` on the dataset stored at `features`.

    Labels come from getYBinary, inputs from getX, and the split is the
    module-level `cv` splitter driven by the dataset's group column. The
    prediction array is printed before being returned.
    """
    frame = getDataset(features)
    targets = getYBinary(frame)
    inputs = getX(frame)
    group_ids = getGroups(frame)
    predictions = cross_val_predict(
        clf,
        inputs,
        targets,
        groups=group_ids,
        cv=cv,
        n_jobs=20,
        verbose=0,
    )
    print(predictions)
    return predictions

In [None]:
def compareTwoModels(clf, features, header, clf2, features2, header2):
    """Write the cross-validated predictions of two models side by side to a CSV.

    The file is named "<header>_vs_<header2>.csv"; its first row holds the two
    headers and each following row holds one prediction from each model. The
    number of rows is taken from the first model's predictions.
    """
    first = modelPredictions(clf, features)
    second = modelPredictions(clf2, features2)
    out_name = header + "_vs_" + header2 + ".csv"
    with open(out_name, 'w', newline='') as predictionsCSV:
        writer = csv.writer(predictionsCSV)
        writer.writerow([header, header2])
        writer.writerows(
            [first[row], second[row]] for row in range(len(first))
        )

In [None]:
def compareThreeModels(clf, features, header, clf2, features2, header2, clf3, features3, header3):
    """Write true labels and the cross-validated predictions of three models to a CSV.

    The file is named "<header>_vs_<header2>_vs_<header3>.csv". Its header row is
    ["Utterance", "True", header, header2, header3]; each following row holds the
    utterance id, the true label and one prediction per model. Utterance ids and
    true labels are read from the dataset at `features` (the first model's file),
    and the number of rows is taken from the first model's predictions.

    True labels are built with getYBinary, the same label builder that
    modelPredictions() trains on, so the "True" column is directly comparable
    with the prediction columns.
    """
    feature_df = getDataset(features)
    # getYBinary replaces a call to getY, which is not defined anywhere in this notebook
    y = getYBinary(feature_df)
    utteranceID = getUtteranceID(feature_df)
    predictions = modelPredictions(clf, features)
    predictions2 = modelPredictions(clf2, features2)
    predictions3 = modelPredictions(clf3, features3)
    with open(header + "_vs_" + header2 + "_vs_" + header3 + ".csv", 'w', newline = '') as predictionsCSV:
        writer = csv.writer(predictionsCSV)
        writer.writerow(["Utterance", "True", header, header2, header3])
        for utterance_num in range(len(predictions)):
            writer.writerow([
                utteranceID[utterance_num],
                y[utterance_num],
                predictions[utterance_num],
                predictions2[utterance_num],
                predictions3[utterance_num],
            ])

In [None]:
# Three random forests, one per feature set, compared utterance by utterance.
b_mn_rf = RandomForestClassifier(
    n_estimators=68, criterion="entropy", max_features='sqrt', random_state=1,
)
p_mn_rf = RandomForestClassifier(
    n_estimators=164, criterion="entropy", max_features='log2', random_state=1,
)
bp_mn_rf = RandomForestClassifier(
    n_estimators=148, criterion="entropy", max_features='log2', random_state=1,
)
# NOTE(review): bp_features is only present as a commented-out assignment in the
# path cell, so this call raises NameError unless it is defined first.
compareThreeModels(
    b_mn_rf, bert_features, "BERT",
    p_mn_rf, prosodic_features, "Prosodic",
    bp_mn_rf, bp_features, "BERT-Prosodic",
)