In [41]:
import pandas as pd
import numpy as np
from scipy import stats

from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

from sklearn.metrics import r2_score, balanced_accuracy_score

In [42]:
# Ground-truth table shared by every experiment below. load_raw_data() picks the
# `id` column plus one target column from it and joins them onto a feature file.
# The path is relative, so the notebook must be run from the directory that
# contains `data/`.
LABELS = pd.read_csv('data/labels.csv')
# Single oversampler instance reused by train_and_test() to resample the training
# split of classification targets. It is built with default arguments (no explicit
# random_state is passed here).
SMOTE = BorderlineSMOTE()

In [43]:
def load_raw_data(csv_path: str, feat_to_predict: str):
    """Load a feature CSV, attach one ground-truth column and standardise the features.

    The file at ``csv_path`` is left-joined on ``id`` with the ``id`` and
    ``feat_to_predict`` columns of the module-level ``LABELS`` table. Rows with a
    missing value in *any* column (feature or label) are dropped, and every
    remaining non-``id`` column is z-scored with ``StandardScaler``.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file containing an ``id`` column plus feature columns
        (presumably all numeric, since they are passed to ``StandardScaler``).
    feat_to_predict : str
        Name of the ``LABELS`` column to use as the target. ``'Sex_Category'`` is
        additionally encoded as ``'Male' -> 0`` and ``'Female' -> 1``.

    Returns
    -------
    data : pandas.DataFrame
        Standardised feature columns, indexed by ``id``.
    labels : pandas.DataFrame
        Single-column frame holding the target. It has the same row order as
        ``data`` but keeps the merged frame's index rather than ``id``, so the
        two are aligned by position, not by index label.

    Raises
    ------
    ValueError
        If ``feat_to_predict`` is ``'Sex_Category'`` and the column contains a
        value other than ``'Male'`` or ``'Female'``.
    """
    features_raw = pd.read_csv(csv_path)

    # Join labels onto the features so the truth values are aligned with the
    # feature rows, then discard every row that has a gap anywhere.
    truth = LABELS[['id', feat_to_predict]]
    merged = features_raw.merge(truth, on='id', how='left').dropna()

    # Split the merged frame back into target and features.
    labels = merged[[feat_to_predict]].copy()
    data = merged.drop(columns=feat_to_predict).set_index('id')

    # NOTE(review): the scaler is fit on all rows here, i.e. before any
    # train/test split performed by the caller.
    data = pd.DataFrame(StandardScaler().fit_transform(data), columns=data.columns, index=data.index)

    if feat_to_predict == 'Sex_Category':
        encoded = labels['Sex_Category'].map({'Male': 0, 'Female': 1})
        # .map() turns unrecognised categories into NaN. dropna() has already
        # run, so fail loudly here instead of handing NaN labels to the models.
        unknown = labels.loc[encoded.isna(), 'Sex_Category'].unique()
        if len(unknown):
            raise ValueError(
                f"Unexpected Sex_Category value(s) {list(unknown)!r}; expected 'Male' or 'Female'"
            )
        labels['Sex_Category'] = encoded

    return data, labels

In [44]:
def train_and_test(X, y, model, scoring_f, smote, test_size=0.3, random_state=None):
    """Fit ``model`` on one random train/test split and score it on the held-out part.

    Parameters
    ----------
    X, y
        Features and target, passed straight to ``train_test_split``.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object is modified.
    scoring_f : callable
        Called as ``scoring_f(y_test, y_pred)``; its result is returned unchanged.
    smote
        Truthy to stratify the split on ``y`` and oversample the training part
        with the module-level ``SMOTE`` sampler; falsy for a plain split. The
        test part is never resampled.
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Seed for the split and, when ``smote`` is truthy, for the oversampler.
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    Whatever ``scoring_f`` returns for the held-out predictions.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        stratify=y if smote else None,
        random_state=random_state,
    )

    if smote:
        if random_state is None:
            sampler = SMOTE
        else:
            # Seed a copy so the shared module-level sampler is left untouched.
            from sklearn.base import clone
            sampler = clone(SMOTE).set_params(random_state=random_state)
        X_train, y_train = sampler.fit_resample(X_train, y_train)

    model.fit(X_train, y_train)
    return scoring_f(y_test, model.predict(X_test))


In [45]:
def evaluate(dataset: str, target: str, features: list[str], n_runs: int = 1000, seed=None):
    """Repeatedly train/test a model for ``target`` and print score statistics.

    Loads ``data/<dataset>`` through ``load_raw_data``, keeps only ``features``,
    and calls ``train_and_test`` ``n_runs`` times. A target with more than two
    distinct values is treated as a regression problem (``LinearRegression``
    scored with ``r2_score``); otherwise it is a classification problem (``SVC``
    scored with ``balanced_accuracy_score``, with oversampling enabled).

    Parameters
    ----------
    dataset : str
        File name inside the ``data/`` directory.
    target : str
        Target column name, forwarded to ``load_raw_data``.
    features : list[str]
        Feature columns to keep.
    n_runs : int, default 1000
        Number of independent random splits to evaluate.
    seed : int or None, default None
        When given, NumPy's global random generator is reseeded before the runs
        so the printed statistics can be reproduced. scikit-learn (and
        imbalanced-learn, which follows the same convention) draws from that
        global generator when ``random_state`` is left at ``None``. This is a
        process-wide side effect. ``None`` leaves the generator alone.

    Returns
    -------
    None. The mean, standard deviation, skewness and kurtosis of the scores are
    printed, followed by a blank line.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f'n_runs must be at least 1, got {n_runs}')
    if seed is not None:
        np.random.seed(seed)

    X, y = load_raw_data(f'data/{dataset}', target)
    X = X[features]
    y_target = y[target]

    # Heuristic: more than two distinct target values means regression.
    regr = y_target.nunique() > 2

    model = LinearRegression() if regr else SVC()
    scoring_f = r2_score if regr else balanced_accuracy_score
    # The classification label reads "Accuracy", but the scorer above is
    # balanced accuracy.
    metric = 'R2 Score' if regr else "Accuracy"

    # The same estimator object is refitted on every split.
    scores = [train_and_test(X, y_target, model, scoring_f, not regr) for _ in range(n_runs)]

    summary = (
        ('Average:', np.mean(scores)),
        ('StDev:', np.std(scores)),
        ('Skew:', stats.skew(scores)),
        ('Kurtosis:', stats.kurtosis(scores)),
    )
    for label, value in summary:
        # The ' ' sign flag reserves one column for the sign, so positive and
        # negative values line up.
        print(f'  - {metric} {label:<11}{value: .4f}')
    print()

# MOCA_impairment features

In [46]:
# Feature file (inside data/) and the columns of it to use as predictors.
dataset = 'no_audio_corrected.csv'
feats = [
    '# unique tokens (participant)',
    '# DATE (participant)',
    '# AUX (participant)',
    '# CCONJ (participant)',
    'RatioVerb',
    '# VERB (participant)',
    'proportion_below_threshold_0.5',
    'VP_to_AUX_ADJP (participant)',
]

# Run the same feature subset against each target: (printed heading, target column).
for heading, target_col in (
    ('Age', 'Age_at_testing'),
    ('Gender', 'Sex_Category'),
    ('Education', 'Educ'),
):
    print(heading)
    evaluate(dataset, target_col, feats)


Age
  - R2 Score Average:   -0.2687
  - R2 Score StDev:      0.2610
  - R2 Score Skew:      -2.8738
  - R2 Score Kurtosis:   15.7814

Gender
  - Accuracy Average:    0.6430
  - Accuracy StDev:      0.0754
  - Accuracy Skew:      -0.0327
  - Accuracy Kurtosis:  -0.0260

Education
  - R2 Score Average:   -0.2641
  - R2 Score StDev:      0.2630
  - R2 Score Skew:      -1.5857
  - R2 Score Kurtosis:   3.4251



# AB<sub>42</sub>/AB<sub>40</sub>

In [47]:
# Feature file (inside data/) and the columns of it to use as predictors.
dataset = 'no_audio_corrected.csv'
feats = [
    'RatioVerb',
    'RatioNoun',
    'VP_to_AUX_VP (participant)',
    '# PROPN (participant)',
    'MATTR (participant)',
    '# NUM (participant)',
    '# TIME (participant)',
    '# unique tokens (participant)',
    'VPTypeRate',
]

# Run the same feature subset against each target: (printed heading, target column).
for heading, target_col in (
    ('Age', 'Age_at_testing'),
    ('Gender', 'Sex_Category'),
    ('Education', 'Educ'),
):
    print(heading)
    evaluate(dataset, target_col, feats)

Age
  - R2 Score Average:   -0.4386
  - R2 Score StDev:      0.4782
  - R2 Score Skew:      -1.8909
  - R2 Score Kurtosis:   4.4367

Gender
  - Accuracy Average:    0.6371
  - Accuracy StDev:      0.0777
  - Accuracy Skew:      -0.0543
  - Accuracy Kurtosis:  -0.0682

Education
  - R2 Score Average:   -0.3104
  - R2 Score StDev:      0.3068
  - R2 Score Skew:      -2.0742
  - R2 Score Kurtosis:   6.6792



# tTau/AB<sub>42</sub>

In [48]:
# Feature file (inside data/) and the columns of it to use as predictors.
dataset = 'full_corrected.csv'
feats = [
    'PU',
    'UP',
    '1F0std',
    'avgdurvoiced',
    'stddurpause',
    'maxdurpause',
    'PVU',
    'VP',
]

# Run the same feature subset against each target: (printed heading, target column).
for heading, target_col in (
    ('Age', 'Age_at_testing'),
    ('Gender', 'Sex_Category'),
    ('Education', 'Educ'),
):
    print(heading)
    evaluate(dataset, target_col, feats)

Age
  - R2 Score Average:   -0.4250
  - R2 Score StDev:      0.3498
  - R2 Score Skew:      -1.9450
  - R2 Score Kurtosis:   7.3815

Gender
  - Accuracy Average:    0.4978
  - Accuracy StDev:      0.0749
  - Accuracy Skew:      -0.1279
  - Accuracy Kurtosis:  -0.1198

Education
  - R2 Score Average:   -0.3439
  - R2 Score StDev:      0.7508
  - R2 Score Skew:      -7.0141
  - R2 Score Kurtosis:   71.3173



# pTau

In [49]:
# Feature file (inside data/) and the columns of it to use as predictors.
dataset = 'full_corrected.csv'
feats = [
    'PU',
    'UP',
    'Vrate',
    'VP_to_AUX_VP (participant)',
    'VP_to_AUX (participant)',
    'skwdurvoiced',
    'kurtosisdurvoiced',
    'RatioVerb',
]

# Run the same feature subset against each target: (printed heading, target column).
for heading, target_col in (
    ('Age', 'Age_at_testing'),
    ('Gender', 'Sex_Category'),
    ('Education', 'Educ'),
):
    print(heading)
    evaluate(dataset, target_col, feats)

Age
  - R2 Score Average:   -0.4530
  - R2 Score StDev:      0.3845
  - R2 Score Skew:      -3.1088
  - R2 Score Kurtosis:   23.1970

Gender
  - Accuracy Average:    0.7202
  - Accuracy StDev:      0.0724
  - Accuracy Skew:      -0.2556
  - Accuracy Kurtosis:  -0.1049

Education
  - R2 Score Average:   -0.2808
  - R2 Score StDev:      0.2344
  - R2 Score Skew:      -1.7216
  - R2 Score Kurtosis:   4.8241

