In [None]:
import pandas as pd
import numpy as np

# Split dataframe into features (X) and target (y)
def X_y_split(df, target="CDR", drop_cols=["Delay", "Subject ID", "Group", "Visit", "Cohort"]):
    """Separate a dataframe into a feature table and a target column.

    Parameters
    ----------
    df : DataFrame to split. It is left untouched; the work happens on the
        new frame returned by ``drop``.
    target : label of the column returned as the target.
    drop_cols : column labels discarded before the split.

    Returns
    -------
    (features, labels) : the remaining columns without ``target``, and the
        ``target`` column itself.
    """
    features = df.drop(columns=drop_cols)
    # ``pop`` removes the target from the feature frame and hands it back.
    labels = features.pop(target)
    return features, labels

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from imblearn.metrics import macro_averaged_mean_absolute_error

# Perform model validation based on k-fold cross-validation
def validate_model(model, X, y, metric=macro_averaged_mean_absolute_error, **kwargs):
    """Cross-validate ``model`` on ``(X, y)`` and return the scores.

    ``metric`` is wrapped with ``make_scorer`` (default options) and passed to
    ``cross_val_score`` as ``scoring``. Any extra keyword arguments are
    forwarded unchanged to ``cross_val_score``.
    """
    return cross_val_score(model, X, y, scoring=make_scorer(metric), **kwargs)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Generate confusion matrix based on k-fold cross validation
def make_confusion_matrix(model, X, y, **kwargs):
    """Build a confusion matrix from cross-validated predictions.

    Predictions come from ``cross_val_predict(model, X, y, **kwargs)`` and are
    compared against ``y`` with ``confusion_matrix``, whose result is returned.
    """
    predictions = cross_val_predict(model, X, y, **kwargs)
    return confusion_matrix(y, predictions)

In [None]:
from sklearn.dummy import DummyClassifier
from imblearn.metrics import macro_averaged_mean_absolute_error

# Compute baseline performance based on given strategy
def baseline_performance(X, y, strategy="most_frequent", metric=macro_averaged_mean_absolute_error):
    """Score a ``DummyClassifier`` baseline with ``metric``.

    The dummy classifier is built with the given ``strategy``, fitted on
    ``(X, y)`` and then asked to predict on that same ``X``; the returned
    value is ``metric(y, predictions)``.
    """
    baseline = DummyClassifier(strategy=strategy)
    baseline.fit(X, y)
    # Predictions are made on the very data the baseline was fitted on.
    return metric(y, baseline.predict(X))

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Score K-means clustering with elbow and silhouette scores
def score_cluster(X_train, n_clusters):
    """Fit K-means with ``n_clusters`` clusters and report two quality scores.

    Returns a two-element list ``[inertia, silhouette]``: the fitted model's
    ``inertia_`` (used for the elbow method) and the silhouette score of the
    cluster labels predicted for ``X_train``. Both the K-means fit and the
    silhouette computation use the fixed ``random_state=123``.
    """
    estimator = KMeans(n_clusters=n_clusters, random_state=123)
    fitted = estimator.fit(X_train)

    inertia = fitted.inertia_
    assignments = fitted.predict(X_train)
    sil = silhouette_score(X_train, assignments, random_state=123)

    return [inertia, sil]

In [None]:
from seaborn import scatterplot

def plot_pca(red_dims, hue, ax, palette="colorblind", s=15):
    """Scatter the first two columns of ``red_dims`` onto ``ax``.

    Column 0 goes on the x axis and column 1 on the y axis. ``hue``,
    ``palette`` and the marker size ``s`` are passed straight through to
    seaborn's ``scatterplot``. Nothing is returned.
    """
    first_dim, second_dim = red_dims[:, 0], red_dims[:, 1]
    scatterplot(x=first_dim, y=second_dim, hue=hue, palette=palette, s=s, ax=ax)

In [None]:
from tensorflow.random import set_seed
from os import environ
from random import seed
import numpy as np

# Prepare a reproducible environment for model training and validation
def reproduce_environment(random_state=123):
    """Seed every pseudo-random generator used by this notebook.

    Parameters
    ----------
    random_state : int, default 123
        Seed applied to Python's hash seed variable, the built-in ``random``
        module, numpy's global generator and tensorflow's global generator.
        The value supplied by the caller is honoured; it is no longer
        overwritten with a hard-coded constant inside the function.

    Returns
    -------
    None
    """
    # 1. Set the `PYTHONHASHSEED` environment variable at a fixed value.
    #    NOTE: Python reads this variable at interpreter start-up, so setting
    #    it here only affects processes launched afterwards, not the hash
    #    randomisation of the kernel that is already running.
    environ["PYTHONHASHSEED"] = str(random_state)

    # 2. Set the `python` built-in pseudo-random generator at a fixed value
    seed(random_state)

    # 3. Set the `numpy` pseudo-random generator at a fixed value
    np.random.seed(random_state)

    # 4. Set the `tensorflow` pseudo-random generator at a fixed value
    set_seed(random_state)