In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import random
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, roc_auc_score

# **Preprocessing**

## Specify the paths to csv files for clinical variables and CT findings

Create four separate CSV files, one for each combination of data split and outcome:
*   training and validation, expansion
*   training and validation, no expansion
*   test, expansion
*   test, no expansion

Each CSV file should contain only the following clinical variables and CT findings, with column names spelled exactly as listed (the code below selects columns by these names).
*   Anticoagulant use
*   Systolic blood pressure, mmHg
*   Diastolic blood pressure, mmHg
*   PT-INR
*   Time from onset to baseline CT scan, h
*   Intrahematoma hypodensities
*   Hematoma location
*   Hematoma volume on baseline CT, mL





In [None]:
# Replace each "PATH" placeholder with the location of the corresponding CSV file.
path_expansion_train = "PATH"  # training and validation, expansion
path_no_expansion_train = "PATH"  # training and validation, no expansion
path_expansion_test = "PATH"  # test, expansion
path_no_expansion_test = "PATH"  # test, no expansion

## Read data from CSV files

In [None]:
# Load the four cohort tables (split x outcome) in a single unpacking assignment.
(
    df_expansion_train,
    df_no_expansion_train,
    df_expansion_test,
    df_no_expansion_test,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        path_expansion_train,
        path_no_expansion_train,
        path_expansion_test,
        path_no_expansion_test,
    )
)

## Standardize continuous variables

In [None]:
scaler = StandardScaler()

# Continuous variables to standardize, processed one at a time in this order.
continuous_columns = [
    "Systolic blood pressure, mmHg",
    "Diastolic blood pressure, mmHg",
    "PT-INR",
    "Time from onset to baseline CT scan, h",
    "Hematoma volume on baseline CT, mL",
]
cohort_frames = (
    df_expansion_train,
    df_no_expansion_train,
    df_expansion_test,
    df_no_expansion_test,
)

for column in continuous_columns:
    # Fit on the training/validation tables only, so that the test tables are
    # scaled with statistics they did not contribute to.
    scaler.fit(pd.concat([df_expansion_train[[column]], df_no_expansion_train[[column]]]))
    # Apply the same fitted scaling to every table, overwriting the column in place.
    for frame in cohort_frames:
        frame[[column]] = scaler.transform(frame[[column]])

## Encode strings as numbers

In [None]:
# Encode each text column with ONE value -> integer mapping shared by all four
# tables. Calling pd.factorize on each table separately numbers values in order
# of first appearance, so the same value could receive different codes in
# different tables (e.g. "Yes" -> 0 in one table and "Yes" -> 1 in another).
for column in ("Anticoagulant use", "Intrahematoma hypodensities"):
    # Learn the categories from the training/validation tables only.
    # sort=True makes the mapping independent of the row order in the CSV files.
    training_values = pd.concat([df_expansion_train[column], df_no_expansion_train[column]])
    categories = pd.factorize(training_values, sort=True)[1]

    for frame in (
        df_expansion_train,
        df_no_expansion_train,
        df_expansion_test,
        df_no_expansion_test,
    ):
        # Missing values, and values never seen in the training tables, are
        # encoded as -1 (the same sentinel pd.factorize uses for missing values).
        codes = pd.Categorical(frame[column], categories=categories).codes
        frame[column] = codes.astype("int64")

## Convert categorical variables into dummy variables.

In [None]:
# Stack the expansion rows on top of the no-expansion rows for each split.
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# source tables contribute their own 0-based labels and every label up to the
# smaller table's length appears twice, which makes label-based alignment
# ambiguous. Row order is unchanged, so positional slicing still works.
df_train = pd.concat([df_expansion_train, df_no_expansion_train], ignore_index=True)
df_test = pd.concat([df_expansion_test, df_no_expansion_test], ignore_index=True)

In [None]:
# One-hot encode "Hematoma location" for the training/validation table.
location_dummies = pd.get_dummies(df_train["Hematoma location"])
# Remember the training dummy columns: they define the feature layout the
# classifier is fitted on.
location_columns = location_dummies.columns
df_train = df_train.drop(columns=["Hematoma location"])
df_train = pd.concat([df_train, location_dummies], axis=1)

# One-hot encode the test table and force it onto the training layout.
# Encoding the two tables independently yields different columns (or a
# different column order) whenever a location is present in only one of them,
# which would silently misalign features between training and test arrays.
# Locations absent from the test table become all-zero columns; locations never
# seen in training are dropped because the classifier has no feature for them.
location_dummies = pd.get_dummies(df_test["Hematoma location"]).reindex(
    columns=location_columns, fill_value=0
)
df_test = df_test.drop(columns=["Hematoma location"])
df_test = pd.concat([df_test, location_dummies], axis=1)

In [None]:
# Split the combined tables back into expansion / no-expansion parts.
# The expansion rows were stacked first, so they occupy the leading rows.
n_expansion_train = df_expansion_train.shape[0]
n_expansion_test = df_expansion_test.shape[0]

df_expansion_train, df_no_expansion_train = (
    df_train.iloc[:n_expansion_train, :],
    df_train.iloc[n_expansion_train:, :],
)
df_expansion_test, df_no_expansion_test = (
    df_test.iloc[:n_expansion_test, :],
    df_test.iloc[n_expansion_test:, :],
)

## Convert DataFrame to NumPy array

In [None]:
# Convert every table to a float32 NumPy array (one row per patient).
expansion_train_val, no_expansion_train_val, expansion_test, no_expansion_test = (
    frame.to_numpy().astype("float32")
    for frame in (
        df_expansion_train,
        df_no_expansion_train,
        df_expansion_test,
        df_no_expansion_test,
    )
)

## Shuffle the data in the arrays

In [None]:
# Seed the generators so the shuffle (and therefore the train/validation split
# below) is reproducible. np.random.shuffle draws from NumPy's global generator,
# which random.seed() does not affect, so NumPy must be seeded explicitly.
random.seed(42)
np.random.seed(42)

# Shuffle the rows of each class in place.
np.random.shuffle(expansion_train_val)
np.random.shuffle(no_expansion_train_val)

## Assign 70% of each class to the training set and the rest to the validation set

In [None]:
# Row index at which each class is cut: the first 70% (rounded down) goes to
# training, the remainder to validation.
expansion_cut = int(len(expansion_train_val)*0.7)
no_expansion_cut = int(len(no_expansion_train_val)*0.7)

expansion_train, expansion_val = (
    expansion_train_val[:expansion_cut],
    expansion_train_val[expansion_cut:],
)
no_expansion_train, no_expansion_val = (
    no_expansion_train_val[:no_expansion_cut],
    no_expansion_train_val[no_expansion_cut:],
)

## Labeling

In [None]:
# Build one label per row: 1 for expansion, 0 for no expansion.
label_expansion_train = np.array([1] * len(expansion_train))
label_no_expansion_train = np.array([0] * len(no_expansion_train))

label_expansion_val = np.array([1] * len(expansion_val))
label_no_expansion_val = np.array([0] * len(no_expansion_val))

label_expansion_test = np.array([1] * len(expansion_test))
label_no_expansion_test = np.array([0] * len(no_expansion_test))

## Concatenate expansion and no expansion arrays

In [None]:
# For every split, stack the expansion rows above the no-expansion rows and
# join the labels in the same order so features and labels stay aligned.
x_train = np.vstack([expansion_train, no_expansion_train])
y_train = np.hstack([label_expansion_train, label_no_expansion_train])

x_val = np.vstack([expansion_val, no_expansion_val])
y_val = np.hstack([label_expansion_val, label_no_expansion_val])

x_test = np.vstack([expansion_test, no_expansion_test])
y_test = np.hstack([label_expansion_test, label_no_expansion_test])

## Random oversampling

In [None]:
# Balance the classes of the training set only, by resampling with a fixed
# seed; the validation and test sets are left untouched.
oversampler = RandomOverSampler(random_state=42)
x_train, y_train = oversampler.fit_resample(x_train, y_train)

#  **k-nearest neighbors**

In [None]:
# Fit a k-nearest-neighbors classifier for several values of k and report its
# performance on the test set.
# NOTE(review): k is compared on the test set here, while x_val / y_val are
# never used in this notebook. Consider choosing k on the validation set and
# reporting the test metrics only for the selected k.
for i in [3,5,7,9,11]:
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(x_train, y_train)

    # Hard class predictions for the threshold-dependent metrics.
    y_prediction = clf.predict(x_test)
    # Predicted probability of the positive class (label 1) for the AUC.
    # Passing hard 0/1 predictions to roc_auc_score collapses the ROC curve to a
    # single operating point and does not measure ranking quality.
    positive_column = list(clf.classes_).index(1)
    y_score = clf.predict_proba(x_test)[:, positive_column]

    result = confusion_matrix(y_test, y_prediction)
    sensitivity = recall_score(y_test, y_prediction)
    # Specificity is the recall of the negative class.
    specificity = recall_score(y_test, y_prediction, pos_label=0)
    accuracy = accuracy_score(y_test, y_prediction)
    auc = roc_auc_score(y_test, y_score)

    # Identify which k the following results belong to.
    print("k = {}".format(i))
    print(result)
    print("sensitivity: {:.3f}".format(sensitivity))
    print("specificity: {:.3f}".format(specificity))
    print("accuracy: {:.3f}".format(accuracy))
    print("AUC: {:.3f}".format(auc))
    print()