In [1]:
import pandas as pd
import numpy as np
import scipy.stats
import altair as alt

In [2]:
import sys
import bp_util

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_data_path = 'full_data.csv'
df_full = pd.read_csv(raw_data_path)

In [4]:
# Run the project's cleaning helper on the raw frame. Its return value is
# unpacked into the cleaned frame and `dict_proteins` (presumably a protein
# lookup table -- confirm against bp_util).
# NOTE: df_full is overwritten here, so re-running this cell without first
# re-running the load cell feeds already-cleaned data back into the helper.
df_full, dict_proteins = bp_util.clean_raw_bp_data(df_full)

In [5]:
# Keep only the two identifier columns followed by the measurement columns
# analysed below. Later cells slice by position, so the order matters.
id_columns = ['group', 'assay_ID']
measurement_columns = ['AK1C1', 'TAU', '1433T', 'GDIA', '1433B', '1433F', '1433G', '1433Z', 'GUAD', 'SERC']
df_full = df_full[id_columns + measurement_columns]

In [6]:
# Restrict the data to the two groups being compared.
groups = ['AD_MCI', 'Healthy']
in_selected_groups = df_full['group'].isin(groups)
df = df_full.loc[in_selected_groups]

df

Unnamed: 0,group,assay_ID,AK1C1,TAU,1433T,GDIA,1433B,1433F,1433G,1433Z,GUAD,SERC
0,AD_MCI,TPAD0210,17.048220,20.309884,23.238585,22.268062,22.320183,24.059067,22.805169,23.185457,22.062487,22.319883
1,AD_MCI,TPAD0262,17.818075,20.317162,23.350587,22.682374,22.473186,24.331354,22.925892,23.144332,22.088050,21.724684
2,AD_MCI,TPAD0266,19.049924,18.526141,22.769605,21.379855,21.904326,23.623255,22.559370,22.852953,20.967126,21.618941
3,AD_MCI,TPAD0273,16.630259,19.962984,22.002609,22.449737,21.217635,22.653958,21.982198,22.269707,21.975284,21.559119
4,AD_MCI,TPAD0292,17.831350,19.877654,22.627299,22.580290,21.859133,23.524767,22.439768,22.714383,20.834141,21.937996
...,...,...,...,...,...,...,...,...,...,...,...,...
202,Healthy,TPAD0521,18.732985,18.456411,21.733054,21.822557,20.883586,22.448767,21.426972,21.747074,21.359872,21.131670
203,Healthy,TPAD0522,18.332783,18.670935,22.509808,22.069713,21.642476,23.319039,22.199985,22.473105,20.970339,21.374229
204,Healthy,TPAD0524,18.942413,18.791507,22.504096,22.210963,21.627516,23.317197,22.206803,22.407550,21.673876,21.755798
205,Healthy,TPAD0526,18.935836,19.249951,22.481129,22.197582,21.654925,23.371007,22.205707,22.506010,21.270581,21.583191


In [7]:
# Standardise every measurement column (everything after the two identifier
# columns) to a z-score. The mean and the population standard deviation
# (ddof=0) are computed per column over all rows of df.
measurement_values = df.iloc[:, 2:]
mean = measurement_values.mean(axis=0)
std = measurement_values.std(axis=0, ddof=0)

zscores = measurement_values.sub(mean, axis='columns').div(std, axis='columns')

zscores

Unnamed: 0,AK1C1,TAU,1433T,GDIA,1433B,1433F,1433G,1433Z,GUAD,SERC
0,-1.474578,1.459741,1.936216,0.458127,1.878312,1.763714,1.805057,1.917240,1.300690,2.012213
1,-0.736341,1.468687,2.213290,1.676765,2.280065,2.375425,2.137555,1.807386,1.353565,0.221807
2,0.444913,-0.732476,0.776041,-2.154401,0.786364,0.784635,1.128074,1.029029,-0.964989,-0.096277
3,-1.875372,1.033402,-1.121376,0.992498,-1.016736,-1.392952,-0.461582,-0.528985,1.120315,-0.276229
4,-0.723612,0.928532,0.424000,1.376500,0.667697,0.563374,0.798663,0.658871,-1.240060,0.863465
...,...,...,...,...,...,...,...,...,...,...
202,0.140991,-0.818174,-1.788208,-0.852260,-1.893877,-1.853928,-1.990793,-1.925084,-0.152621,-1.562030
203,-0.242774,-0.554524,0.133347,-0.125285,0.098803,0.101194,0.138249,0.014349,-0.958342,-0.832392
204,0.341816,-0.406342,0.119217,0.290179,0.059521,0.097057,0.157029,-0.160766,0.496875,0.315399
205,0.335510,0.157086,0.062399,0.250822,0.131491,0.217943,0.154010,0.102247,-0.337312,-0.203818


In [8]:
#Rule-based classifiers
# A column contributes its tier weight when its |z-score| exceeds the cutoff
# and 0 otherwise. Columns that belong to no tier get weight 1.
cols_1 = ["TAU"]
cols_2 = ["1433B", "1433G", "1433Z"]
cols_3 = ["1433T", "GUAD"]

zscore_cutoff = 1.5

# Later entries override earlier ones, so cols_1 takes precedence over cols_2,
# which takes precedence over cols_3.
weight_by_column = {
    **dict.fromkeys(cols_3, 2),
    **dict.fromkeys(cols_2, 3),
    **dict.fromkeys(cols_1, 4),
}

zscore_encoded = zscores.copy()

for col in zscores.columns:
    exceeds_cutoff = zscores[col].abs() > zscore_cutoff
    zscore_encoded[col] = np.where(exceeds_cutoff, weight_by_column.get(col, 1), 0)

# Row-wise total of the weighted flags, then re-attach the identifier columns.
zscore_encoded['Sum'] = zscore_encoded.sum(axis=1)
zscore_encoded = pd.concat([df.iloc[:, :2], zscore_encoded], axis=1)

In [9]:
# Show the encoded frame: identifier columns, per-column weighted flags, and 'Sum'.
zscore_encoded

Unnamed: 0,group,assay_ID,AK1C1,TAU,1433T,GDIA,1433B,1433F,1433G,1433Z,GUAD,SERC,Sum
0,AD_MCI,TPAD0210,0,0,2,0,3,1,3,3,0,1,13
1,AD_MCI,TPAD0262,0,0,2,1,3,1,3,3,0,0,13
2,AD_MCI,TPAD0266,0,0,0,1,0,0,0,0,0,0,1
3,AD_MCI,TPAD0273,1,0,0,0,0,0,0,0,0,0,1
4,AD_MCI,TPAD0292,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,Healthy,TPAD0521,0,0,2,0,3,1,3,3,0,1,13
203,Healthy,TPAD0522,0,0,0,0,0,0,0,0,0,0,0
204,Healthy,TPAD0524,0,0,0,0,0,0,0,0,0,0,0
205,Healthy,TPAD0526,0,0,0,0,0,0,0,0,0,0,0


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score

In [12]:
# Features are the per-column encoded scores: everything between the two
# identifier columns and the trailing 'Sum' column. The target is the group.
# NOTE(review): the z-scores behind these features were standardised over all
# rows before this split, so statistics from the held-out rows leak into the
# training features. Consider computing mean/std on the training rows only.
RANDOM_STATE = 666
POSITIVE_LABEL = 'AD_MCI'

X = zscore_encoded.iloc[:, 2:-1]
Y = zscore_encoded['group']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=RANDOM_STATE)

# Create the decision tree classifier (seeded so the fit is reproducible)
model = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Fit the model on the training data
model.fit(x_train, y_train)

# evaluate the performance on the testing set
y_pred = model.predict(x_test)

# Calculate various evaluation metrics, treating 'AD_MCI' as the positive class
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
recall = recall_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
f1 = f1_score(y_test, y_pred, pos_label=POSITIVE_LABEL)
# Only 'Healthy' and 'AD_MCI' survive the earlier group filter. Listing 'MCI'
# and 'AD' as well would pad the matrix with all-zero rows and columns, so the
# labels are limited to the two classes present (rows/cols: Healthy, AD_MCI).
conf_mat = confusion_matrix(y_test, y_pred, labels=['Healthy', POSITIVE_LABEL])

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

Accuracy: 0.88
Precision: 0.78
Recall: 0.64
F1-score: 0.70
