In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
from correlation_map import corr_map
from correlation_3D import corr_3D, rat_bead_study_data, rat_pa_study_data, mouse_b_enac_study_data, mouse_mps_study_data
from Principle_Component_Analysis import pca_2D, pca_3D
from Simple_Classifiers import data_split, svm

In [5]:
# Loading the data
# Columns kept when building the combined mouse / rat tables further down.
SELECTED_COLUMNS = ['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','Genotype','IQR','HD']

# Rat bead study: baseline and post-bead summaries, merged by the project helper
# (which hands back a frame plus a second "labels" value).
df_rat_sterile_baseline = pd.read_csv('rat_sterile_baseline_summary.csv')
df_rat_sterile_post_beads = pd.read_csv('rat_sterile_post_beads_summary.csv')
df_rat_bead_combine, bead_labels = rat_bead_study_data(
    df_rat_sterile_baseline,
    df_rat_sterile_post_beads,
)

# Rat PA study
df_rat_pa = pd.read_csv('rat_pa_summary.csv')
df_rat_pa_combine, pa_labels = rat_pa_study_data(df_rat_pa)

# Mouse B-Enac study
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary.csv')
df_mouse_enac_combine, b_enac_labels = mouse_b_enac_study_data(df_mouse_b_enac)

# Mouse MPS study
df_mouse_mps = pd.read_csv('mouse_mps_summary.csv')
df_mouse_mps_combine, mps_labels = mouse_mps_study_data(df_mouse_mps)

# "_HD" variants of the rat summaries, run through the same helpers
df_rat_sterile_baseline_hd = pd.read_csv('rat_sterile_baseline_summary_HD.csv')
df_rat_sterile_post_beads_hd = pd.read_csv('rat_sterile_post_beads_summary_HD.csv')
df_rat_bead_combine_hd, bead_hd_labels = rat_bead_study_data(
    df_rat_sterile_baseline_hd,
    df_rat_sterile_post_beads_hd,
)

df_rat_pa_hd = pd.read_csv('rat_pa_summary_HD.csv')
df_rat_pa_combine_hd, pa_hd_labels = rat_pa_study_data(df_rat_pa_hd)

# All mice: B-Enac rows followed by MPS rows, restricted to the shared columns
df_mouse_enac_combine_sel = df_mouse_enac_combine[SELECTED_COLUMNS]
df_mouse_mps_combine_sel = df_mouse_mps_combine[SELECTED_COLUMNS]
df_mouse_all = pd.concat(
    [df_mouse_enac_combine_sel, df_mouse_mps_combine_sel],
    ignore_index=True,
)

# All rats: PA (HD) rows followed by sterile-baseline (HD) rows
df_rat_pa_combine_hd_sel = df_rat_pa_combine_hd[SELECTED_COLUMNS]
df_rat_sterile_baseline_hd_sel = df_rat_sterile_baseline_hd[SELECTED_COLUMNS]
df_rat_sterile_post_beads_hd_sel = df_rat_sterile_post_beads_hd[SELECTED_COLUMNS]
# NOTE(review): df_rat_sterile_post_beads_hd_sel is built but is not part of
# df_rat_all -- confirm the post-bead rows are meant to be left out.
df_rat_all = pd.concat(
    [df_rat_pa_combine_hd_sel, df_rat_sterile_baseline_hd_sel],
    ignore_index=True,
)




# SVM

# Rat bead study (baseline vs post beads)

In [6]:
# df_rat_sterile_baseline['target'] = 'baseline'
# df_rat_sterile_post_beads['target'] = 'post_beads'
# df_rat_combine = pd.concat([df_rat_sterile_baseline, df_rat_sterile_post_beads], ignore_index=True)

In [190]:
def features(df):
    """Reduce a study summary table to model features plus a trailing label.

    Drops the 'ScanName' and 'Date Prepared' columns together with the
    'VDP(%)', 'VHSS(%)' and 'VHLS(%)' measurements. When a 'Genotype' column
    is present its values are copied into a 'target' column (appended as the
    last column when no 'target' column exists yet) and 'Genotype' is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary table containing all five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If any of the five dropped columns is missing from ``df``.
    """
    dropped = ['ScanName', 'Date Prepared', 'VDP(%)', 'VHSS(%)', 'VHLS(%)']
    out = df.drop(columns=dropped)
    if 'Genotype' in out.columns:
        # Copy 'Genotype' to the end under the name 'target', then remove the original
        out['target'] = out['Genotype']
        out = out.drop(columns=['Genotype'])
    return out

# df_rat_combine = features(df_rat_combine) 
# df_rat_combine.shape

(104, 4)

In [9]:
# Numeric measurements first, class label last: data_split reads the label
# from the final column.
mouse_feature_columns = ['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','HD']
df_mouse_features = df_mouse_all[mouse_feature_columns + ['Genotype']]

df_mouse_features

Unnamed: 0,VDP(%),MSV(mL/mL),TV(L),VH(%),VHSS(%),VHLS(%),HD,Genotype
0,10.40,0.355,0.197,32.91,15.81,22.39,0.896879,WT
1,13.30,0.356,0.251,38.23,20.18,24.30,1.038735,WT
2,11.20,0.333,0.239,30.29,17.12,18.41,0.775494,WT
3,12.10,0.363,0.240,36.05,17.88,23.80,1.004333,WT
4,14.40,0.381,0.210,36.94,20.32,23.81,1.063833,WT
...,...,...,...,...,...,...,...,...
56,12.10,0.340,0.155,41.64,16.84,29.73,0.929657,MPS I
57,11.00,0.330,0.186,36.08,17.59,25.01,0.800212,MPS I
58,8.46,0.390,0.208,31.27,15.68,24.24,0.817827,MPS I
59,11.20,0.380,0.181,41.08,16.81,29.39,1.031144,MPS I


In [49]:
def data_split(df, test_size=0.15, val_size=0.15 / 0.85):
    """Split a frame into stratified train / validation / test arrays.

    The last column of ``df`` is the class label; every other column is a
    feature. The test set is carved out first, then the remaining
    "bigtrain" rows are split again into train and validation. Both splits
    are stratified on the label and use ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column.
    test_size : float, optional
        Passed to ``train_test_split`` for the first split. Default 0.15.
    val_size : float, optional
        Passed to ``train_test_split`` for the second split, i.e. relative
        to the bigtrain set. The default 0.15/0.85 makes validation about
        15% of all rows (roughly 70/15/15 overall).

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, X_val, y_val, X_bigtrain,
        y_bigtrain)`` as NumPy arrays. Note that test comes before val.
    """
    np_df = df.values

    # First cut: hold out the test rows, stratified on the label column
    bigtrain_set, test_set = train_test_split(
        np_df, test_size=test_size, random_state=42, stratify=np_df[:, -1]
    )
    # Second cut: validation rows come out of what is left
    train_set, val_set = train_test_split(
        bigtrain_set, test_size=val_size, random_state=42, stratify=bigtrain_set[:, -1]
    )

    # Get the X and y for train, val and test
    X_train, y_train = train_set[:, :-1], train_set[:, -1]
    X_test, y_test = test_set[:, :-1], test_set[:, -1]
    X_val, y_val = val_set[:, :-1], val_set[:, -1]
    X_bigtrain, y_bigtrain = bigtrain_set[:, :-1], bigtrain_set[:, -1]

    print(f'Shapes are {[X_train.shape,y_train.shape,X_val.shape,y_val.shape,X_bigtrain.shape,y_bigtrain.shape,X_test.shape,y_test.shape]}')

    return X_train,y_train,X_test,y_test,X_val,y_val,X_bigtrain,y_bigtrain

# Mouse data: test_size=0.3 for the first split, val_size=0.2 for the second
X_train,y_train,X_test,y_test,X_val,y_val,X_bigtrain,y_bigtrain = data_split(df_mouse_features, 0.3, 0.2)



Shapes are [(33, 7), (33,), (9, 7), (9,), (42, 7), (42,), (19, 7), (19,)]


In [50]:
# Create SVM classifier and optimize the hyperparameters
def svm_hyper_tune(X_train, y_train, X_val, y_val):
    """Print the validation accuracy of an SVC for each kernel / C pair.

    Tries the kernels 'rbf', 'linear' and 'poly' with C in 1, 10, 100. Each
    candidate is a pipeline of median imputation, standard scaling and the
    SVC (``random_state=42``), fitted on the training arrays and scored on
    the validation arrays. Results are printed only; nothing is returned.
    """
    # One preprocessing pipeline object, reused by every candidate model
    preprocessing = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    # Kernel-major ordering, so all C values for one kernel are reported together
    grid = [(kerneltype, c_choice)
            for kerneltype in ['rbf','linear','poly']
            for c_choice in [1, 10, 100]]

    for kerneltype, c_choice in grid:
        classifier = SVC(kernel=kerneltype, C=c_choice, random_state=42)
        model = Pipeline([('preproc', preprocessing), ('svc', classifier)])
        model.fit(X_train, y_train)
        acc = accuracy_score(y_val, model.predict(X_val))
        print(f'Validation accuracy score = {acc} for kernel {kerneltype} and C={c_choice}')
            
# Kernel / C sweep on the mouse train and validation arrays
svm_hyper_tune(X_train, y_train, X_val, y_val)

Validation accuracy score = 0.6666666666666666 for kernel rbf and C=1
Validation accuracy score = 0.3333333333333333 for kernel rbf and C=10
Validation accuracy score = 0.2222222222222222 for kernel rbf and C=100
Validation accuracy score = 0.5555555555555556 for kernel linear and C=1
Validation accuracy score = 0.6666666666666666 for kernel linear and C=10
Validation accuracy score = 0.7777777777777778 for kernel linear and C=100
Validation accuracy score = 0.4444444444444444 for kernel poly and C=1
Validation accuracy score = 0.3333333333333333 for kernel poly and C=10
Validation accuracy score = 0.3333333333333333 for kernel poly and C=100


In [52]:
# Final SVM evaluation: fit on train+val (bigtrain), then score on the held-out test set
def svm(X_bigtrain,y_bigtrain,X_test,y_test,name,kernel,c):
    """Fit one SVC pipeline and print its training and testing accuracy.

    The pipeline is median imputation, standard scaling, then
    ``SVC(kernel=kernel, C=c, random_state=42)``. It is fitted on the
    bigtrain arrays and scored on both the bigtrain and the test arrays.
    ``name`` is printed in bold (ANSI escape codes) as a heading. Nothing
    is returned.
    """
    model = Pipeline([
        ('preproc', Pipeline([('imputer', SimpleImputer(strategy="median")),
                              ('std_scaler', StandardScaler())])),
        ('svc', SVC(kernel=kernel, C=c, random_state=42)),
    ])
    model.fit(X_bigtrain, y_bigtrain)

    # Score the data the model was fitted on first, then the held-out data
    acc_train = accuracy_score(y_bigtrain, model.predict(X_bigtrain))
    acc_test = accuracy_score(y_test, model.predict(X_test))

    print('\033[1m' + name + '\033[0m')
    print()
    print(f'Training accuracy score = {acc_train}')
    print(f'Testing accuracy score = {acc_test}')
    
# Final fit for the combined mouse data: linear kernel, C=100
# (the highest validation accuracy in the recorded sweep output above)
svm(X_bigtrain,y_bigtrain,X_test,y_test,'Mouse all','linear',100)

[1mMouse all[0m

Training accuracy score = 0.9761904761904762
Testing accuracy score = 0.7894736842105263


# Rat_PA study (WT, KO, CF)

In [195]:
# Rat PA study: keep the WT, KO and CF rows, stacked in that order, then
# reduce to model features. `features` must already be defined in the kernel.
df_rat_pa = pd.read_csv('rat_pa_summary.csv')
df_rat_WT = df_rat_pa.loc[df_rat_pa['Genotype'].eq('WT')]
df_rat_KO = df_rat_pa.loc[df_rat_pa['Genotype'].eq('KO')]
df_rat_CF = df_rat_pa.loc[df_rat_pa['Genotype'].eq('CF')]

df_rat_combine = pd.concat(
    [df_rat_WT, df_rat_KO, df_rat_CF],
    ignore_index=True,
)
df_rat_combine = features(df_rat_combine)
df_rat_combine.shape

(82, 4)

In [197]:
# State the split fractions explicitly: 0.15 for test, then 0.15/0.85 of the
# remainder for validation (about 70/15/15 overall). These values match the
# split sizes in this cell's recorded output (82 rows -> 56 / 13 / 13).
X_train,y_train,X_test,y_test,X_val,y_val,X_bigtrain,y_bigtrain = data_split(df_rat_combine, 0.15, 0.15 / 0.85)

Shapes are [(56, 3), (56,), (13, 3), (13,), (13, 3), (13,), (69, 3), (69,)]


In [198]:
# Kernel / C sweep on the Rat_PA train and validation arrays
svm_hyper_tune(X_train, y_train, X_val, y_val)

Validation accuracy score = 0.5384615384615384 for kernel rbf and C=1
Validation accuracy score = 0.5384615384615384 for kernel rbf and C=10
Validation accuracy score = 0.46153846153846156 for kernel rbf and C=100
Validation accuracy score = 0.6153846153846154 for kernel linear and C=1
Validation accuracy score = 0.6153846153846154 for kernel linear and C=10
Validation accuracy score = 0.6153846153846154 for kernel linear and C=100
Validation accuracy score = 0.6923076923076923 for kernel poly and C=1
Validation accuracy score = 0.6923076923076923 for kernel poly and C=10
Validation accuracy score = 0.6153846153846154 for kernel poly and C=100


In [207]:
# Final fit for Rat_PA: poly kernel, C=1 (tied for the highest validation
# accuracy in the recorded sweep output above)
svm(X_bigtrain,y_bigtrain,X_test,y_test,'Rat_PA study (WT, KO, CF)','poly',1)

[1mRat_PA study (WT, KO, CF)[0m

Training accuracy score = 0.6376811594202898
Testing accuracy score = 0.46153846153846156


# Mouse B-Enac study (B-Enac vs WT)

In [208]:
# Mouse B-Enac study: WT rows followed by B_Enac rows, reduced to model
# features. `features` must already be defined in the kernel.
# NOTE(review): this reads 'mouse_b_enac_summary_new.csv', whereas the loading
# cell near the top reads 'mouse_b_enac_summary.csv' -- confirm which is intended.
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary_new.csv')
df_WT = df_mouse_b_enac.loc[df_mouse_b_enac['Genotype'].eq('WT')]
df_b_enac = df_mouse_b_enac.loc[df_mouse_b_enac['Genotype'].eq('B_Enac')]

df_mouse_combine = pd.concat(
    [df_WT, df_b_enac],
    ignore_index=True,
)
df_mouse_combine = features(df_mouse_combine)
df_mouse_combine.shape

(28, 4)

In [210]:
# State the split fractions explicitly: 0.15 for test, then 0.15/0.85 of the
# remainder for validation (about 70/15/15 overall). These values match the
# split sizes in this cell's recorded output (28 rows -> 18 / 5 / 5).
X_train,y_train,X_test,y_test,X_val,y_val,X_bigtrain,y_bigtrain = data_split(df_mouse_combine, 0.15, 0.15 / 0.85)

Shapes are [(18, 3), (18,), (5, 3), (5,), (5, 3), (5,), (23, 3), (23,)]


In [211]:
# Kernel / C sweep on the B-Enac train and validation arrays
svm_hyper_tune(X_train, y_train, X_val, y_val)

Validation accuracy score = 0.8 for kernel rbf and C=1
Validation accuracy score = 0.8 for kernel rbf and C=10
Validation accuracy score = 0.8 for kernel rbf and C=100
Validation accuracy score = 1.0 for kernel linear and C=1
Validation accuracy score = 1.0 for kernel linear and C=10
Validation accuracy score = 1.0 for kernel linear and C=100
Validation accuracy score = 0.8 for kernel poly and C=1
Validation accuracy score = 0.8 for kernel poly and C=10
Validation accuracy score = 0.8 for kernel poly and C=100


In [212]:
# Final fit for B-Enac: linear kernel, C=1 (every linear setting scored 1.0
# in the recorded sweep output above)
svm(X_bigtrain,y_bigtrain,X_test,y_test,'Mouse B-Enac study (B-Enac vs WT)','linear',1)

[1mMouse B-Enac study (B-Enac vs WT)[0m

Training accuracy score = 1.0
Testing accuracy score = 1.0


# Mouse MPS study (MPS_I vs WT vs Het)

In [213]:
# Mouse MPS study: WT, MPS I and Het rows, stacked in that order, then reduced
# to model features. `features` must already be defined in the kernel.
df_mouse_mps = pd.read_csv('mouse_mps_summary.csv')
df_WT = df_mouse_mps.loc[df_mouse_mps['Genotype'].eq('WT')]
df_MPS_I = df_mouse_mps.loc[df_mouse_mps['Genotype'].eq('MPS I')]
df_Het = df_mouse_mps.loc[df_mouse_mps['Genotype'].eq('Het')]

df_mouse_combine = pd.concat(
    [df_WT, df_MPS_I, df_Het],
    ignore_index=True,
)
df_mouse_combine = features(df_mouse_combine)
df_mouse_combine.shape

(33, 4)

In [214]:
# State the split fractions explicitly: 0.15 for test, then 0.15/0.85 of the
# remainder for validation (about 70/15/15 overall). These values match the
# split sizes in this cell's recorded output (33 rows -> 23 / 5 / 5).
X_train,y_train,X_test,y_test,X_val,y_val,X_bigtrain,y_bigtrain = data_split(df_mouse_combine, 0.15, 0.15 / 0.85)

Shapes are [(23, 3), (23,), (5, 3), (5,), (5, 3), (5,), (28, 3), (28,)]


In [215]:
# Kernel / C sweep on the MPS train and validation arrays
svm_hyper_tune(X_train, y_train, X_val, y_val)

Validation accuracy score = 0.6 for kernel rbf and C=1
Validation accuracy score = 0.6 for kernel rbf and C=10
Validation accuracy score = 0.6 for kernel rbf and C=100
Validation accuracy score = 0.6 for kernel linear and C=1
Validation accuracy score = 0.4 for kernel linear and C=10
Validation accuracy score = 0.4 for kernel linear and C=100
Validation accuracy score = 0.6 for kernel poly and C=1
Validation accuracy score = 0.6 for kernel poly and C=10
Validation accuracy score = 0.4 for kernel poly and C=100


In [218]:
# Final fit for MPS: linear kernel, C=1 (tied for the highest validation
# accuracy, 0.6, in the recorded sweep output above)
svm(X_bigtrain,y_bigtrain,X_test,y_test,'Mouse MPS study (MPS_I vs WT vs Het)','linear',1)

[1mMouse MPS study (MPS_I vs WT vs Het)[0m

Training accuracy score = 0.6071428571428571
Testing accuracy score = 0.4
