In [74]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
from correlation_map import corr_map
from correlation_3D import corr_3D, rat_bead_study_data, rat_pa_study_data, mouse_b_enac_study_data, mouse_mps_study_data
from Principle_Component_Analysis import pca_2D, pca_3D
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

#     cv_scores_svm = cross_validate(svm_pl, X_train, y_train, cv=10, scoring='accuracy', return_train_score=True)
#     print("Cross-Validation Accuracy:", np.round(cv_scores_svm['test_score'], decimals=3))
#     average_accuracy = cv_scores_svm['test_score'].mean()
#     print("Average Validation Accuracy:", round(average_accuracy,3))

# (0.778+0.667+0.778+0.444+0.667)/5


In [75]:
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary.csv')
df_mouse_enac_combine, b_enac_labels = mouse_b_enac_study_data(df_mouse_b_enac)

df_mouse_mps = pd.read_csv('mouse_mps_summary.csv')
df_mouse_mps_combine, mps_labels = mouse_mps_study_data(df_mouse_mps)

df_mouse_enac_combine_sel = df_mouse_enac_combine[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','Genotype','IQR','HD']]
df_mouse_mps_combine_sel = df_mouse_mps_combine[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','Genotype','IQR','HD']]
df_mouse_all = pd.concat([df_mouse_enac_combine_sel, df_mouse_mps_combine_sel], ignore_index=True)


In [117]:
def data_split(df, test_size):
    np_df = df.values

    train_set, test_set = train_test_split(np_df, test_size=test_size, random_state=42, stratify=np_df[:,-1])

    # Get the X and y for train, val and test
    X_train = train_set[:,:-1]
    y_train = train_set[:,-1]
    X_test = test_set[:,:-1]
    y_test = test_set[:,-1]
    
    print(f'Shapes are {[X_train.shape,y_train.shape,X_test.shape,y_test.shape]}')
    
    return X_train,y_train,X_test,y_test

In [174]:
def grid(clr, X_train, y_train):
    
    preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")),
                        ('std_scaler', StandardScaler())])
    
    if clr == 'svm':
        model = Pipeline([('preproc',preproc_pl), ('svc', SVC(decision_function_shape='ovr'))])
        param_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [1, 10, 100],}
        grid_search = GridSearchCV(model['svc'], param_grid, cv=3, scoring='accuracy')
        
    if clr == 'decision tree':
        model = Pipeline([('preproc',preproc_pl), ('dt', DecisionTreeClassifier(random_state=42))])
        param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15],}
        grid_search = GridSearchCV(model['dt'], param_grid, cv=3, scoring='accuracy')
        
    if clr == 'knn':
        model = Pipeline([('preproc',preproc_pl), ('knn', KNeighborsClassifier())])
        param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'],}
        grid_search = GridSearchCV(model['knn'], param_grid, cv=3, scoring='accuracy')
        
        
    grid_search.fit(X_train, y_train)
    
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)
    print()

    grid_results = grid_search.cv_results_
    
    # Print all accuracy scores for each fold
    for fold_idx in range(grid_search.cv):
        fold_key = f"split{fold_idx}_test_score"
        fold_scores = grid_results[fold_key]
        print(f"Accuracy scores for Fold {fold_idx + 1}: {np.round(fold_scores, decimals=3)}")
    
    
    print()
    mean_test_scores = np.round(grid_results['mean_test_score'], decimals=3)
    std_test_scores = np.round(grid_results['std_test_score'], decimals=3)

    for params, mean_score, std_score in zip(grid_results['params'], mean_test_scores, std_test_scores):
        print(f"Hyperparameters: {params}, Mean Accuracy: {mean_score}, Std: {std_score}")
        print()
    

In [175]:
# Create SVM classifier and optimize the hyperparameters
def classier(clr, X_train, y_train, p1, p2):
    
    preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")),
                        ('std_scaler', StandardScaler())])
    
    if clr == 'svm':
        model = Pipeline([('preproc',preproc_pl),
                           ('svc',SVC(kernel=p1, C=p2, random_state=42, decision_function_shape='ovr'))])
        
    if clr == 'decision tree':
        model = Pipeline([('preproc',preproc_pl),
                           ('dt',DecisionTreeClassifier(criterion=p1, max_depth=p2, random_state=42))])
        
    if clr == 'knn':
        model = Pipeline([('preproc',preproc_pl),
                           ('knn', KNeighborsClassifier(n_neighbors=p1,weights=p2))])
         
    model.fit(X_train,y_train)
    
    y_train_pred_svm = model.predict(X_train)
    y_test_pred_svm = model.predict(X_test)

    acc_train = accuracy_score(y_train,y_train_pred_svm)
    acc_test = accuracy_score(y_test,y_test_pred_svm)
    
    print('\033[1m' + clr + '\033[0m')
    print()
    print(f'Training accuracy score = {acc_train}')
    print(f'Testing accuracy score = {acc_test}')

            

In [176]:
# SVM
df_mouse_features = df_mouse_all[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','HD','Genotype']]
X_train,y_train,X_test,y_test = data_split(df_mouse_features,0.25)
grid('svm',X_train, y_train)

Shapes are [(45, 7), (45,), (16, 7), (16,)]
Best Hyperparameters: {'C': 10, 'kernel': 'linear'}

Accuracy scores for Fold 1: [0.733 0.533 0.533 0.733 0.667 0.667 0.667 0.8   0.733]
Accuracy scores for Fold 2: [0.533 0.533 0.6   0.733 0.667 0.533 0.667 0.6   0.667]
Accuracy scores for Fold 3: [0.667 0.4   0.467 0.6   0.533 0.333 0.6   0.533 0.533]

Hyperparameters: {'C': 1, 'kernel': 'linear'}, Mean Accuracy: 0.644, Std: 0.083

Hyperparameters: {'C': 1, 'kernel': 'rbf'}, Mean Accuracy: 0.489, Std: 0.063

Hyperparameters: {'C': 1, 'kernel': 'poly'}, Mean Accuracy: 0.533, Std: 0.054

Hyperparameters: {'C': 10, 'kernel': 'linear'}, Mean Accuracy: 0.689, Std: 0.063

Hyperparameters: {'C': 10, 'kernel': 'rbf'}, Mean Accuracy: 0.622, Std: 0.063

Hyperparameters: {'C': 10, 'kernel': 'poly'}, Mean Accuracy: 0.511, Std: 0.137

Hyperparameters: {'C': 100, 'kernel': 'linear'}, Mean Accuracy: 0.644, Std: 0.031

Hyperparameters: {'C': 100, 'kernel': 'rbf'}, Mean Accuracy: 0.644, Std: 0.113

Hyperpar

In [178]:
classier('svm', X_train, y_train, 'linear', 10)

[1msvm[0m

Training accuracy score = 0.9333333333333333
Testing accuracy score = 0.75


In [163]:
# decision tree
grid('decision tree',X_train, y_train)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 5}

Accuracy scores for Fold 1: [0.778 0.778 0.778 0.778 0.778 0.778]
Accuracy scores for Fold 2: [0.333 0.333 0.333 0.333 0.333 0.333]
Accuracy scores for Fold 3: [0.889 0.778 0.778 0.889 0.889 0.889]
Accuracy scores for Fold 4: [0.667 0.667 0.667 0.444 0.556 0.556]
Accuracy scores for Fold 5: [0.556 0.556 0.556 0.556 0.556 0.556]

Hyperparameters: {'criterion': 'gini', 'max_depth': 5}, Mean Accuracy: 0.644, Std: 0.191

Hyperparameters: {'criterion': 'gini', 'max_depth': 10}, Mean Accuracy: 0.622, Std: 0.166

Hyperparameters: {'criterion': 'gini', 'max_depth': 15}, Mean Accuracy: 0.622, Std: 0.166

Hyperparameters: {'criterion': 'entropy', 'max_depth': 5}, Mean Accuracy: 0.6, Std: 0.206

Hyperparameters: {'criterion': 'entropy', 'max_depth': 10}, Mean Accuracy: 0.622, Std: 0.194

Hyperparameters: {'criterion': 'entropy', 'max_depth': 15}, Mean Accuracy: 0.622, Std: 0.194



In [164]:
classier('decision tree', X_train, y_train, 'gini', 5)

[1mdecision tree[0m

Training accuracy score = 0.9777777777777777
Testing accuracy score = 0.625


In [172]:
# KNN
grid('knn',X_train, y_train)

Best Hyperparameters: {'n_neighbors': 3, 'weights': 'distance'}

Accuracy scores for Fold 1: [0.778 0.889 0.778 0.889 0.889 0.889 0.889 0.889]
Accuracy scores for Fold 2: [0.667 0.556 0.667 0.556 0.667 0.667 0.667 0.667]
Accuracy scores for Fold 3: [0.667 0.667 0.778 0.778 0.778 0.667 0.667 0.667]
Accuracy scores for Fold 4: [0.556 0.556 0.444 0.556 0.444 0.556 0.333 0.556]
Accuracy scores for Fold 5: [0.556 0.778 0.333 0.556 0.333 0.444 0.333 0.444]

Hyperparameters: {'n_neighbors': 3, 'weights': 'uniform'}, Mean Accuracy: 0.644, Std: 0.083

Hyperparameters: {'n_neighbors': 3, 'weights': 'distance'}, Mean Accuracy: 0.689, Std: 0.13

Hyperparameters: {'n_neighbors': 5, 'weights': 'uniform'}, Mean Accuracy: 0.6, Std: 0.181

Hyperparameters: {'n_neighbors': 5, 'weights': 'distance'}, Mean Accuracy: 0.667, Std: 0.141

Hyperparameters: {'n_neighbors': 7, 'weights': 'uniform'}, Mean Accuracy: 0.622, Std: 0.206

Hyperparameters: {'n_neighbors': 7, 'weights': 'distance'}, Mean Accuracy: 0.644

In [173]:
classier('knn', X_train, y_train, 3, 'distance')

[1mknn[0m

Training accuracy score = 1.0
Testing accuracy score = 0.625
