In [389]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
from correlation_map import corr_map
from correlation_3D import corr_3D, rat_bead_study_data, rat_pa_study_data, mouse_b_enac_study_data, mouse_mps_study_data
from Principle_Component_Analysis import pca_2D, pca_3D
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate, GridSearchCV

#     for fold_idx in range(grid_search.cv):
#         fold_key = f"split{fold_idx}_test_score"
#         fold_scores = grid_results[fold_key]
#         print(f"Valid accuracy scores for Fold {fold_idx + 1}: {np.round(fold_scores, decimals=3)}")
       
    
# #     for fold_idx in range(grid_search.cv):
# #         fold_key = f"split{fold_idx}_test_score"
# #         fold_scores = grid_results[fold_key]
# #         print(f"Valid accuracy scores for Fold {fold_idx + 1}: {np.round(fold_scores, decimals=3)}")
        
    
#     print()
#     mean_test_scores = np.round(grid_results['mean_test_score'], decimals=3)
#     std_test_scores = np.round(grid_results['std_test_score'], decimals=3)

#     for params, mean_score, std_score in zip(grid_results['params'], mean_test_scores, std_test_scores):
#         print(f"Hyperparameters: {params}, Valid Mean Accuracy: {mean_score}, Valid Std: {std_score}")
#         print()



In [390]:
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary.csv')
df_mouse_enac_combine, b_enac_labels = mouse_b_enac_study_data(df_mouse_b_enac)

df_mouse_mps = pd.read_csv('mouse_mps_summary.csv')
df_mouse_mps_combine, mps_labels = mouse_mps_study_data(df_mouse_mps)

df_mouse_enac_combine_sel = df_mouse_enac_combine[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','Genotype','IQR','HD']]
df_mouse_mps_combine_sel = df_mouse_mps_combine[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','Genotype','IQR','HD']]
df_mouse_all = pd.concat([df_mouse_enac_combine_sel, df_mouse_mps_combine_sel], ignore_index=True)


In [391]:
def data_split(df, test_size):
    np_df = df.values

    train_set, test_set = train_test_split(np_df, test_size=test_size, random_state=42, stratify=np_df[:,-1])

    # Get the X and y for train, val and test
    X_train = train_set[:,:-1]
    y_train = train_set[:,-1]
    X_test = test_set[:,:-1]
    y_test = test_set[:,-1]
    
    print(f'Shapes are {[X_train.shape,y_train.shape,X_test.shape,y_test.shape]}')
    
    return X_train,y_train,X_test,y_test

In [392]:
def grid(clr, X_train, y_train):
    
    preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")),
                        ('std_scaler', StandardScaler())])
    
    if clr == 'svm':
        model = Pipeline([('preproc',preproc_pl), ('svc', SVC(decision_function_shape='ovr'))])
        param_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [1, 10, 100],}
        grid_search = GridSearchCV(model['svc'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'decision tree':
        model = Pipeline([('preproc',preproc_pl), ('dt', DecisionTreeClassifier(random_state=42))])
        param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15],}
        grid_search = GridSearchCV(model['dt'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'knn':
        model = Pipeline([('preproc',preproc_pl), ('knn', KNeighborsClassifier())])
        param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'],}
        grid_search = GridSearchCV(model['knn'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'rf':
        model = Pipeline([('preproc',preproc_pl), ('rf', RandomForestClassifier(random_state=42))])
        param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 15, 20],}
        grid_search = GridSearchCV(model['rf'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'gbc':
        model = Pipeline([('preproc',preproc_pl), ('gbc', GradientBoostingClassifier(random_state=42))])
        param_grid = {'max_depth': [3, 5, 7, 9, 11], 'learning_rate': [0.01, 0.1, 1],}
        grid_search = GridSearchCV(model['gbc'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'sgd':
        model = Pipeline([('preproc',preproc_pl), ('sgd', SGDClassifier(random_state=42))])
        param_grid = {'loss': ['hinge','squared_hinge','perceptron'], 'penalty': ['l2', 'l1', 'elasticnet'],}
        grid_search = GridSearchCV(model['sgd'], param_grid, cv=3, scoring='accuracy')
    
        
        
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)
     
    

In [393]:
def cross_valid(clr, X_train, y_train):
    
    preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")),
                        ('std_scaler', StandardScaler())])
    
    if clr == 'svm':
        model = Pipeline([('preproc',preproc_pl), ('svc', SVC(decision_function_shape='ovr'))])
        param_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [1, 10, 100],}
        grid_search = GridSearchCV(model['svc'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'decision tree':
        model = Pipeline([('preproc',preproc_pl), ('dt', DecisionTreeClassifier(random_state=42))])
        param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 15],}
        grid_search = GridSearchCV(model['dt'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'knn':
        model = Pipeline([('preproc',preproc_pl), ('knn', KNeighborsClassifier())])
        param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'],}
        grid_search = GridSearchCV(model['knn'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'rf':
        model = Pipeline([('preproc',preproc_pl), ('rf', RandomForestClassifier(random_state=42))])
        param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 15, 20],}
        grid_search = GridSearchCV(model['rf'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'gbc':
        model = Pipeline([('preproc',preproc_pl), ('gbc', GradientBoostingClassifier(random_state=42))])
        param_grid = {'max_depth': [3, 5, 7, 9, 11], 'learning_rate': [0.01, 0.1, 1],}
        grid_search = GridSearchCV(model['gbc'], param_grid, cv=3, scoring='accuracy')
        
    elif clr == 'sgd':
        model = Pipeline([('preproc',preproc_pl), ('sgd', SGDClassifier(random_state=42))])
        param_grid = {'loss': ['hinge','squared_hinge','perceptron'], 'penalty': ['l2', 'l1', 'elasticnet'],}
        grid_search = GridSearchCV(model['sgd'], param_grid, cv=3, scoring='accuracy')
        
         
    model.fit(X_train, y_train)
    
    cv_scores = cross_validate(model, X_train, y_train, cv=3, scoring='accuracy', return_train_score=True)
    
    print("Validation Accuracy:", np.round(cv_scores['test_score'], decimals=3))
    print()
    
    average_valid_accuracy = cv_scores['test_score'].mean()
    print("Average Validation Accuracy:", round(average_valid_accuracy,3))
    print()
    
    std_valid_accuracy = cv_scores['test_score'].std()
    print("Standard Deviation of Validation Accuracy:", round(std_valid_accuracy, 3))
    print()
    
    print("Training Accuracy:", np.round(cv_scores['train_score'], decimals=3))
    print()

    average_train_accuracy = cv_scores['train_score'].mean()
    print("Average Training Accuracy:", round(average_train_accuracy,3))
    print()
    
    std_train_accuracy = cv_scores['train_score'].std()
    print("Standard Deviation of Training Accuracy:", round(std_train_accuracy, 3))
        

In [394]:
# Create SVM classifier and optimize the hyperparameters
def classier(clr, X_train, y_train, p1, p2):
    
    preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")),
                        ('std_scaler', StandardScaler())])
    
    if clr == 'svm':
        model = Pipeline([('preproc',preproc_pl),
                           ('svc',SVC(kernel=p1, C=p2, random_state=42, decision_function_shape='ovr'))])
        
    elif clr == 'decision tree':
        model = Pipeline([('preproc',preproc_pl),
                           ('dt',DecisionTreeClassifier(criterion=p1, max_depth=p2, random_state=42))])
        
    elif clr == 'knn':
        model = Pipeline([('preproc',preproc_pl),
                           ('knn', KNeighborsClassifier(n_neighbors=p1,weights=p2))])
        
    elif clr == 'rf':
        model = Pipeline([('preproc',preproc_pl),
                           ('rf', RandomForestClassifier(criterion=p1,max_depth=p2,random_state=42))])
        
    elif clr == 'gbc':
        model = Pipeline([('preproc',preproc_pl),
                           ('gbc', GradientBoostingClassifier(learning_rate=p1,max_depth=p2,random_state=42))])
        
    elif clr == 'sgd':
        model = Pipeline([('preproc',preproc_pl),
                           ('sgd', SGDClassifier(loss=p1,penalty=p2,random_state=42))])
         
    model.fit(X_train,y_train)
    
    y_train_pred_svm = model.predict(X_train)
    y_test_pred_svm = model.predict(X_test)

    acc_train = accuracy_score(y_train,y_train_pred_svm)
    acc_test = accuracy_score(y_test,y_test_pred_svm)
    
    print('\033[1m' + clr + '\033[0m')
    print()
    print(f'Testing accuracy score = {acc_test}')

            

In [395]:
# SVM
df_mouse_features = df_mouse_all[['VDP(%)','MSV(mL/mL)','TV(L)','VH(%)','VHSS(%)','VHLS(%)','HD','Genotype']]
X_train,y_train,X_test,y_test = data_split(df_mouse_features,0.25)
print()
grid_results = grid('svm',X_train, y_train)
print()
cross_valid('svm',X_train, y_train)

Shapes are [(45, 7), (45,), (16, 7), (16,)]

Best Hyperparameters: {'C': 10, 'kernel': 'linear'}

Validation Accuracy: [0.733 0.667 0.667]

Average Validation Accuracy: 0.689

Standard Deviation of Validation Accuracy: 0.031

Training Accuracy: [0.767 0.767 0.833]

Average Training Accuracy: 0.789

Standard Deviation of Training Accuracy: 0.031


In [396]:
classier('svm', X_train, y_train, 'linear', 10)

[1msvm[0m

Testing accuracy score = 0.75


In [397]:
# decision tree
grid('decision tree',X_train, y_train)
print()
cross_valid('decision tree',X_train, y_train)

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 5}

Validation Accuracy: [0.6   0.533 0.6  ]

Average Validation Accuracy: 0.578

Standard Deviation of Validation Accuracy: 0.031

Training Accuracy: [1. 1. 1.]

Average Training Accuracy: 1.0

Standard Deviation of Training Accuracy: 0.0


In [398]:
classier('decision tree', X_train, y_train, 'entropy', 5)

[1mdecision tree[0m

Testing accuracy score = 0.6875


In [399]:
# KNN
grid('knn',X_train, y_train)
print()
cross_valid('knn',X_train, y_train)

Best Hyperparameters: {'n_neighbors': 3, 'weights': 'distance'}

Validation Accuracy: [0.733 0.6   0.6  ]

Average Validation Accuracy: 0.644

Standard Deviation of Validation Accuracy: 0.063

Training Accuracy: [0.7 0.7 0.8]

Average Training Accuracy: 0.733

Standard Deviation of Training Accuracy: 0.047


In [400]:
classier('knn', X_train, y_train, 3, 'distance')

[1mknn[0m

Testing accuracy score = 0.625


In [401]:
# Random Forest
grid('rf',X_train, y_train)
print()
cross_valid('rf',X_train, y_train)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 15}

Validation Accuracy: [0.733 0.6   0.667]

Average Validation Accuracy: 0.667

Standard Deviation of Validation Accuracy: 0.054

Training Accuracy: [1. 1. 1.]

Average Training Accuracy: 1.0

Standard Deviation of Training Accuracy: 0.0


In [402]:
classier('rf', X_train, y_train, 'gini', 15)

[1mrf[0m

Testing accuracy score = 0.625


In [403]:
# Gradient Boost Classifier
grid('gbc',X_train, y_train)
print()
cross_valid('gbc',X_train, y_train)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3}

Validation Accuracy: [0.733 0.667 0.733]

Average Validation Accuracy: 0.711

Standard Deviation of Validation Accuracy: 0.031

Training Accuracy: [1. 1. 1.]

Average Training Accuracy: 1.0

Standard Deviation of Training Accuracy: 0.0


In [404]:
classier('gbc', X_train, y_train, 0.1, 3)

[1mgbc[0m

Testing accuracy score = 0.6875


In [405]:
# SGD
grid('sgd',X_train, y_train)
print()
cross_valid('sgd',X_train, y_train)

Best Hyperparameters: {'loss': 'hinge', 'penalty': 'l2'}

Validation Accuracy: [0.667 0.8   0.667]

Average Validation Accuracy: 0.711

Standard Deviation of Validation Accuracy: 0.063

Training Accuracy: [0.8   0.967 0.9  ]

Average Training Accuracy: 0.889

Standard Deviation of Training Accuracy: 0.068


In [406]:
classier('sgd', X_train, y_train, "hinge", "l2")

[1msgd[0m

Testing accuracy score = 0.5
