In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Load the mouse B-Enac summary and discard its 'Genotype' column before
# reporting the (rows, columns) shape.
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary_new.csv').drop(columns='Genotype')
df_mouse_b_enac.shape

(28, 8)

In [3]:
# The CSV carries an extra 'Genotype' column; remove it right after loading and
# report the (rows, columns) shape of what is left.
df_mouse_mps = pd.read_csv('mouse_mps_summary.csv').drop(columns='Genotype')
df_mouse_mps.shape

(33, 8)

In [4]:
# Load the rat PA summary and discard its 'Genotype' column before reporting
# the (rows, columns) shape.
df_rat_pa = pd.read_csv('rat_pa_summary.csv').drop(columns='Genotype')
df_rat_pa.shape

(83, 8)

In [5]:
# Load the rat sterile baseline summary CSV and show its (rows, columns) shape.
df_rat_sterile_baseline = pd.read_csv('rat_sterile_baseline_summary.csv')
df_rat_sterile_baseline.shape

(51, 8)

In [6]:
# Load the rat sterile post-beads summary CSV and show its (rows, columns) shape.
df_rat_sterile_post_beads = pd.read_csv('rat_sterile_post_beads_summary.csv')
df_rat_sterile_post_beads.shape

(53, 8)

In [7]:
# Load the sheep 01_60 summary CSV and show its (rows, columns) shape.
df_sheep_01_60 = pd.read_csv('sheep_01_60_summary.csv')
df_sheep_01_60.shape

(62, 8)

# SVM

# Rat bead study (baseline vs post beads)

In [30]:
# Tag every row with the file it came from ('baseline' / 'post_beads'), then
# stack the two tables into one frame with a fresh 0..n-1 index.
# NOTE(review): 'target' is written into the two loaded frames in place, so
# they gain a column compared with the shapes shown in their load cells.
df_rat_sterile_baseline['target'] = 'baseline'
df_rat_sterile_post_beads['target'] = 'post_beads'
df_rat_combine = pd.concat([df_rat_sterile_baseline, df_rat_sterile_post_beads], ignore_index=True)

In [31]:
def features(df):
    """Return a copy of ``df`` reduced to the modelling columns.

    The bookkeeping columns 'ScanName' and 'Date Prepared' and the percentage
    columns 'VDP(%)', 'VHSS(%)' and 'VHLS(%)' are removed. When a 'Genotype'
    column is present its values are stored under 'target' (appended as the
    last column if 'target' does not exist yet) and 'Genotype' itself is
    removed. The input frame is left untouched.

    Raises
    ------
    KeyError
        If any of the five columns to remove is missing from ``df``.
    """
    unused_columns = ['ScanName', 'Date Prepared', 'VDP(%)', 'VHSS(%)', 'VHLS(%)']
    trimmed = df.drop(columns=unused_columns)

    # Nothing to relabel: the caller already supplied (or does not need) 'target'.
    if 'Genotype' not in trimmed.columns:
        return trimmed

    # Expose the genotype as the label column, then discard the original.
    trimmed['target'] = trimmed['Genotype']
    return trimmed.drop(columns=['Genotype'])

# Reduce the combined rat bead table to the modelling columns plus 'target'.
# NOTE(review): the cell overwrites its own input, so running it a second time
# without rebuilding df_rat_combine raises KeyError (the columns are already gone).
df_rat_combine = features(df_rat_combine) 
df_rat_combine.shape

(104, 4)

In [32]:
def data_split(df, test_size, random_state=42):
    """Split a feature/label table into stratified train and test arrays.

    The last column of ``df`` is used as the class label and every other
    column as a feature. The split is stratified on the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose last column holds the class label.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``. The default keeps the value
        that used to be hard-coded, so existing calls give the same split.

    Returns
    -------
    X_train, y_train, X_test, y_test : numpy.ndarray
        Feature matrices are float arrays; the label vectors come straight
        from the last column.

    Raises
    ------
    ValueError
        If a feature column cannot be converted to float.
    """
    # Separate features from labels *before* converting to NumPy. Calling
    # ``df.values`` on the whole frame mixes numbers with string labels and
    # produces an object-dtype array, which then leaks into X_train / X_test.
    X = df.iloc[:, :-1].to_numpy(dtype=float)
    y = df.iloc[:, -1].to_numpy()

    # train_test_split returns (X_train, X_test, y_train, y_test); the same
    # seed and stratification labels select the same rows as before.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    print(f'Shapes are {[X_train.shape,y_train.shape,X_test.shape,y_test.shape]}')

    return X_train, y_train, X_test, y_test

# Hold out 20% of the rat bead rows as the test set.
X_train, y_train, X_test, y_test = data_split(df_rat_combine, test_size=0.2)



Shapes are [(83, 3), (83,), (21, 3), (21,)]


In [33]:
# Preprocessing applied ahead of the classifier: fill missing values with the
# column median, then standardise each feature.
preproc_pl = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

In [34]:
# Generic SVM fit-and-report helper; it is called once per study.
def svm(X_train,y_train,X_test,y_test,name,kernel,c):
    """Fit a preprocessing + SVC pipeline and print train/test accuracy.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Held-out features and labels.
    name : str
        Title printed in bold above the scores.
    kernel : str
        Passed to ``SVC(kernel=...)``.
    c : float
        Passed to ``SVC(C=...)``.

    Returns
    -------
    None
        The scores are only printed, so calling cells show no ``Out[]`` value.
    """
    # Use a fresh, unfitted copy of the shared preprocessing pipeline.
    # Pipeline.fit fits its steps in place, so embedding ``preproc_pl`` itself
    # would refit the module-level object on every call.
    svm_pl = Pipeline([('preproc', clone(preproc_pl)),
                       ('svc', SVC(kernel=kernel, C=c, random_state=42))])
    svm_pl.fit(X_train, y_train)

    acc_train = accuracy_score(y_train, svm_pl.predict(X_train))
    acc_test = accuracy_score(y_test, svm_pl.predict(X_test))

    # '\033[1m' ... '\033[0m' are the ANSI codes that switch bold on and off.
    print('\033[1m' + name + '\033[0m')
    print()
    print(f'Training accuracy score = {acc_train}')
    print(f'Testing accuracy score = {acc_test}')
    
# Linear-kernel SVM with C=1 on the rat bead split.
svm(X_train, y_train, X_test, y_test,
    name='Rat bead study (baseline vs post beads)', kernel='linear', c=1)

[1mRat bead study (baseline vs post beads)[0m

Training accuracy score = 0.8795180722891566
Testing accuracy score = 0.8571428571428571


# Rat_PA study (WT, KO, CF)

In [35]:
# Read the rat PA summary including its 'Genotype' column and keep only the
# WT, KO and CF rows, stacked in that order; rows with any other genotype
# value are left out.
df_rat_pa = pd.read_csv('rat_pa_summary.csv')
df_rat_WT = df_rat_pa.loc[df_rat_pa['Genotype'] == 'WT']
df_rat_KO = df_rat_pa.loc[df_rat_pa['Genotype'] == 'KO']
df_rat_CF = df_rat_pa.loc[df_rat_pa['Genotype'] == 'CF']

# features() moves 'Genotype' into the trailing 'target' column.
df_rat_combine = features(
    pd.concat([df_rat_WT, df_rat_KO, df_rat_CF], ignore_index=True)
)
df_rat_combine.shape

(82, 4)

In [36]:
# Hold out 20% of the rat PA rows as the test set.
X_train, y_train, X_test, y_test = data_split(df_rat_combine, test_size=0.2)

Shapes are [(65, 3), (65,), (17, 3), (17,)]


In [37]:
# Polynomial-kernel SVM with C=1 on the three-class rat PA split.
svm(X_train, y_train, X_test, y_test,
    name='Rat_PA study (WT, KO, CF)', kernel='poly', c=1)

[1mRat_PA study (WT, KO, CF)[0m

Training accuracy score = 0.6153846153846154
Testing accuracy score = 0.4117647058823529


# Mouse B-Enac study (B-Enac vs WT)

In [129]:
# Read the mouse B-Enac summary including its 'Genotype' column and keep the
# WT rows followed by the B_Enac rows.
df_mouse_b_enac = pd.read_csv('mouse_b_enac_summary_new.csv')
df_WT = df_mouse_b_enac.loc[df_mouse_b_enac['Genotype'] == 'WT']
df_b_enac = df_mouse_b_enac.loc[df_mouse_b_enac['Genotype'] == 'B_Enac']

# features() moves 'Genotype' into the trailing 'target' column.
df_mouse_combine = features(
    pd.concat([df_WT, df_b_enac], ignore_index=True)
)

df_mouse_combine.shape

(28, 4)

In [130]:
# Hold out 30% of the mouse B-Enac rows as the test set.
X_train, y_train, X_test, y_test = data_split(df_mouse_combine, test_size=0.3)

Shapes are [(19, 3), (19,), (9, 3), (9,)]


In [132]:
# Linear-kernel SVM with C=1 on the mouse B-Enac split.
svm(X_train, y_train, X_test, y_test,
    name='Mouse B-Enac study (B-Enac vs WT)', kernel='linear', c=1)

[1mMouse B-Enac study (B-Enac vs WT)[0m

Training accuracy score = 1.0
Testing accuracy score = 0.8888888888888888


# Mouse MPS study (MPS_I vs WT vs Het)

In [133]:
# Read the mouse MPS summary including its 'Genotype' column and keep only the
# WT, 'MPS I' and Het rows, stacked in that order.
df_mouse_mps = pd.read_csv('mouse_mps_summary.csv')
df_WT = df_mouse_mps.loc[df_mouse_mps['Genotype'] == 'WT']
df_MPS_I = df_mouse_mps.loc[df_mouse_mps['Genotype'] == 'MPS I']
df_Het = df_mouse_mps.loc[df_mouse_mps['Genotype'] == 'Het']

# features() moves 'Genotype' into the trailing 'target' column.
df_mouse_combine = features(
    pd.concat([df_WT, df_MPS_I, df_Het], ignore_index=True)
)
df_mouse_combine.shape

(33, 4)

In [134]:
# Hold out 30% of the mouse MPS rows as the test set.
X_train, y_train, X_test, y_test = data_split(df_mouse_combine, test_size=0.3)

Shapes are [(23, 3), (23,), (10, 3), (10,)]


In [135]:
# Linear-kernel SVM with C=1 on the three-class mouse MPS split.
svm(X_train, y_train, X_test, y_test,
    name='Mouse MPS study (MPS_I vs WT vs Het)', kernel='linear', c=1)

[1mMouse MPS study (MPS_I vs WT vs Het)[0m

Training accuracy score = 0.6086956521739131
Testing accuracy score = 0.4
