## Model Development

In [47]:
import pandas as pd
import numpy as np
import sys

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from pathlib import Path
import os

import datetime
import pickle 

In [50]:
import utils.numerical_attr_eda_utils as num_eda_utils
import utils.categorical_attr_eda_utils as cat_eda_utils
import utils.all_attr_eda_utils as all_attr_eda_utils
import utils.attr_eda_utils as attr_eda_utils
import utils.assign_and_lab_utils as al_utils
import utils.multi_class_target_encoder_utils as mc_te_utils
import utils.classification_utils as class_utils
import utils.classifier_hyp_param_grid as cl_hpg

In [3]:
train_test_split_random_state = 42
train_validation_split_random_state = 42
fast_script_dev = False  
model_random_state = 42
test_size = 0.20
target_attr ='Genetic_disorder'
prediction_task_type = 'classification'
sgd_max_iter = 10000
binary = False
missingness_threshold = 0.20
calibrate_classifiers = True

In [4]:
genetic_df = pd.read_csv('data/genetic_df.csv',index_col=0)
genetic_df.head()

Unnamed: 0,Patient_age,Genes_mother_side,Inherited_father,Maternal_gene,Paternal_gene,Blood_cell_count,Status,Respiratory_rate,Heart_rate,Follow_up,...,Birth_defects,White_blood_cell_count,Blood_test,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Genetic_disorder,Disorder_subclass
0,2.0,Yes,No,Yes,No,4.760603,Alive,Normal (30-60),Normal,High,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,4.0,Yes,Yes,No,No,4.910669,Deceased,Tachypnea,Normal,High,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,6.0,Yes,No,No,No,4.893297,Alive,Normal (30-60),Tachycardia,Low,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,12.0,Yes,No,Yes,No,4.70528,Deceased,Tachypnea,Normal,High,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,11.0,Yes,No,,Yes,4.720703,Alive,Tachypnea,Tachycardia,Low,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


In [5]:
genetic_df = genetic_df.copy()

In [6]:
genetic_df.columns

Index(['Patient_age', 'Genes_mother_side', 'Inherited_father', 'Maternal_gene',
       'Paternal_gene', 'Blood_cell_count', 'Status', 'Respiratory_rate',
       'Heart_rate', 'Follow_up', 'Gender', 'Folic_acid',
       'Assisted_conception', 'History_previous_pregnancies',
       'Previous_abortions', 'Birth_defects', 'White_blood_cell_count',
       'Blood_test', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Genetic_disorder', 'Disorder_subclass'],
      dtype='object')

In [7]:
genetic_df.isna().sum()

Patient_age                     1427
Genes_mother_side                  0
Inherited_father                 306
Maternal_gene                   2810
Paternal_gene                      0
Blood_cell_count                   0
Status                             0
Respiratory_rate                2149
Heart_rate                      2113
Follow_up                       2166
Gender                          2173
Folic_acid                      2117
Assisted_conception             2122
History_previous_pregnancies    2172
Previous_abortions              2162
Birth_defects                   2154
White_blood_cell_count          2148
Blood_test                      2145
Symptom_1                       2155
Symptom_2                       2222
Symptom_3                       2101
Symptom_4                       2113
Symptom_5                       2153
Genetic_disorder                2146
Disorder_subclass               2168
dtype: int64

Remove rows with missing target values.

In [8]:
# Filter rows where the target_column is missing
genetic_df.dropna(subset=['Genetic_disorder'],inplace=True)


We have an imbalanced dataset. some classes are much more frequent than others

In [9]:
#genetic_df["Disorder_subclass"].value_counts()

In [10]:
genetic_df["Genetic_disorder"].value_counts()

Genetic_disorder
Mitochondrial genetic inheritance disorders     10202
Single-gene inheritance diseases                 7664
Multifactorial genetic inheritance disorders     2071
Name: count, dtype: int64

In [11]:
le = LabelEncoder()
# Encode 'Disorder_subclass'
#genetic_df['Disorder_subclass'] = le.fit_transform(genetic_df['Disorder_subclass'])

# Encode 'Genetic_disorder'
genetic_df['Genetic_disorder'] = le.fit_transform(genetic_df['Genetic_disorder'])

In [12]:
X = genetic_df.drop(['Genetic_disorder','Disorder_subclass'], axis=1)
y = genetic_df['Genetic_disorder']

In [13]:
print(genetic_df['Genetic_disorder'].unique())

[0 1 2]


## assign predictors to data type lists

In [14]:
numerical_attr = ['Patient_age','Blood_cell_count','Previous_abortions',
                    'White_blood_cell_count']

nominal_attr =  ['Genes_mother_side','Inherited_father','Maternal_gene','Paternal_gene','Status','Respiratory_rate',
                         'Heart_rate','Follow_up','Gender','Folic_acid','Assisted_conception','History_previous_pregnancies',
                         'Birth_defects','Blood_test','Symptom_1', 'Symptom_2','Symptom_3', 'Symptom_4', 'Symptom_5']

target = ['Genetic_disorder']
assert X.shape[1] == len(numerical_attr) + len(nominal_attr)

In [15]:
concern_list = all_attr_eda_utils.check_for_complete_unique_attrs(X)
print(f'\nconcern_list:\n{concern_list}', sep='')

Patient_age; 15; float64; 19937 
Genes_mother_side; 2; object; 19937 
Inherited_father; 2; object; 19937 
Maternal_gene; 2; object; 19937 
Paternal_gene; 2; object; 19937 
Blood_cell_count; 19937; float64; 19937 examine more closely
Status; 2; object; 19937 
Respiratory_rate; 2; object; 19937 
Heart_rate; 2; object; 19937 
Follow_up; 2; object; 19937 
Gender; 3; object; 19937 
Folic_acid; 2; object; 19937 
Assisted_conception; 2; object; 19937 
History_previous_pregnancies; 2; object; 19937 
Previous_abortions; 5; float64; 19937 
Birth_defects; 2; object; 19937 
White_blood_cell_count; 15681; float64; 19937 
Blood_test; 4; object; 19937 
Symptom_1; 2; float64; 19937 
Symptom_2; 2; float64; 19937 
Symptom_3; 2; float64; 19937 
Symptom_4; 2; float64; 19937 
Symptom_5; 2; float64; 19937 

concern_list:
['Blood_cell_count']


## Train/test split

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20, random_state=42, shuffle=True,stratify=y)
print ('Train observations: %d\nTest observations: %d' % (X_train.shape[0], X_test.shape[0]))

Train observations: 15949
Test observations: 3988


## Train/validation split

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, test_size=0.20, random_state=42, shuffle=True,stratify=y_train)
print ('Train observations: %d\nvalidation observations: %d' % (X_train.shape[0], X_val.shape[0]))

Train observations: 12759
validation observations: 3190


In [18]:
estimator_names = [
    'SGDClassifier', 
    'DecisionTreeClassifier', 
    'RandomForestClassifier', 
    'AdaBoostClassifier', 
    'GradientBoostingClassifier'
]

estimator_list = [
    
    SGDClassifier(loss='log_loss', random_state=model_random_state, class_weight='balanced',
                  max_iter=sgd_max_iter),  # logistic regr
    
    DecisionTreeClassifier(criterion='log_loss', random_state=model_random_state, class_weight='balanced'),
    
    RandomForestClassifier(criterion='log_loss', random_state=model_random_state, 
                           class_weight='balanced_subsample'),
    
     AdaBoostClassifier(
         estimator=DecisionTreeClassifier(
             criterion='log_loss', 
             random_state=model_random_state, 
             class_weight='balanced',
             max_depth=1
         ),
         random_state=model_random_state
     ),
    
     GradientBoostingClassifier(loss='log_loss', random_state=model_random_state),
    
]

## Fit the models and evaluate performance on the train set

In [51]:
print_plots = True
class_eval_dict = {
    'multiclass': binary,  # Assuming you have a multiclass evaluation function
    'scoring': 'accuracy',  # You can choose an appropriate metric for multiclass classification
    'get_precision_recall_curves': [True, 
                                    {'print_prc': print_plots, 
                                     'print_prd': print_plots,
                                    }],
    'get_roc_curve': [True, 
                      {
                        'print_roc': print_plots,
                      }]
}

default_train_compare_df, trained_default_estimator_dict = \
    al_utils.fit_collection_of_estimators(
        numerical_attr, 
        nominal_attr, 
        estimator_names, 
        estimator_list, 
        X_train, 
        y_train, 
        data_set_type='train', 
        model_selection_stage='default',
        prediction_task_type='classification',
        class_eval_dict=class_eval_dict
)
default_train_compare_df


KeyError: 'binary'

### evaluate the performance of the trained estimators on the validation set

In [20]:
print_plots = False
class_eval_dict={
    'binary': binary,
    'scoring': 'average_precision',
    'get_precision_recall_curves': [True, 
                                    {'print_prc': print_plots, 
                                     'print_prd': print_plots,
                                    }],
    'get_roc_curve': [True, 
                      {
                        'print_roc': print_plots,
                      }]
}


default_validation_compare_df = al_utils.eval_trained_estimators_in_trained_estimator_dict_class(
    trained_default_estimator_dict, 
    X_val, 
    y_val, 
    data_set_type='validation',
    model_selection_stage='default', 
    class_eval_dict=class_eval_dict
)
default_validation_compare_df


**************************************************
default of the SGDClassifier estimator predicting on the validation data set

roc_auc_score_: {0: 0.605, 1: 0.82, 2: 0.542}
ave_roc_auc_score_: 0.6558161038881893

ave_precision_score: {0: 0.588, 1: 0.343, 2: 0.435}
ave_ave_precision_score: 0.4552133460906673

**************************************************
default of the DecisionTreeClassifier estimator predicting on the validation data set

roc_auc_score_: {0: 0.565, 1: 0.6, 2: 0.541}
ave_roc_auc_score_: 0.5685902735919779

ave_precision_score: {0: 0.548, 1: 0.154, 2: 0.406}
ave_ave_precision_score: 0.369464889923125

**************************************************
default of the RandomForestClassifier estimator predicting on the validation data set

roc_auc_score_: {0: 0.702, 1: 0.844, 2: 0.629}
ave_roc_auc_score_: 0.7252414248679347

ave_precision_score: {0: 0.673, 1: 0.356, 2: 0.512}
ave_ave_precision_score: 0.5135284189454845

**********************************************

Unnamed: 0,ave_ave_precision_score,ave_precision_score,ave_roc_auc_score_,roc_auc_score_,estimator,data_set_type,model_selection_stage,number_of_attrs,attrs
0,0.455213,"{0: 0.588, 1: 0.343, 2: 0.435}",0.655816,"{0: 0.605, 1: 0.82, 2: 0.542}",SGDClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
1,0.369465,"{0: 0.548, 1: 0.154, 2: 0.406}",0.56859,"{0: 0.565, 1: 0.6, 2: 0.541}",DecisionTreeClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
2,0.513528,"{0: 0.673, 1: 0.356, 2: 0.512}",0.725241,"{0: 0.702, 1: 0.844, 2: 0.629}",RandomForestClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
3,0.447104,"{0: 0.496, 1: 0.359, 2: 0.486}",0.64767,"{0: 0.546, 1: 0.825, 2: 0.572}",AdaBoostClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
4,0.510867,"{0: 0.642, 1: 0.386, 2: 0.505}",0.706379,"{0: 0.677, 1: 0.842, 2: 0.601}",GradientBoostingClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."


## assemble a data frame of default estimator performance on the train and validation stage

In [21]:
compare_df = pd.concat([default_train_compare_df, default_validation_compare_df], axis=0).\
    sort_values(['estimator', 'data_set_type', 'model_selection_stage'])
compare_df

Unnamed: 0,ave_ave_precision_score,ave_precision_score,ave_roc_auc_score_,roc_auc_score_,estimator,data_set_type,model_selection_stage,number_of_attrs,attrs
3,0.467781,"{0: 0.503, 1: 0.4, 2: 0.501}",0.65878,"{0: 0.562, 1: 0.841, 2: 0.574}",AdaBoostClassifier,train,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
3,0.447104,"{0: 0.496, 1: 0.359, 2: 0.486}",0.64767,"{0: 0.546, 1: 0.825, 2: 0.572}",AdaBoostClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
1,1.0,"{0: 1.0, 1: 1.0, 2: 1.0}",1.0,"{0: 1.0, 1: 1.0, 2: 1.0}",DecisionTreeClassifier,train,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
1,0.369465,"{0: 0.548, 1: 0.154, 2: 0.406}",0.56859,"{0: 0.565, 1: 0.6, 2: 0.541}",DecisionTreeClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
4,0.59694,"{0: 0.751, 1: 0.452, 2: 0.588}",0.769586,"{0: 0.756, 1: 0.868, 2: 0.686}",GradientBoostingClassifier,train,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
4,0.510867,"{0: 0.642, 1: 0.386, 2: 0.505}",0.706379,"{0: 0.677, 1: 0.842, 2: 0.601}",GradientBoostingClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
2,1.0,"{0: 1.0, 1: 1.0, 2: 1.0}",1.0,"{0: 1.0, 1: 1.0, 2: 1.0}",RandomForestClassifier,train,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
2,0.513528,"{0: 0.673, 1: 0.356, 2: 0.512}",0.725241,"{0: 0.702, 1: 0.844, 2: 0.629}",RandomForestClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
0,0.449012,"{0: 0.57, 1: 0.354, 2: 0.423}",0.647054,"{0: 0.582, 1: 0.821, 2: 0.538}",SGDClassifier,train,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."
0,0.455213,"{0: 0.588, 1: 0.343, 2: 0.435}",0.655816,"{0: 0.605, 1: 0.82, 2: 0.542}",SGDClassifier,validation,default,23,"[Patient_age, Genes_mother_side, Inherited_fat..."


## Hyperparameter tuning

In [22]:
alpha_points = 5
l1_ratio_points = 5
m_points = 5

hyp_param_tuning_exp_dict = cl_hpg.get_hyp_param_tuning_exp_dict(
    estimator_names,
    estimator_list, 
    alpha_points, 
    l1_ratio_points, 
    m_points, 
    X_train, 
    binary=True,
    fast_script_dev=fast_script_dev, 
    print_param_grids=True
)


************************************************************
************************************************************
SGDClassifier(class_weight='balanced', loss='log_loss', max_iter=10000,
              random_state=42)
{'estimator__penalty': ['l2'], 'estimator__alpha': [0.0001], 'estimator__l1_ratio': [0.15], 'estimator__n_jobs': [None], 'preprocessor__numerical__imputer__strategy': ['mean', 'median'], 'preprocessor__nominal__target_encoder__smooth': ['auto']}

************************************************************
************************************************************
DecisionTreeClassifier(class_weight='balanced', criterion='log_loss',
                       random_state=42)
{'estimator__criterion': ['gini', 'entropy'], 'estimator__splitter': ['best', 'random'], 'estimator__max_depth': [None, 10, 20], 'estimator__max_features': ['auto', 'sqrt', 'log2'], 'preprocessor__numerical__imputer__strategy': ['mean', 'median'], 'preprocessor__nominal__target_encoder__smooth'

## perform a grid search over hyper parameters to select best model

In [34]:
y_train_df = pd.DataFrame(y_train)
y_train_df.head()

Unnamed: 0,Genetic_disorder
3764,2
19106,1
11608,1
14608,0
12647,0


In [45]:
print_plots = False
class_eval_dict={
    'binary': binary,
    'scoring': 'average_precision',
    'get_precision_recall_curves': [True, 
                                    {'print_prc': print_plots, 
                                     'print_prd': print_plots,
                                     'data_set_name': '',  # this is here to make things work - a bit of a hack
                                     'model_selection_stage': '',  # this is here to make things work - a bit of 
                                                                   # a hack
                                    }],
    'get_roc_curve': [True, 
                      {
                        'print_roc': print_plots,
                        'data_set_name': '',  # this is here to make things work - a bit of a hack
                        'model_selection_stage': '',  # this is here to make things work - a bit of a hack
                      }]
}

grid_search_cv_results_df, _ = \
    al_utils.grid_search_cv_wrapper(
        estimator_names,
        hyp_param_tuning_exp_dict, 
        numerical_attr, 
        nominal_attr,
        X_train, 
        y_train_df, 
        target_attr,
        prediction_task_type='classification',
        class_eval_dict=class_eval_dict
)
grid_search_cv_results_df


********************************************************************************
SGDClassifier
average_precision
Fitting 5 folds for each of 2 candidates, totalling 10 fits


ValueError: Invalid parameter 'smooth' for estimator PolynomialWrapper(feature_encoder=TargetEncoder()). Valid parameters are: ['feature_encoder'].

## build out the preprocessor

We built an numerical transformer

In [None]:
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), 
           ("scaler", StandardScaler())]
)

We built a nominal transformer

In [None]:
# Create a nominal transformer with one-hot encoding
nominal_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ('target_encoder', TargetEncoder(target_type='continuous', random_state=42)),
    ("scaler", StandardScaler())]
)

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', numerical_transformer, numerical_attr),
        ('nominal', nominal_transformer, nominal_attr)
    ]
)
preprocessor

In [None]:
X_transformed = preprocessor.fit_transform(X, y.values.ravel())
cap_x_transformed = pd.DataFrame(X_transformed, columns=X.columns)
cap_x_transformed.isna().sum()

In [None]:
cap_x_transformed.head()

## train test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cap_x_transformed,y, test_size=0.20, random_state=42, shuffle=True,stratify=y)
print ('Train observations: %d\nTest observations: %d' % (X_train.shape[0], X_test.shape[0]))

In [None]:
y_train.value_counts()

In [None]:
X_train.to_csv('X_train.csv')
y_train.to_csv('y_train.csv')
X_test.to_csv('X_test.csv')
y_test.to_csv('y_test.csv')

In [None]:
del X_test
del y_test

In [None]:
estimator_names = [
    'SGDClassifier', 
    'DecisionTreeClassifier', 
    'RandomForestClassifier', 
    'AdaBoostClassifier', 
    'GradientBoostingClassifier'
]

estimator_list = [
    
    SGDClassifier(loss='log_loss', random_state=model_random_state, class_weight='balanced',
                  max_iter=sgd_max_iter),  # logistic regr
    
    DecisionTreeClassifier(criterion='log_loss', random_state=model_random_state, class_weight='balanced'),
    
    RandomForestClassifier(criterion='log_loss', random_state=model_random_state, 
                           class_weight='balanced_subsample'),
    
     AdaBoostClassifier(
         estimator=DecisionTreeClassifier(
             criterion='log_loss', 
             random_state=model_random_state, 
             class_weight='balanced',
             max_depth=1
         ),
         random_state=model_random_state
     ),
    
     GradientBoostingClassifier(loss='log_loss', random_state=model_random_state),
    
]

In [None]:
print_plots = True
class_eval_dict={
    'binary': binary,
    'scoring': 'average_precision',
    'get_precision_recall_curves': [True, 
                                    {'print_prc': print_plots, 
                                     'print_prd': print_plots,
                                    }],
    'get_roc_curve': [True, 
                      {
                        'print_roc': print_plots,
                      }]
}

default_train_compare_df, trained_default_estimator_dict = \
    al_utils.fit_collection_of_estimators(
        numerical_attr, 
        nominal_attr, 
        estimator_names, 
        estimator_list, 
        X_train, 
        y_train, 
        data_set_type='train', 
        model_selection_stage='default',
        prediction_task_type='classification',
        class_eval_dict=class_eval_dict
)
default_train_compare_df