In [1]:
import pandas as pd
import numpy as np
import sys

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

import time

from pathlib import Path
import os

import datetime
import pickle 

In [2]:
import utils.numerical_attr_eda_utils as num_eda_utils
import utils.categorical_attr_eda_utils as cat_eda_utils
import utils.all_attr_eda_utils as all_attr_eda_utils
import utils.attr_eda_utils as attr_eda_utils
import utils.assign_and_lab_utils as al_utils
import utils.multi_class_target_encoder_utils as mc_te_utils
import utils.classification_utils as class_utils
import utils.classifier_hyp_param_grid as cl_hpg

## Parameters

In [3]:
path_to_data = 'data/genetic_disorder.csv'

train_test_split_random_state = 42
train_validation_split_random_state = 42
fast_script_dev = False  
model_random_state = 42
test_size = 0.20
target_attr = 'Genetic Disorder'
prediction_task_type = 'classification'
sgd_max_iter = 10000
binary = False
missingness_threshold = 0.20
calibrate_classifiers = True

## Import Data

In [4]:
genetic_df = pd.read_csv(path_to_data)
print(genetic_df.shape)
genetic_df.head()

(22083, 45)


Unnamed: 0,Patient Id,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Patient First Name,Family Name,Father's name,...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,PID0x6418,2.0,Yes,No,Yes,No,4.760603,Richard,,Larre,...,,9.857562,,1.0,1.0,1.0,1.0,1.0,Mitochondrial genetic inheritance disorders,Leber's hereditary optic neuropathy
1,PID0x25d5,4.0,Yes,Yes,No,No,4.910669,Mike,,Brycen,...,Multiple,5.52256,normal,1.0,,1.0,1.0,0.0,,Cystic fibrosis
2,PID0x4a82,6.0,Yes,No,No,No,4.893297,Kimberly,,Nashon,...,Singular,,normal,0.0,1.0,1.0,1.0,1.0,Multifactorial genetic inheritance disorders,Diabetes
3,PID0x4ac8,12.0,Yes,No,Yes,No,4.70528,Jeffery,Hoelscher,Aayaan,...,Singular,7.919321,inconclusive,0.0,0.0,1.0,0.0,0.0,Mitochondrial genetic inheritance disorders,Leigh syndrome
4,PID0x1bf7,11.0,Yes,No,,Yes,4.720703,Johanna,Stutzman,Suave,...,Multiple,4.09821,,0.0,0.0,0.0,0.0,,Multifactorial genetic inheritance disorders,Cancer


## convert target_attr to numerical encoding using label encoder

In [5]:
le = LabelEncoder()
# Encode 'Disorder_subclass'
#genetic_df['Disorder_subclass'] = le.fit_transform(genetic_df['Disorder_subclass'])

# Encode 'Genetic_disorder'
genetic_df['Genetic Disorder'] = le.fit_transform(genetic_df['Genetic Disorder'])

In [6]:
X = genetic_df.drop(['Genetic Disorder','Disorder Subclass'], axis=1)
y = genetic_df['Genetic Disorder']

In [7]:
print(genetic_df['Genetic Disorder'].unique())

[0 3 1 2]
