# Predict Genetic Disorders

In [174]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, RFE
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [175]:
# Load the training table from 'train.csv' (path is relative to the working
# directory). Every later cell cleans, imputes or encodes this `df_train` frame.
df_train = pd.read_csv('train.csv')

In [190]:
# NOTE(review): this cell's execution count (190) is higher than that of every
# cell below it, and the saved output already lists 'Total Blood Cell Count' and
# integer-coded categorical columns. The statistics shown therefore describe the
# fully processed frame, not the raw CSV. After Restart & Run All this cell
# summarises the raw data instead.
df_train.describe()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass,Total Blood Cell Count
count,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,...,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0,20745.0
mean,6.974211,0.595131,0.392769,0.560617,0.434514,4.898918,34.523355,41.964088,0.496361,0.45182,...,7.48617,1.553242,0.61557,0.557628,0.54524,0.481417,0.448012,0.871584,5.197975,12.386709
std,4.17783,0.490878,0.488378,0.496324,0.495705,0.200022,8.408232,11.145906,0.499999,0.497685,...,2.522119,1.118238,0.486472,0.49668,0.497961,0.499667,0.497302,0.937651,2.15095,2.529646
min,0.0,0.0,0.0,0.0,0.0,4.092727,18.0,20.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.302273
25%,3.0,0.0,0.0,0.0,0.0,4.762823,29.0,35.0,0.0,0.0,...,5.654238,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,10.559937
50%,7.0,1.0,0.0,1.0,0.0,4.899443,35.0,42.0,0.0,0.0,...,7.466348,2.0,1.0,1.0,1.0,0.0,0.0,0.0,6.0,12.402436
75%,10.0,1.0,1.0,1.0,1.0,5.033903,40.0,49.0,1.0,1.0,...,9.278805,3.0,1.0,1.0,1.0,1.0,1.0,2.0,7.0,14.183326
max,14.0,1.0,1.0,1.0,1.0,5.609829,51.0,64.0,1.0,1.0,...,12.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,8.0,17.536404


In [176]:
# Columns removed from the frame before any further processing.
dropped_columns = [
    "Patient Id", "Patient First Name", "Family Name", "Father's name",
    "Location of Institute", "Institute Name", "Place of birth",
    "Test 1", "Test 2", "Test 3", "Test 4", "Test 5",
    "Parental consent",
]
df_train.drop(columns=dropped_columns, inplace=True)

# Per-column placeholder strings that are converted to NaN, so that the later
# imputation cell fills them like any other missing value.
missing_placeholders = {
    "Birth asphyxia": ["No record", "Not available"],
    "Autopsy shows birth defect (if applicable)": ["Not applicable"],
    "H/O radiation exposure (x-ray)": ["Not applicable", "-"],
    "H/O substance abuse": ["Not applicable", "-"],
}
for column_name, placeholders in missing_placeholders.items():
    df_train[column_name] = df_train[column_name].replace(placeholders, np.nan)

In [177]:
# Every disorder subclass belongs to exactly one genetic-disorder category.
# The table is written category-first and then inverted, so lookups keep the
# form disorder_map[subclass] -> category.
_subclasses_by_category = {
    "Mitochondrial genetic inheritance disorders": (
        "Leber's hereditary optic neuropathy",
        "Leigh syndrome",
        "Mitochondrial myopathy",
    ),
    "Multifactorial genetic inheritance disorders": (
        "Alzheimer's",
        "Cancer",
        "Diabetes",
    ),
    "Single-gene inheritance diseases": (
        "Cystic fibrosis",
        "Hemochromatosis",
        "Tay-Sachs",
    ),
}
disorder_map = {
    subclass_name: category
    for category, subclass_names in _subclasses_by_category.items()
    for subclass_name in subclass_names
}

def _fill_missing_from_mode(frame, key_col, target_col):
    """Fill missing ``target_col`` values with the most frequent value per key.

    For every distinct non-null value of ``key_col``, rows that share that key
    and have a null ``target_col`` receive the mode of ``target_col`` among the
    rows with the same key. Keys whose rows carry no non-null ``target_col``
    value are left untouched. ``frame`` is modified in place.
    """
    for key in frame[key_col].dropna().unique():
        same_key = frame[key_col] == key
        modes = frame.loc[same_key, target_col].mode()
        if not modes.empty:
            frame.loc[same_key & frame[target_col].isnull(), target_col] = modes.iloc[0]


# Step 1: a known subclass determines its category through disorder_map.
# Vectorized replacement for the former row-by-row DataFrame.apply(axis=1):
# only null categories are filled, and subclasses absent from the map (or
# missing) map to NaN and therefore leave the category unchanged.
df_train["Genetic Disorder"] = df_train["Genetic Disorder"].fillna(
    df_train["Disorder Subclass"].map(disorder_map)
)

# Step 2: categories that are still missing take the most common category
# observed for the same subclass.
_fill_missing_from_mode(df_train, "Disorder Subclass", "Genetic Disorder")

# Step 3: missing subclasses take the most common subclass observed for the
# same category.
_fill_missing_from_mode(df_train, "Genetic Disorder", "Disorder Subclass")

In [178]:
# Engineered feature: element-wise sum of the two blood-count columns.
# This cell runs before the imputation cell, so a row missing either count
# gets NaN here and the total is imputed later on its own.
# NOTE(review): the column names suggest different units (mcL vs. thousand per
# microliter) — confirm that a plain sum is intended.
blood_cell_count = df_train['Blood cell count (mcL)']
white_cell_count = df_train['White Blood cell count (thousand per microliter)']
df_train['Total Blood Cell Count'] = blood_cell_count + white_cell_count

In [179]:
# Discard rows where either target label is still missing after back-filling.
target_columns = ["Genetic Disorder", "Disorder Subclass"]
df_train = df_train.dropna(subset=target_columns)

In [180]:
def _impute_column(column):
    """Fill the missing entries of one column from that column's own values.

    Object (string) columns are filled with their most frequent value and all
    other columns with their median. A column without missing entries is
    returned unchanged. An object column with no non-null value has no mode to
    draw from, so it is returned unchanged instead of falling through to
    ``median()``, which is not meaningful for string data.
    """
    if not column.isnull().any():
        return column
    if column.dtype == 'object':
        modes = column.mode()
        if modes.empty:
            return column
        return column.fillna(modes.iloc[0])
    return column.fillna(column.median())


# Impute separately inside every (Genetic Disorder, Disorder Subclass) group.
# The groups are iterated explicitly instead of calling groupby(...).apply(...):
# pandas has deprecated applying the function to the grouping columns, and once
# they are excluded the two label columns would disappear from the result.
# (The truncated ".apply(" line in the saved output is presumably that warning.)
# Groups come back in sorted key order, the same row order the apply() version
# produced before reset_index.
_group_keys = ["Genetic Disorder", "Disorder Subclass"]
_imputed_groups = [
    group.apply(_impute_column)
    for _, group in df_train.groupby(_group_keys)
]
if _imputed_groups:  # pd.concat raises on an empty list (empty frame)
    df_train = pd.concat(_imputed_groups).reset_index(drop=True)

  .apply(


In [181]:
# Column labels split by dtype: "object" columns are the ones label-encoded
# further down; every other column goes into numerical_cols.
categorical_cols = df_train.select_dtypes(include="object").columns
numerical_cols = df_train.select_dtypes(exclude="object").columns

In [182]:
# Column whose value counts are shown before and after label encoding, so the
# string -> integer mapping can be read off by comparing the two outputs.
# (The bare `df_train['Status']` expression that used to open this cell was not
# the last expression, so it displayed nothing; it has been removed.)
set_column = "Status"

df_train[set_column].value_counts()

Status
Alive       10448
Deceased    10297
Name: count, dtype: int64

In [183]:
# Integer-encode every object column with its own LabelEncoder.
# The fitted encoders are kept in `label_encoders` (column name -> encoder).
# Re-fitting one shared encoder for every column discards all mappings but the
# last, which makes it impossible to decode predictions or to encode new data
# consistently.
label_encoder = LabelEncoder()  # name kept; after the loop it is the last column's encoder
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    df_train[col] = label_encoder.fit_transform(df_train[col])
    label_encoders[col] = label_encoder

In [184]:
# Same count check after encoding. Compared with the earlier output, the counts
# 10448 / 10297 show that 'Alive' became 0 and 'Deceased' became 1.
df_train[set_column].value_counts()

Status
0    10448
1    10297
Name: count, dtype: int64

In [185]:
# Feature columns used as inputs of the disorder-subclass model.
full_model = [
    "Genes in mother's side",
    "Inherited from father",
    "Maternal gene",
    "Mother's age",
    "Father's age",
    "Status",
    "Follow-up",
    "Birth asphyxia",
    "Autopsy shows birth defect (if applicable)",
    "H/O radiation exposure (x-ray)",
    "H/O substance abuse",
    "White Blood cell count (thousand per microliter)",
    "Blood test result",
    "Symptom 1",
    "Symptom 2",
    "Symptom 3",
    "Symptom 4",
    "Symptom 5",
    "Total Blood Cell Count",
]

In [186]:
# Feature matrix (2-D frame) and target vector (1-D Series) for the subclass model.
x_full = df_train[full_model]
# Single brackets select a Series. The former one-column DataFrame
# (df_train[['Disorder Subclass']]) is what makes scikit-learn emit the
# column_or_1d conversion warnings when the model is fitted.
y_full = df_train['Disorder Subclass']

# Simple Model Prediction

In [187]:
# Hold out 30% of the rows for evaluation; the split is stratified on the target
# and seeded so it is reproducible.
x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(
    x_full, y_full, test_size=0.3, random_state=0, stratify=y_full
)

# Level-0 learner: L1-regularised logistic regression on standardised features.
base_models_f = [
    ('logistic', Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(penalty='l1', solver='liblinear', C=0.5, max_iter=1000)),
    ])),
]

# Level-1 learner: XGBoost. With passthrough=True it receives the original
# features in addition to the base model's predictions.
# `use_label_encoder` has been removed: the installed XGBoost reports it as an
# unused parameter.
model_f = StackingClassifier(
    estimators=base_models_f,
    final_estimator=XGBClassifier(eval_metric='logloss'),
    passthrough=True,
)

# np.ravel hands scikit-learn a 1-D target whether y_train_f is a Series or a
# one-column DataFrame, which avoids the column_or_1d conversion warnings.
model_f.fit(x_train_f, np.ravel(y_train_f))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
Parameters: { "use_label_encoder" } are not used.



In [188]:
# Score the held-out split with plain accuracy over the predicted subclass codes.
y_pred_f = model_f.predict(x_test_f)
test_accuracy_f = accuracy_score(y_test_f, y_pred_f)
# A per-class breakdown is available via classification_report(y_test_f, y_pred_f).
print(test_accuracy_f)

0.711439588688946


In [189]:
# Persist the fitted stacking model. The context manager closes (and flushes)
# the file even if pickling fails; a bare open(...) passed to pickle.dump leaves
# the handle open until garbage collection.
# Only the model is written here. Anything needed to decode its integer
# predictions must be saved separately.
# NOTE: unpickle this file only from a trusted location, because pickle.load can
# execute arbitrary code.
filename = 'disorder_subclass_full.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model_f, model_file)