# Predict Genetic Disorders

In [4]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, RFE
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [5]:
df_train = pd.read_csv('train.csv')

In [6]:
df_train.drop(["Patient Id","Patient First Name","Family Name","Father's name","Location of Institute","Institute Name","Place of birth",'Test 1',
       'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent'],inplace=True,axis=1)

df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("No record",np.nan)
df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("Not available",np.nan)

df_train["Autopsy shows birth defect (if applicable)"] = df_train["Autopsy shows birth defect (if applicable)"].replace("Not applicable",np.nan)

df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("Not applicable",np.nan)
df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("-",np.nan)

df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("Not applicable",np.nan)
df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("-",np.nan)

In [7]:
disorder_map = {
        "Leber's hereditary optic neuropathy": "Mitochondrial genetic inheritance disorders",
        "Leigh syndrome": "Mitochondrial genetic inheritance disorders",
        "Mitochondrial myopathy": "Mitochondrial genetic inheritance disorders",
        "Alzheimer's": "Multifactorial genetic inheritance disorders",
        "Cancer": "Multifactorial genetic inheritance disorders",
        "Diabetes": "Multifactorial genetic inheritance disorders",
        "Cystic fibrosis": "Single-gene inheritance diseases",
        "Hemochromatosis": "Single-gene inheritance diseases",
        "Tay-Sachs": "Single-gene inheritance diseases",
}

df_train["Genetic Disorder"] = df_train.apply(
    lambda row: disorder_map[row["Disorder Subclass"]]
    if pd.isnull(row["Genetic Disorder"]) and row["Disorder Subclass"] in disorder_map
    else row["Genetic Disorder"], axis=1
)

for subclass in df_train['Disorder Subclass'].dropna().unique():
    mode_genetic_disorder = df_train[df_train['Disorder Subclass'] == subclass]['Genetic Disorder'].mode()
    if not mode_genetic_disorder.empty:
        df_train.loc[
            (df_train['Disorder Subclass'] == subclass) & (df_train['Genetic Disorder'].isnull()),
            'Genetic Disorder'
        ] = mode_genetic_disorder[0]

for disorder in df_train['Genetic Disorder'].dropna().unique():
    mode_disorder_subclass = df_train[df_train['Genetic Disorder'] == disorder]['Disorder Subclass'].mode()
    if not mode_disorder_subclass.empty:
        df_train.loc[
            (df_train['Genetic Disorder'] == disorder) & (df_train['Disorder Subclass'].isnull()),
            'Disorder Subclass'
        ] = mode_disorder_subclass[0]

In [8]:
df_train['Total Blood Cell Count'] = df_train['Blood cell count (mcL)'] + df_train['White Blood cell count (thousand per microliter)']

In [9]:
df_train = df_train.dropna(subset=["Genetic Disorder", "Disorder Subclass"])

In [10]:
df_train = (
    df_train.groupby(["Genetic Disorder", "Disorder Subclass"])
    .apply(
        lambda group: group.apply(
            lambda column: column.fillna(
                column.mode()[0] if column.dtype == 'object' and not column.mode().empty
                else column.median()
            )
            if column.isnull().any()
            else column
        )
    )
    .reset_index(drop=True)
)

  .apply(


In [11]:
numerical_cols = df_train.select_dtypes(exclude=["object"]).columns
categorical_cols = df_train.select_dtypes(include=["object"]).columns

In [12]:
label_encoder = LabelEncoder()
for col in categorical_cols:
    df_train[col] = label_encoder.fit_transform(df_train[col])

full_model=['Genes in mother\'s side', 'Inherited from father', 'Maternal gene',
       'Mother\'s age', 'Father\'s age', 'Status', 'Follow-up', 'Birth asphyxia',
       'Autopsy shows birth defect (if applicable)',
       'H/O radiation exposure (x-ray)', 'H/O substance abuse',
       'White Blood cell count (thousand per microliter)', 'Blood test result',
       'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5',
       'Total Blood Cell Count']

In [13]:
x_full=df_train[full_model]
y_full=df_train[['Disorder Subclass']]

NameError: name 'full_model' is not defined

# Full Model Prediction

In [None]:
x_train_f  , x_test_f , y_train_f, y_test_f = train_test_split (x_full ,y_full ,test_size = 0.3 , random_state = 0,stratify=y_full)

base_models_f = [
    ('logistic', Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(penalty='l1', solver='liblinear', C=0.5, max_iter=1000))
    ])),
]

model_f = StackingClassifier(
    estimators=base_models_f,
    final_estimator=XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    learning_rate=0.1,
    max_depth=7,
    n_estimators=50,
    subsample=0.7,
    colsample_bytree=1.0,
    gamma=0.1
    ),
    passthrough=True
)

model_f.fit(x_train_f, y_train_f)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
Parameters: { "use_label_encoder" } are not used.



In [None]:
y_pred_f=model_f.predict(x_test_f)
print(classification_report(y_test_f,y_pred_f))
# print(accuracy_score(y_test_f, y_pred_f))

              precision    recall  f1-score   support

           0       0.74      0.52      0.61        44
           1       0.68      0.61      0.64        28
           2       0.85      0.87      0.86      1193
           3       0.75      0.70      0.73       578
           4       0.70      0.67      0.69       387
           5       0.73      0.54      0.62       182
           6       0.74      0.79      0.76      1744
           7       0.63      0.67      0.65      1261
           8       0.60      0.51      0.55       807

    accuracy                           0.72      6224
   macro avg       0.71      0.65      0.68      6224
weighted avg       0.72      0.72      0.72      6224



In [None]:
filename = 'disorder_subclass_full.sav'
pickle.dump(model_f, open(filename, 'wb'))