# Predict Genetic Disorders

In [134]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif, SelectFromModel, RFE
from sklearn.preprocessing import LabelEncoder, RobustScaler, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [135]:
df_train = pd.read_csv('train.csv')

In [136]:
df_train.drop(["Patient Id","Patient First Name","Family Name","Father's name","Location of Institute","Institute Name","Place of birth",'Test 1',
       'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent'],inplace=True,axis=1)

df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("No record",np.nan)
df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("Not available",np.nan)

df_train["Autopsy shows birth defect (if applicable)"] = df_train["Autopsy shows birth defect (if applicable)"].replace("Not applicable",np.nan)

df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("Not applicable",np.nan)
df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("-",np.nan)

df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("Not applicable",np.nan)
df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("-",np.nan)

In [137]:
disorder_map = {
        "Leber's hereditary optic neuropathy": "Mitochondrial genetic inheritance disorders",
        "Leigh syndrome": "Mitochondrial genetic inheritance disorders",
        "Mitochondrial myopathy": "Mitochondrial genetic inheritance disorders",
        "Alzheimer's": "Multifactorial genetic inheritance disorders",
        "Cancer": "Multifactorial genetic inheritance disorders",
        "Diabetes": "Multifactorial genetic inheritance disorders",
        "Cystic fibrosis": "Single-gene inheritance diseases",
        "Hemochromatosis": "Single-gene inheritance diseases",
        "Tay-Sachs": "Single-gene inheritance diseases",
}

df_train["Genetic Disorder"] = df_train.apply(
    lambda row: disorder_map[row["Disorder Subclass"]]
    if pd.isnull(row["Genetic Disorder"]) and row["Disorder Subclass"] in disorder_map
    else row["Genetic Disorder"], axis=1
)

for subclass in df_train['Disorder Subclass'].dropna().unique():
    mode_genetic_disorder = df_train[df_train['Disorder Subclass'] == subclass]['Genetic Disorder'].mode()
    if not mode_genetic_disorder.empty:
        df_train.loc[
            (df_train['Disorder Subclass'] == subclass) & (df_train['Genetic Disorder'].isnull()),
            'Genetic Disorder'
        ] = mode_genetic_disorder[0]

for disorder in df_train['Genetic Disorder'].dropna().unique():
    mode_disorder_subclass = df_train[df_train['Genetic Disorder'] == disorder]['Disorder Subclass'].mode()
    if not mode_disorder_subclass.empty:
        df_train.loc[
            (df_train['Genetic Disorder'] == disorder) & (df_train['Disorder Subclass'].isnull()),
            'Disorder Subclass'
        ] = mode_disorder_subclass[0]

In [138]:
df_train['Total Blood Cell Count'] = df_train['Blood cell count (mcL)'] + df_train['White Blood cell count (thousand per microliter)']

In [139]:
df_train = df_train.dropna(subset=["Genetic Disorder", "Disorder Subclass"])

In [140]:
df_train = (
    df_train.groupby(["Genetic Disorder", "Disorder Subclass"])
    .apply(
        lambda group: group.apply(
            lambda column: column.fillna(
                column.mode()[0] if column.dtype == 'object' and not column.mode().empty
                else column.median()
            )
            if column.isnull().any()
            else column
        )
    )
    .reset_index(drop=True)
)

  .apply(


In [141]:
numerical_cols = df_train.select_dtypes(exclude=["object"]).columns
categorical_cols = df_train.select_dtypes(include=["object"]).columns

In [142]:
df_train['Disorder Subclass'].value_counts()

Disorder Subclass
Leigh syndrome                         5813
Mitochondrial myopathy                 4202
Cystic fibrosis                        3977
Tay-Sachs                              2690
Diabetes                               1925
Hemochromatosis                        1291
Leber's hereditary optic neuropathy     607
Alzheimer's                             148
Cancer                                   92
Name: count, dtype: int64

In [143]:
label_encoder = LabelEncoder()
for col in categorical_cols:
    df_train[col] = label_encoder.fit_transform(df_train[col])

In [144]:
df_train['Disorder Subclass'].value_counts()

Disorder Subclass
6    5813
7    4202
2    3977
8    2690
3    1925
4    1291
5     607
0     148
1      92
Name: count, dtype: int64

In [145]:
simple_model=['Genes in mother\'s side', 'Inherited from father',
       'Maternal gene', 'Paternal gene', 'Autopsy shows birth defect (if applicable)','H/O substance abuse',
              'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']

In [146]:
x_simple=df_train[simple_model]
y_simple=df_train[['Disorder Subclass']]

# Simple Model Prediction

In [147]:
x_train_s  , x_test_s , y_train_s, y_test_s = train_test_split (x_simple ,y_simple ,test_size = 0.3 , random_state = 0,stratify=y_simple)

base_models_s = [
    ('logistic', Pipeline([
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(penalty='l1', solver='liblinear', C=0.5, max_iter=1000))
    ])),
]

model_s = StackingClassifier(
    estimators=base_models_s,
    final_estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    passthrough=True
)
# Best Parameters: {'final_estimator__learning_rate': 0.2, 'final_estimator__max_depth': 3, 'final_estimator__n_estimators': 50, 'final_estimator__subsample': 1.0}
# Best F1 Score: 0.5329823181791176
model_s.fit(x_train_s, y_train_s)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
Parameters: { "use_label_encoder" } are not used.



In [148]:
y_pred_s=model_s.predict(x_test_s)
# print(classification_report(y_test_s,y_pred_s))
print(accuracy_score(y_test_s, y_pred_s))

0.5400064267352185


In [149]:
filename = 'disorder_subclass_simple.sav'
pickle.dump(model_s, open(filename, 'wb'))