In [339]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score

In [340]:
df_train = pd.read_csv("train.csv")

In [341]:
df_train.drop(["Patient Id","Patient First Name","Family Name","Father's name","Location of Institute","Institute Name","Place of birth",'Test 1',
       'Test 2', 'Test 3', 'Test 4', 'Test 5', 'Parental consent'],inplace=True,axis=1)

In [342]:
df_train.drop(columns=['Autopsy shows birth defect (if applicable)'],inplace=True,axis=1)

In [343]:
df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("No record",np.nan)
df_train["Birth asphyxia"] = df_train["Birth asphyxia"].replace("Not available",np.nan)

df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("Not applicable",np.nan)
df_train["H/O radiation exposure (x-ray)"] = df_train["H/O radiation exposure (x-ray)"].replace("-",np.nan)

df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("Not applicable",np.nan)
df_train["H/O substance abuse"] = df_train["H/O substance abuse"].replace("-",np.nan)

In [344]:
disorder_map = {
        "Leber's hereditary optic neuropathy": "Mitochondrial genetic inheritance disorders",
        "Leigh syndrome": "Mitochondrial genetic inheritance disorders",
        "Mitochondrial myopathy": "Mitochondrial genetic inheritance disorders",
        "Alzheimer's": "Multifactorial genetic inheritance disorders",
        "Cancer": "Multifactorial genetic inheritance disorders",
        "Diabetes": "Multifactorial genetic inheritance disorders",
        "Cystic fibrosis": "Single-gene inheritance diseases",
        "Hemochromatosis": "Single-gene inheritance diseases",
        "Tay-Sachs": "Single-gene inheritance diseases",
}

df_train["Genetic Disorder"] = df_train.apply(
    lambda row: disorder_map[row["Disorder Subclass"]]
    if pd.isnull(row["Genetic Disorder"]) and row["Disorder Subclass"] in disorder_map
    else row["Genetic Disorder"], axis=1
)

df_train = df_train.dropna(subset=['Disorder Subclass']).reset_index(drop=True)
df_train.drop(columns=['Genetic Disorder'],inplace=True,axis=1)

In [345]:
df_train['Total Blood Cell Count'] = df_train['Blood cell count (mcL)'] + df_train['White Blood cell count (thousand per microliter)']
df_train['Total Symptoms'] = df_train['Symptom 1'] + df_train['Symptom 2'] + df_train['Symptom 3'] + df_train['Symptom 4'] + df_train['Symptom 5']

In [346]:
x_full=df_train
y_full=df_train[['Disorder Subclass']]
le = LabelEncoder()
y_full['Disorder Subclass'] = le.fit_transform(y_full['Disorder Subclass'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_full['Disorder Subclass'] = le.fit_transform(y_full['Disorder Subclass'])


In [347]:
x_train_f, x_test_f, y_train_f, y_test_f = train_test_split(x_full, y_full, test_size = 0.3, random_state = 0, stratify=y_full)

In [348]:
x_train_f["Gender"].value_counts()

Gender
Male         4052
Ambiguous    4027
Female       3959
Name: count, dtype: int64

In [349]:
categorical_cols = x_full.select_dtypes(include=['object']).columns.tolist()

one_hot_cols = []
label_encode_cols = [col for col in categorical_cols if col not in one_hot_cols]

label_encoders = {}
for col in label_encode_cols:
    le = LabelEncoder()
    x_train_f[col] = le.fit_transform(x_train_f[col])
    print(x_train_f[col].value_counts())
    x_test_f[col] = le.transform(x_test_f[col])
    label_encoders[col] = le

# x_train_f = pd.get_dummies(x_train_f, columns=one_hot_cols)
# x_test_f = pd.get_dummies(x_test_f, columns=one_hot_cols)

Genes in mother's side
1    7908
0    5352
Name: count, dtype: int64
Inherited from father
0    7914
1    5173
2     173
Name: count, dtype: int64
Maternal gene
1    6441
0    5189
2    1630
Name: count, dtype: int64
Paternal gene
0    7522
1    5738
Name: count, dtype: int64
Status
0    6683
1    6577
Name: count, dtype: int64
Respiratory Rate (breaths/min)
0    6092
1    5916
2    1252
Name: count, dtype: int64
Heart Rate (rates/min
0    6179
1    5875
2    1206
Name: count, dtype: int64
Follow-up
1    6038
0    5951
2    1271
Name: count, dtype: int64
Gender
2    4052
0    4027
1    3959
3    1222
Name: count, dtype: int64
Birth asphyxia
2    7257
1    3094
0    2909
Name: count, dtype: int64
Folic acid details (peri-conceptional)
1    6127
0    5906
2    1227
Name: count, dtype: int64
H/O serious maternal illness
0    6060
1    5965
2    1235
Name: count, dtype: int64
H/O radiation exposure (x-ray)
2    7229
0    3040
1    2991
Name: count, dtype: int64
H/O substance abuse
2    721

In [350]:
def fill_missing_values(group):
    group['Birth defects'].fillna(0, inplace=True)
    for column in group.columns:
        if group[column].isnull().any():
            if group[column].dtype == 'object' and not group[column].mode().empty:
                fill_value = group[column].mode()[0]
            else:
                fill_value = group[column].median()
            group[column] = group[column].fillna(fill_value)
    return group

x_train_f = (
    x_train_f.groupby(["Disorder Subclass"], group_keys=False)
    .apply(fill_missing_values)
    .reset_index(drop=True)
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  group['Birth defects'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  group['Birth defects'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [351]:
x_train_f.drop(columns=['Disorder Subclass'],inplace=True,axis=1)
x_test_f.drop(columns=['Disorder Subclass'],inplace=True,axis=1)

In [352]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

def knn_imputation(train, test, k=7):
    imputer = KNNImputer(n_neighbors=k)
    scaler = StandardScaler()

    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)

    train_imputed = imputer.fit_transform(train_scaled)
    test_imputed = imputer.transform(test_scaled)

    train_imputed = pd.DataFrame(train_imputed, columns=train.columns)
    test_imputed = pd.DataFrame(test_imputed, columns=test.columns)

    return train_imputed, test_imputed

In [353]:
x_train_f, x_test_f = knn_imputation(x_train_f, x_test_f, k=7)

In [354]:
# selection=["Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Folic acid details (peri-conceptional)', 'White Blood cell count (thousand per microliter)', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Total Blood Cell Count', 'Total Symptoms', 'Gender_Ambiguous', 'Gender_Female', 'Gender_Male']
selection=["Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Folic acid details (peri-conceptional)', 'White Blood cell count (thousand per microliter)', 'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Total Blood Cell Count', 'Total Symptoms', 'Gender']

In [355]:
x_train_f=x_train_f[selection]
x_test_f=x_test_f[selection]

In [356]:
for i in x_train_f.columns:
    print(i)

Genes in mother's side
Inherited from father
Maternal gene
Paternal gene
Folic acid details (peri-conceptional)
White Blood cell count (thousand per microliter)
Symptom 1
Symptom 2
Symptom 3
Symptom 4
Symptom 5
Total Blood Cell Count
Total Symptoms
Gender


In [357]:
model_f=LogisticRegression(
    penalty= 'l2',
    C= 1,
    solver= 'saga',
    max_iter= 1000,
    tol= 1e-10,
    fit_intercept= True,
    class_weight= None,
    warm_start= True
)

model_f.fit(x_train_f, y_train_f)

  y = column_or_1d(y, warn=True)


In [358]:
y_pred_f=model_f.predict(x_test_f)
print(classification_report(y_test_f,y_pred_f))

recall_score(y_test_f, y_pred_f, average='weighted')

              precision    recall  f1-score   support

           0       1.00      0.05      0.09        44
           1       0.40      0.14      0.21        28
           2       0.46      0.56      0.50       983
           3       0.40      0.14      0.20       523
           4       0.43      0.33      0.38       387
           5       0.39      0.28      0.33       182
           6       0.43      0.56      0.49      1468
           7       0.38      0.45      0.41      1261
           8       0.38      0.22      0.28       807

    accuracy                           0.42      5683
   macro avg       0.47      0.30      0.32      5683
weighted avg       0.42      0.42      0.40      5683



np.float64(0.4170332570825268)

In [359]:
filename = "disorder_subclass_final.sav"
pickle.dump(model_f, open(filename, 'wb'))