# MVP First Model

Based on https://www.kaggle.com/code/thecansin/genomes-and-genetic-disorders-knn-78 which predicted the patients alive or deceased status using the same dataset.

In [156]:
import pandas as pd
train = pd.read_csv("./train.csv", encoding='utf-8')

In [157]:
# Checkout the data

# train.head()
# train.tail()
# train.columns

In [158]:
# Drop fields without a probable relation to the target.
train.drop("Patient Id", axis=1, inplace=True)
train.drop("Family Name", axis=1, inplace=True)
train.drop("Patient First Name", axis=1, inplace=True)
train.drop("Father's name", axis=1, inplace=True)
train.drop("Institute Name", axis=1, inplace=True)
train.drop("Location of Institute", axis=1, inplace=True)
train.drop("Place of birth", axis=1, inplace=True)

In [159]:
# This shows there is some missing values across most fields.
# For better results, we need to fill these in.

# train.isna().sum()

In [160]:
# Here, the solution was to fill with the mode
train["Patient Age"].fillna(str(train["Patient Age"].mode().values[0]),inplace=True)
train["Inherited from father"].fillna(str(train["Inherited from father"].mode().values[0]),inplace=True)
train["Maternal gene"].fillna(str(train["Maternal gene"].mode().values[0]),inplace=True)
train["Mother's age"].fillna(str(train["Mother's age"].mode().values[0]),inplace=True)
train["Father's age"].fillna(str(train["Father's age"].mode().values[0]),inplace=True)
train["Respiratory Rate (breaths/min)"].fillna(str(train["Respiratory Rate (breaths/min)"].mode().values[0]),inplace=True)
train["Heart Rate (rates/min"].fillna(str(train["Heart Rate (rates/min"].mode().values[0]),inplace=True)
train["Test 1"].fillna(str(train["Test 1"].mode().values[0]),inplace=True)
train["Test 2"].fillna(str(train["Test 2"].mode().values[0]),inplace=True)
train["Test 3"].fillna(str(train["Test 3"].mode().values[0]),inplace=True)
train["Test 4"].fillna(str(train["Test 4"].mode().values[0]),inplace=True)
train["Test 5"].fillna(str(train["Test 5"].mode().values[0]),inplace=True)
train["Parental consent"].fillna(str(train["Parental consent"].mode().values[0]),inplace=True)
train["Follow-up"].fillna(str(train["Follow-up"].mode().values[0]),inplace=True)
train["Gender"].fillna(str(train["Gender"].mode().values[0]),inplace=True)
train["Birth asphyxia"].fillna(str(train["Birth asphyxia"].mode().values[0]),inplace=True)
train["Autopsy shows birth defect (if applicable)"].fillna(str(train["Autopsy shows birth defect (if applicable)"].mode().values[0]),inplace=True)
train["Folic acid details (peri-conceptional)"].fillna(str(train["Folic acid details (peri-conceptional)"].mode().values[0]),inplace=True)
train["H/O serious maternal illness"].fillna(str(train["H/O serious maternal illness"].mode().values[0]),inplace=True)
train["H/O radiation exposure (x-ray)"].fillna(str(train["H/O radiation exposure (x-ray)"].mode().values[0]),inplace=True)
train["H/O substance abuse"].fillna(str(train["H/O substance abuse"].mode().values[0]),inplace=True)
train["Assisted conception IVF/ART"].fillna(str(train["Assisted conception IVF/ART"].mode().values[0]),inplace=True)
train["History of anomalies in previous pregnancies"].fillna(str(train["History of anomalies in previous pregnancies"].mode().values[0]),inplace=True)
train["No. of previous abortion"].fillna(str(train["No. of previous abortion"].mode().values[0]),inplace=True)
train["Birth defects"].fillna(str(train["Birth defects"].mode().values[0]),inplace=True)
train["White Blood cell count (thousand per microliter)"].fillna(str(train["White Blood cell count (thousand per microliter)"].mode().values[0]),inplace=True)
train["Blood test result"].fillna(str(train["Blood test result"].mode().values[0]),inplace=True)
train["Symptom 1"].fillna(str(train["Symptom 1"].mode().values[0]),inplace=True)
train["Symptom 2"].fillna(str(train["Symptom 2"].mode().values[0]),inplace=True)
train["Symptom 3"].fillna(str(train["Symptom 3"].mode().values[0]),inplace=True)
train["Symptom 4"].fillna(str(train["Symptom 4"].mode().values[0]),inplace=True)
train["Symptom 5"].fillna(str(train["Symptom 5"].mode().values[0]),inplace=True)

# TODO: This should be changed to drop null targets?
train["Genetic Disorder"].fillna(str(train["Genetic Disorder"].mode().values[0]),inplace=True)
# train["Disorder Subclass"].fillna(str(train["Disorder Subclass"].mode().values[0]),inplace=True)
train = train[train['Disorder Subclass'].isna() == False]
# train.drop("Genetic Disorder", axis=1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Patient Age"].fillna(str(train["Patient Age"].mode().values[0]),inplace=True)
  train["Patient Age"].fillna(str(train["Patient Age"].mode().values[0]),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Inherited from father"].fillna(str(train["Inherited f

In [161]:
# train.isna().sum()

In [162]:
# Check there are no more null values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19915 entries, 0 to 22082
Data columns (total 38 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Patient Age                                       19915 non-null  object 
 1   Genes in mother's side                            19915 non-null  object 
 2   Inherited from father                             19915 non-null  object 
 3   Maternal gene                                     19915 non-null  object 
 4   Paternal gene                                     19915 non-null  object 
 5   Blood cell count (mcL)                            19915 non-null  float64
 6   Mother's age                                      19915 non-null  object 
 7   Father's age                                      19915 non-null  object 
 8   Status                                            19915 non-null  object 
 9   Respiratory Rate (brea

In [163]:
# TODO: Instead of the mode: Could use KNN to predict the missing values using rows that aren't null?

train["Genes in mother's side"]=[1 if i.strip()== "Yes" else 0 for i in train["Genes in mother's side"]]
train["Inherited from father"]=[1 if i.strip()== "Yes" else 0 for i in train["Inherited from father"]]
train["Maternal gene"]=[1 if i.strip()== "Yes" else 0 for i in train["Maternal gene"]]
train["Paternal gene"]=[1 if i.strip()== "Yes" else 0 for i in train["Paternal gene"]]
train["Parental consent"]=[1 if i.strip()== "Yes" else 0 for i in train["Parental consent"]]
train["Birth asphyxia"]=[1 if i.strip()== "Yes" else 0 for i in train["Birth asphyxia"]]
train["Folic acid details (peri-conceptional)"]=[1 if i.strip()== "Yes" else 0 for i in train["Folic acid details (peri-conceptional)"]]
train["H/O radiation exposure (x-ray)"]=[1 if i.strip()== "Yes" else 0 for i in train["H/O radiation exposure (x-ray)"]]
train["H/O substance abuse"]=[1 if i.strip()== "Yes" else 0 for i in train["H/O substance abuse"]]
train["Assisted conception IVF/ART"]=[1 if i.strip()== "Yes" else 0 for i in train["Assisted conception IVF/ART"]]
train["History of anomalies in previous pregnancies"]=[1 if i.strip()== "Yes" else 0 for i in train["History of anomalies in previous pregnancies"]]
train["H/O serious maternal illness"]=[1 if i.strip()=="Yes" else 0 for i in train["H/O serious maternal illness"]]

#Alive':1 'Deceased:0'
train["Status"]=[1 if i.strip()== "Alive" else 0 for i in train["Status"]]
#Normal (30-60):1' 'Tachypnea:0
train["Respiratory Rate (breaths/min)"]=[1 if i.strip()== "Normal (30-60)" else 0 for i in train["Respiratory Rate (breaths/min)"]]
#Normal:1' 'Tachycardia:0
train["Heart Rate (rates/min"]=[1 if i.strip()== "Normal" else 0 for i in train["Heart Rate (rates/min"]]
#High:1, Low:0
train["Follow-up"]=[1 if i.strip()== "High" else 0 for i in train["Follow-up"]]
#['Singular' 'Multiple']
train["Birth defects"]=[1 if i.strip()== "Singular" else 0 for i in train["Birth defects"]]

#1: male 0: female 2: ambiguous    
train["Gender"]=[1 if i.strip()== "Male" else 0 if i.strip() == "Female" else 2 for i in train["Gender"]]

train["Autopsy shows birth defect (if applicable)"]=[1 if i.strip()== "Yes" else 0 if i.strip() == "No" else 2 if i.strip()=="None" else 3 for i in train["Autopsy shows birth defect (if applicable)"]]

train["Blood test result"]=[1 if i.strip()== "slightly abnormal" else 0 if i.strip() == "normal" else 2 if i.strip()=="inconclusive" else 3 for i in train["Blood test result"]]

# TODO: The disorder itself can still be text based?
# TODO: Can get rid of the subclass?
# TODO: What is the relation between these.
train["Genetic Disorder"]=[1 if i.strip() == "Mitochondrial genetic inheritance disorders" else 0 if i.strip() == "Multifactorial genetic inheritance disorders" else 2 for i in train["Genetic Disorder"]]
#Leber's hereditary optic neuropathy:1 
#Cystic fibrosis:0
#Diabetes:2
#Leigh syndrome:3
#Cancer:4
#Tay-Sachs:5
#Hemochromatosis:6
#Mitochondrial myopathy:7
#Alzheimer's:8
train["Disorder Subclass"]=[1 if i.strip()== "Leber's hereditary optic neuropathy" 
                              else 0 if i.strip() == "Cystic fibrosis" 
                               else 2 if i.strip()=="Diabetes" 
                               else 3 if i.strip()=="Leigh syndrome"
                               else 4 if i.strip()=="Cancer"
                               else 5 if i.strip()=="Tay-Sachs"
                               else 6 if i.strip()=="Hemochromatosis"
                               else 7 if i.strip()=="Mitochondrial myopathy"
                               else 8 for i in train["Disorder Subclass"]]

train = train.apply(pd.to_numeric,downcast="float")

# train.head()

In [164]:
# Check all the fields are now floats
# train.info()


In [165]:
# Optionally we could:

# ?? One Percentage less accurate?

# train["sum of Mother's and fathers age avg"]=(train["Mother's age"]+train["Father's age"]) / 2
# train["total symptom"]=(train["Symptom 1"]+train["Symptom 2"]+train["Symptom 3"]+train["Symptom 4"]+train["Symptom 5"]) / 5


In [166]:
# TODO: This is predicting alive or dead. I need to predict diseased or not.

from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors = 3)
x,y = train.loc[:,train.columns != 'Status'], train.loc[:,'Status']
# knn.fit(x,y)
# prediction = knn.predict(x)
# print('Prediction: {}'.format(prediction))

# train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)
knn = KNeighborsClassifier(n_neighbors = 23)
x,y = train.loc[:,train.columns != 'Status'], train.loc[:,'Status']
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
#print('Prediction: {}'.format(prediction))
print('With KNN (K=3) status accuracy is: ',knn.score(x_test,y_test)) # accuracy

With KNN (K=3) status accuracy is:  0.7163179916317992


## Now we have cleaned data: Let us test classifiers on it.

In [167]:
train.describe()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
count,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,...,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0,19915.0
mean,6.765353,0.595933,0.391162,0.604921,0.433191,4.898917,31.532312,36.317348,0.502385,0.550389,...,0.547075,7.059959,1.446548,0.631333,0.594225,0.579161,0.453628,0.41893,1.253678,3.740698
std,4.238961,0.490723,0.488023,0.48888,0.495529,0.199735,9.862125,14.795049,0.500007,0.497467,...,0.497791,2.845945,1.068833,0.482455,0.491054,0.493706,0.497857,0.493396,0.614691,2.479392
min,0.0,0.0,0.0,0.0,0.0,4.092727,18.0,20.0,0.0,0.0,...,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,0.0,0.0,4.763367,23.0,20.0,0.0,0.0,...,0.0,4.632952,1.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
50%,6.0,1.0,0.0,1.0,0.0,4.899457,28.0,34.0,1.0,1.0,...,1.0,7.092109,1.0,1.0,1.0,1.0,0.0,0.0,1.0,3.0
75%,10.0,1.0,1.0,1.0,1.0,5.033677,40.0,49.0,1.0,1.0,...,1.0,9.287531,2.0,1.0,1.0,1.0,1.0,1.0,2.0,6.0
max,14.0,1.0,1.0,1.0,1.0,5.609829,51.0,64.0,1.0,1.0,...,1.0,12.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,8.0


In [168]:
# TODO: Drop the subclass until I understand it more.
# Done above now.
# train.drop("Disorder Subclass", axis=1, inplace=True)
# train.describe()



In [169]:
output_set = train['Disorder Subclass']
# output_set

In [170]:
input_set = train.drop(columns=['Disorder Subclass'])
# input_set

In [171]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
X_train, X_test, Y_train, Y_test = train_test_split(input_set, output_set, test_size=0.2)
M = DecisionTreeClassifier()
M.fit(X_train.values, Y_train) # x.values because: https://stackoverflow.com/questions/69326639/sklearn-warning-valid-feature-names-in-version-1-0
predictions = M.predict(X_test)
accuracy_score(Y_test, predictions) # Produces diff each time as the train_test_split splitting is random each time.



0.5628922922420286

In [172]:
from sklearn.neighbors import KNeighborsClassifier
# knn = KNeighborsClassifier(n_neighbors = 3)
x,y = train.loc[:,train.columns != 'Disorder Subclass'], train.loc[:,'Disorder Subclass']
# knn.fit(x,y)
# prediction = knn.predict(x)
# print('Prediction: {}'.format(prediction))

# train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3,random_state = 1)
knn = KNeighborsClassifier(n_neighbors = 1500)
x,y = train.loc[:,train.columns != 'Disorder Subclass'], train.loc[:,'Disorder Subclass']
knn.fit(x_train,y_train)
prediction = knn.predict(x_test)
#print('Prediction: {}'.format(prediction))
print('With KNN (K=3) disorder accuracy is: ',knn.score(x_test,y_test)) # accuracy
accuracy_score(y_test, prediction)
# y_train


With KNN (K=3) disorder accuracy is:  0.25573221757322173


0.25573221757322173

In [173]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
model = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
model.fit(x_train, y_train)

#Predict the response for test dataset
y_pred = model.predict(X_test)

accuracy_score(y_test, prediction)


0.25573221757322173

In [176]:
accuracy_score(Y_test, y_pred)
# y_pred



0.6886768767260859