In [26]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder as OHE, LabelEncoder as LE, OrdinalEncoder as OE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.impute import KNNImputer

In [27]:
# Load the training split; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/train.csv')

In [28]:
# Peek at the first five rows to sanity-check column names and value formats.
data.head(n=5)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,3850.0,D-penicillamine,17841.0,M,N,N,N,N,1.8,,3.7,33.0,1268.0,99.0,,338.0,10.2,3.0,D
1,1,2157.0,Placebo,15628.0,F,N,N,N,N,0.5,303.0,4.09,11.0,657.0,77.5,80.0,309.0,9.9,3.0,C
2,2,4427.0,Placebo,20392.0,F,N,N,N,N,0.8,,3.6,13.0,10396.8,128.65,,164.0,11.0,2.0,C
3,3,2149.0,,13514.0,F,,,,N,0.6,,4.19,,,,,248.0,10.5,2.0,C
4,4,2202.0,,23741.0,F,,,,N,0.6,,3.14,,,,,76.0,10.3,4.0,C


In [29]:
# Per-column count of missing values (isnull is an alias of isna).
data.isnull().sum()

id                  0
N_Days              1
Drug             6488
Age                 0
Sex                 0
Ascites          6482
Hepatomegaly     6496
Spiders          6498
Edema               0
Bilirubin           0
Cholesterol      8353
Albumin             0
Copper           6600
Alk_Phos         6495
SGOT             6502
Tryglicerides    8393
Platelets         540
Prothrombin        37
Stage               0
Status              0
dtype: int64

In [30]:
# Print dtypes and non-null counts for every column of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             15000 non-null  int64  
 1   N_Days         14999 non-null  float64
 2   Drug           8512 non-null   object 
 3   Age            15000 non-null  float64
 4   Sex            15000 non-null  object 
 5   Ascites        8518 non-null   object 
 6   Hepatomegaly   8504 non-null   object 
 7   Spiders        8502 non-null   object 
 8   Edema          15000 non-null  object 
 9   Bilirubin      15000 non-null  float64
 10  Cholesterol    6647 non-null   float64
 11  Albumin        15000 non-null  float64
 12  Copper         8400 non-null   float64
 13  Alk_Phos       8505 non-null   float64
 14  SGOT           8498 non-null   float64
 15  Tryglicerides  6607 non-null   float64
 16  Platelets      14460 non-null  float64
 17  Prothrombin    14963 non-null  float64
 18  Stage 

In [31]:
# Column -> median lookup used as the fill value for missing numeric entries.
# The medians are taken from the training frame `data`.
mapper = {
    column: data[column].median()
    for column in (
        'N_Days',
        'Cholesterol',
        'Copper',
        'Alk_Phos',
        'SGOT',
        'Tryglicerides',
        'Platelets',
        'Prothrombin',
    )
}

In [32]:
# Fill the numeric columns listed in `mapper` with their medians; columns that
# are not keys of `mapper` keep their NaNs, hence "partly" cleaned.
data_partly_cleaned = data.fillna(value=mapper)

In [33]:
# Numeric feature block: drop the categorical columns, the target and the row
# id, then standardise what is left. The scaler object itself is not kept.
X = StandardScaler().fit_transform(
    data_partly_cleaned.drop(
        ['Drug', 'Hepatomegaly', 'Spiders', 'Edema', 'Sex', 'Ascites', 'Status', 'id'],
        axis='columns',
    )
)

In [34]:
# Categorical feature block, built in three fitted stages that are kept in
# `oe_obj`, `knn` and `ohe_obj`:
#   1. ordinal-encode the six categorical columns,
#   2. fill the remaining gaps in the encoded matrix with a KNN imputer,
#   3. one-hot encode the imputed codes.
oe_obj = OE()
oe = oe_obj.fit_transform(data_partly_cleaned[['Drug', 'Hepatomegaly', 'Spiders', 'Edema', 'Sex', 'Ascites']])
knn = KNNImputer()
# NOTE(review): KNNImputer fills a gap from neighbouring rows' codes, so imputed
# entries are presumably fractional (e.g. 0.4) rather than valid category codes.
oe_imputed = knn.fit_transform(oe)
ohe_obj = OHE()
# NOTE(review): each distinct imputed value becomes its own one-hot column; the
# recorded shapes (12 numeric + 27 one-hot for 6 features) are consistent with
# that. Consider rounding the imputed codes - TODO confirm, and change the
# test-set cell together with this one.
ohe = ohe_obj.fit_transform(oe_imputed).toarray()
# Final design matrix: scaled numeric columns followed by the one-hot columns.
x_imputed = np.column_stack([X, ohe])

In [35]:
# Shape of the combined design matrix (rows, numeric + one-hot columns).
x_imputed.shape

(15000, 39)

In [36]:
# Display the dense one-hot matrix built from the imputed categorical codes.
ohe

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# Compare the numeric block with the ordinal-encoded block (before one-hot).
X.shape, oe.shape

((15000, 12), (15000, 6))

In [38]:
# Shape of the combined design matrix (same check as the earlier shape cell).
x_imputed.shape

(15000, 39)

In [39]:
# Integer-encode the target. The encoder is not kept; LabelEncoder numbers the
# classes in sorted order, which the submission column order relies on.
y = LE().fit_transform(data_partly_cleaned['Status'])

In [40]:
# Stratified 70/30 hold-out split. A fixed random_state makes the split (and
# therefore every metric reported below) reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    x_imputed,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [41]:
import tensorflow as tf
from tensorflow import keras

In [42]:
# Fully connected classifier: tanh entry layer, three LeakyReLU hidden layers of
# shrinking width, and a 3-way softmax head.
# NOTE(review): newer Keras releases may prefer `negative_slope` over `alpha`
# for LeakyReLU - confirm against the installed version.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Input(shape=(xtrain.shape[1],)),
        tf.keras.layers.Dense(128, activation='tanh'),
    ]
    + [
        # A fresh LeakyReLU instance per layer, as in a hand-written stack.
        tf.keras.layers.Dense(width, activation=tf.keras.layers.LeakyReLU(alpha=0.01))
        for width in (64, 32, 16)
    ]
    + [tf.keras.layers.Dense(3, activation='softmax')]
)



In [43]:
# Integer class labels + softmax outputs -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)

In [44]:
# Train for five passes over the training split (default batch size).
model.fit(x=xtrain, y=ytrain, epochs=5)

Epoch 1/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7963 - loss: 0.5436
Epoch 2/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8368 - loss: 0.4276
Epoch 3/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8395 - loss: 0.4090
Epoch 4/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8402 - loss: 0.4041
Epoch 5/5
[1m329/329[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8380 - loss: 0.4108


<keras.src.callbacks.history.History at 0x20e2dcc22d0>

In [45]:
# Loss and accuracy on the held-out 30% split.
model.evaluate(x=xtest, y=ytest)

[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8415 - loss: 0.4217


[0.4361327290534973, 0.8351110816001892]

In [46]:
# Score-time preprocessing. Every transformer used here must carry the
# statistics learned from the TRAINING data; re-fitting on the test set would
# scale and encode the rows differently from what the model was trained on and
# could even change the number or order of one-hot columns.
test = pd.read_csv('./data/test.csv')
# Fill numeric gaps with the training medians stored in `mapper`.
test_partly_cleaned = test.fillna(mapper)

categorical_cols = ['Drug', 'Hepatomegaly', 'Spiders', 'Edema', 'Sex', 'Ascites']

# The training scaler was not kept (and `X` now holds the scaled array), so
# rebuild the raw numeric training frame and fit an identical scaler on it.
train_numeric = data_partly_cleaned.drop(columns=categorical_cols + ['Status', 'id'])
train_scaler = StandardScaler().fit(train_numeric)
# Select by the training column list so the column order is guaranteed to match.
X_test = train_scaler.transform(test_partly_cleaned[train_numeric.columns])

# Reuse the encoders/imputer fitted on the training data (transform only).
# Separate *_test names keep the training arrays `oe`, `oe_imputed`, `ohe` intact.
oe_test = oe_obj.transform(test_partly_cleaned[categorical_cols])
oe_imputed_test = knn.transform(oe_test)

# An imputed code that never occurred in training is encoded as all-zeros for
# that feature instead of aborting the whole prediction run.
ohe_obj.set_params(handle_unknown='ignore')
ohe_test = ohe_obj.transform(oe_imputed_test).toarray()

x_imputed_test = np.column_stack([X_test, ohe_test])

# The model was built for exactly this many input features.
assert x_imputed_test.shape[1] == xtrain.shape[1], (
    f"test features: {x_imputed_test.shape[1]}, model expects: {xtrain.shape[1]}"
)

In [47]:
# Row/column count of the median-filled test frame.
test_partly_cleaned.shape

(10000, 19)

In [48]:
# Training matrix shape; its column count is the model's expected input width.
xtrain.shape

(10500, 39)

In [49]:
# Predict on the preprocessed test rows. Despite the name, `logits` holds the
# outputs of the model's final softmax layer (one column per class).
logits = model.predict(x_imputed_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


In [51]:
# Assemble the submission: one row per test id with one column per class.
# Stacking the ids next to the float predictions turns them into floats, so the
# id column is overwritten with the original integer ids afterwards.
sub = pd.DataFrame(data=np.column_stack([test_partly_cleaned['id'], logits]), columns=['id', 'C', 'CL', 'D'])
sub['id'] = test['id']
# index=False keeps the DataFrame's row index out of the file; otherwise it is
# written as an extra unnamed first column.
sub.to_csv('./submission2.csv', index=False)