In [2]:
import pandas as pd 
import numpy as np


from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer

import xgboost as xgb

### Training data preparation

In [3]:
# Read the training split; the path is relative to the notebook's location.
DATA_PATH = '../data/train.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,999,D-penicillamine,21532,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D
1,1,2574,Placebo,19237,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C
2,2,3428,Placebo,13727,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D
3,3,2576,Placebo,18460,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C
4,4,788,Placebo,16658,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C


In [4]:
# These columns are loaded as floats (e.g. 316.0, 3.0); store them as int64.
for int_col in ('Cholesterol', 'Copper', 'Tryglicerides', 'Platelets', 'Stage'):
    df[int_col] = df[int_col].astype('int64')

In [5]:
# Integer-encode every column of the frame with a LabelEncoder. The single
# `le` instance is refit on each column in turn, and numeric columns are
# encoded as well (each value becomes the index of its sorted unique value).
# NOTE(review): the test frame is encoded by a separate fit further down, so
# the codes of numeric columns are not guaranteed to agree between train and
# test — confirm this is intended.
le = LabelEncoder()
df = df.apply(lambda column: le.fit_transform(column))
df.head(n=5)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,0,97,0,305,1,0,0,0,0,20,97,69,123,227,162,13,195,7,2,2
1,1,350,1,228,0,0,0,0,0,6,128,87,52,210,121,38,182,20,2,0
2,2,414,1,57,0,0,1,1,2,30,87,88,105,130,102,4,68,27,3,2
3,3,351,1,201,0,0,0,0,0,3,56,83,48,237,30,46,119,17,2,0
4,4,69,1,138,0,0,1,0,0,8,118,98,52,174,109,46,140,16,3,0


Encoded values of the `Status` target:

- C -> 0
- CL -> 1
- D -> 2

In [6]:
# Standardise the continuous columns. The scaler is fitted on the training
# frame here and reused (transform only) for the test frame later on.
scaler = StandardScaler()
numeric_cols = [
    'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
]
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [7]:
# The row identifier is not a feature; remove it before modelling.
df = df.drop(columns=['id'])

In [8]:
# Inspect the training frame after encoding, scaling and dropping `id`.
df

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,-1.124041,0,1.000115,1,0,0,0,0,0.046393,0.026450,-0.643483,1.507867,0.497174,1.415277,-1.298130,1.498456,-1.224804,2,2
1,0.789556,1,0.279586,0,0,0,0,0,-0.539986,0.584107,-0.051180,-0.226251,0.333203,0.639068,-0.594472,1.256588,0.499071,2,0
2,1.273628,1,-1.320550,0,0,1,1,2,0.465235,-0.153440,-0.018274,1.068232,-0.438427,0.279361,-1.551446,-0.864404,1.427311,3,2
3,0.797120,1,0.026933,0,0,0,0,0,-0.665638,-0.711097,-0.182803,-0.323947,0.593628,-1.083737,-0.369302,0.084461,0.101253,2,0
4,-1.335822,1,-0.562591,0,0,1,0,0,-0.456217,0.404218,0.310784,-0.226251,-0.014031,0.411885,-0.369302,0.475170,-0.031352,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7900,-0.965205,0,-0.497089,0,0,0,0,0,-0.581870,-0.045506,0.014632,-0.763583,0.555046,-0.894418,2.107571,1.126352,-0.959593,1,0
7901,-0.397933,1,-0.412871,0,0,1,0,0,-0.539986,-0.639141,-0.380237,-0.250675,0.333203,0.828387,-0.875935,0.177487,-0.826987,3,0
7902,-0.307169,0,1.701929,0,0,0,1,1,-0.079259,-1.070876,-1.104164,-0.446068,-0.669916,-1.140533,-1.326276,-0.845798,2.753368,1,2
7903,1.341701,0,1.346343,1,0,1,0,0,-0.623754,-0.819030,-2.288771,-0.910128,-0.496299,-1.310920,0.221770,-0.548115,-0.031352,3,2


### Test data preparation

In [9]:
# Read the test split; the path is relative to the notebook's location.
DATA_PATH_TEST = '../data/test.csv'
df_test = pd.read_csv(filepath_or_buffer=DATA_PATH_TEST)
# Preview the first five rows.
df_test.head(n=5)

Unnamed: 0,id,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,7905,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0
1,7906,2468,D-penicillamine,14975,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0
2,7907,51,Placebo,13149,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0
3,7908,2330,D-penicillamine,20510,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0
4,7909,1615,D-penicillamine,21904,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0


In [10]:
# Same integer casts as applied to the training frame.
for int_col in ('Cholesterol', 'Copper', 'Tryglicerides', 'Platelets', 'Stage'):
    df_test[int_col] = df_test[int_col].astype('int64')

In [11]:
# Keep the test ids aside; they are re-attached to the predictions later.
identifier = df_test.loc[:, 'id']

In [12]:
# The row identifier is not a feature; remove it before encoding.
df_test = df_test.drop(columns=['id'])

In [13]:
# Integer-encode every column of the test frame with a fresh LabelEncoder,
# refit on each column in turn.
# NOTE(review): this encoder is fitted on the test data itself, independently
# of the encoding applied to the training frame, so the codes of numeric
# columns are not guaranteed to match the training codes — confirm this is
# intended.
le = LabelEncoder()
df_test = df_test.apply(lambda column: le.fit_transform(column))
df_test.head(n=5)

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,379,0,226,0,0,1,0,0,9,182,66,53,238,133,39,203,16,1
1,285,0,78,0,0,0,0,0,8,200,144,77,190,133,96,84,10,1
2,2,1,42,0,0,1,0,2,17,4,31,37,111,27,50,74,40,3
3,265,0,254,0,0,0,0,0,3,85,113,31,9,104,8,114,16,1
4,192,0,290,0,0,1,0,0,11,74,32,94,155,103,74,79,8,0


In [14]:
# Apply the scaler fitted on the training frame to the same continuous
# columns of the test frame (transform only, no refit).
test_numeric_cols = [
    'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
]
df_test[test_numeric_cols] = scaler.transform(df_test[test_numeric_cols])

In [15]:
# Inspect the test frame after encoding and scaling.
df_test

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1.008902,0,0.260871,0,0,1,0,0,-0.414333,1.555510,-0.742201,-0.201826,0.603273,0.866251,-0.566326,1.647298,-0.031352,1
1,0.297921,0,-1.124043,0,0,0,0,0,-0.456217,1.879311,1.824448,0.384354,0.140295,0.866251,1.038012,-0.566721,-0.826987,1
2,-1.842585,1,-1.460913,0,0,1,0,2,-0.079259,-1.646522,-1.893902,-0.592613,-0.621689,-1.140533,-0.256717,-0.752773,3.151186,3
3,0.146648,0,0.522881,0,0,0,0,0,-0.665638,-0.189418,0.804370,-0.739159,-1.605517,0.317225,-1.438861,-0.008565,-0.031352,1
4,-0.405496,0,0.859752,0,0,1,0,0,-0.330565,-0.387296,-1.860996,0.799566,-0.197293,0.298293,0.418794,-0.659747,-1.092198,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.630720,1,-1.638706,0,0,0,0,0,-0.372449,-0.081484,-0.544766,0.017991,0.256040,0.771591,-1.635885,-1.143482,-0.031352,2
5267,-0.261788,1,1.336986,0,0,0,0,0,-0.707522,-1.178809,1.462485,0.799566,-1.335447,-0.913350,-0.453741,0.568196,-1.224804,2
5268,0.978647,0,-0.543876,0,0,1,0,0,-0.581870,0.044439,1.462485,-1.374188,0.612919,1.093434,-1.129252,1.610087,0.366465,2
5269,-0.995459,1,-1.750996,0,0,0,0,0,-0.623754,0.224328,0.672747,-0.446068,-1.267929,-1.310920,0.418794,0.493776,-0.561775,0


### Implementing XGBoost

In [16]:
# Separate the encoded `Status` target from the feature columns.
y_train = df['Status'].to_numpy()
df_train = df.drop(columns=['Status'])

In [17]:
# One dict per training row, with missing values replaced by 0, as input for
# the DictVectorizer. All feature columns are used; an earlier variant
# restricted this to the 12-column subset Bilirubin, Copper, N_Days, Stage,
# Hepatomegaly, Prothrombin, SGOT, Edema, Platelets, Age, Cholesterol, Drug.
train_dicts = df_train.fillna(value=0).to_dict(orient='records')

In [18]:
# Learn the feature -> column mapping from the training records, then build
# the dense training matrix with that mapping.
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

In [19]:
# Feature names learned by the DictVectorizer fitted on the training records.
dv.feature_names_

['Age',
 'Albumin',
 'Alk_Phos',
 'Ascites',
 'Bilirubin',
 'Cholesterol',
 'Copper',
 'Drug',
 'Edema',
 'Hepatomegaly',
 'N_Days',
 'Platelets',
 'Prothrombin',
 'SGOT',
 'Sex',
 'Spiders',
 'Stage',
 'Tryglicerides']

In [20]:
# Build the test records the same way as the training ones (all columns,
# missing values replaced by 0) and map them through the already-fitted
# vectorizer so the column layout matches X_train.
val_dicts = df_test.fillna(value=0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [21]:
# Column names of the vectorized matrices, passed to XGBoost below.
features = list(dv.feature_names_)

In [22]:
# Wrap the matrices in XGBoost's DMatrix container; only the training matrix
# carries labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(data=X_val, feature_names=features)

In [23]:
# Booster configuration for the three-class `Status` problem.
xgb_params = {
    # Learning rate and tree-shape constraints.
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight': 10,

    # Multiclass objective over the 3 encoded Status classes.
    'objective': 'multi:softmax',
    'num_class': 3,
    # Use the multiclass log loss; plain 'logloss' is the binary metric and
    # does not match a 3-class objective.
    'eval_metric': 'mlogloss',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# No evaluation set is passed here, so training runs for the full 300 rounds.
model = xgb.train(xgb_params, dtrain, num_boost_round=300)

In [24]:
# Default prediction output of the trained booster for the test matrix.
y_pred = model.predict(data=dval)

In [25]:
# Raw (untransformed) per-class scores for every test row.
margins = model.predict(dval, output_margin=True)

# Row-wise softmax. Subtracting each row's maximum before exponentiating
# leaves the result mathematically unchanged but keeps np.exp from
# overflowing on large margins; the exponentials are computed only once.
shifted_margins = margins - margins.max(axis=1, keepdims=True)
exp_margins = np.exp(shifted_margins)
probabilities = exp_margins / exp_margins.sum(axis=1, keepdims=True)

In [26]:
# Quick look at the encoded training labels.
y_train

array([2, 0, 2, ..., 2, 2, 0])

In [27]:
# Inspect the per-class probabilities derived from the raw margins.
probabilities

array([[0.74175596, 0.03465237, 0.22359167],
       [0.67067605, 0.09585282, 0.23347116],
       [0.02099357, 0.00749365, 0.97151273],
       ...,
       [0.8188512 , 0.01784433, 0.16330451],
       [0.9875772 , 0.00300561, 0.00941717],
       [0.27409932, 0.01827342, 0.70762724]], dtype=float32)

In [28]:
# Label the three probability columns with the submission column names
# (order follows the encoded classes 0, 1, 2).
xgb_class_columns = ['Status_C', 'Status_CL', 'Status_D']
probabilities = pd.DataFrame(data=probabilities, columns=xgb_class_columns)
probabilities

Unnamed: 0,Status_C,Status_CL,Status_D
0,0.741756,0.034652,0.223592
1,0.670676,0.095853,0.233471
2,0.020994,0.007494,0.971513
3,0.943162,0.004345,0.052492
4,0.707885,0.097736,0.194379
...,...,...,...
5266,0.915328,0.050944,0.033728
5267,0.984020,0.001401,0.014579
5268,0.818851,0.017844,0.163305
5269,0.987577,0.003006,0.009417


In [29]:
# Put the saved test ids next to the XGBoost class probabilities.
submission = pd.concat(objs=[identifier, probabilities], axis='columns')
submission

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.741756,0.034652,0.223592
1,7906,0.670676,0.095853,0.233471
2,7907,0.020994,0.007494,0.971513
3,7908,0.943162,0.004345,0.052492
4,7909,0.707885,0.097736,0.194379
...,...,...,...,...
5266,13171,0.915328,0.050944,0.033728
5267,13172,0.984020,0.001401,0.014579
5268,13173,0.818851,0.017844,0.163305
5269,13174,0.987577,0.003006,0.009417


In [30]:
# submission.to_csv('../Testing test data/submission_xgb_all.csv', index=False)

### Implementing Logistic Regression

In [54]:
from sklearn.linear_model import LogisticRegression

In [58]:
# Logistic regression on all prepared features; fit() returns the estimator
# itself, so construction and fitting are chained.
model_lr = LogisticRegression(solver='lbfgs', random_state=1).fit(df_train, y_train)

# Class probabilities for every row of the prepared test frame.
y_pred_lr = model_lr.predict_proba(df_test)

In [59]:
# Inspect the class probabilities predicted by the all-feature logistic regression.
y_pred_lr

array([[0.77720443, 0.02061227, 0.2021833 ],
       [0.84292632, 0.05637802, 0.10069566],
       [0.08318744, 0.03504662, 0.88176594],
       ...,
       [0.82750194, 0.03130779, 0.14119026],
       [0.94957566, 0.03222711, 0.01819723],
       [0.39789801, 0.00939938, 0.59270261]])

In [60]:
# Label the three probability columns with the submission column names.
lr_class_columns = ['Status_C', 'Status_CL', 'Status_D']
y_pred_lr = pd.DataFrame(data=y_pred_lr, columns=lr_class_columns)
y_pred_lr

Unnamed: 0,Status_C,Status_CL,Status_D
0,0.777204,0.020612,0.202183
1,0.842926,0.056378,0.100696
2,0.083187,0.035047,0.881766
3,0.916075,0.013967,0.069958
4,0.791510,0.019742,0.188748
...,...,...,...
5266,0.840312,0.034855,0.124833
5267,0.920316,0.010264,0.069420
5268,0.827502,0.031308,0.141190
5269,0.949576,0.032227,0.018197


In [61]:
# Put the saved test ids next to the logistic-regression probabilities.
submission_lr = pd.concat(objs=[identifier, y_pred_lr], axis='columns')
submission_lr

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.777204,0.020612,0.202183
1,7906,0.842926,0.056378,0.100696
2,7907,0.083187,0.035047,0.881766
3,7908,0.916075,0.013967,0.069958
4,7909,0.791510,0.019742,0.188748
...,...,...,...,...
5266,13171,0.840312,0.034855,0.124833
5267,13172,0.920316,0.010264,0.069420
5268,13173,0.827502,0.031308,0.141190
5269,13174,0.949576,0.032227,0.018197


In [66]:
# submission_lr.to_csv('../Testing test data/submission_lrall.csv', index=False)

### Fewer Features with Logistic Regression

In [63]:
# Feature subset used for the reduced model; the same list selects columns
# for both fitting and prediction.
selected_features = [
    'Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin',
    'SGOT', 'Edema', 'Platelets', 'Age', 'Cholesterol', 'Drug',
]

lr = LogisticRegression(solver='lbfgs', random_state=1)
lr.fit(df_train[selected_features], y_train)

y_lr = lr.predict_proba(df_test[selected_features])

In [64]:
# Inspect the class probabilities predicted by the reduced-feature logistic regression.
y_lr

array([[0.76002861, 0.02328675, 0.21668464],
       [0.82275763, 0.05347717, 0.1237652 ],
       [0.06719744, 0.03313612, 0.89966643],
       ...,
       [0.7924819 , 0.03387657, 0.17364153],
       [0.94904348, 0.02954378, 0.02141275],
       [0.40465442, 0.01073226, 0.58461332]])

In [65]:
# Label the probability columns, then attach the saved test ids.
reduced_class_columns = ['Status_C', 'Status_CL', 'Status_D']
y_lr = pd.DataFrame(data=y_lr, columns=reduced_class_columns)
submission_lr_less = pd.concat(objs=[identifier, y_lr], axis='columns')
submission_lr_less

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.760029,0.023287,0.216685
1,7906,0.822758,0.053477,0.123765
2,7907,0.067197,0.033136,0.899666
3,7908,0.880918,0.013480,0.105601
4,7909,0.794299,0.020632,0.185069
...,...,...,...,...
5266,13171,0.820906,0.042911,0.136182
5267,13172,0.891969,0.010311,0.097720
5268,13173,0.792482,0.033877,0.173642
5269,13174,0.949043,0.029544,0.021413


In [67]:
# submission_lr_less.to_csv('../Testing test data/submission_lrless.csv', index=False)