In [1]:
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import log_loss

In [2]:
DATA_PATH = '../data/train_processed.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status
0,97,0,305,1,0,0,0,0,20,97,69,123,227,162,13,195,7,2,2
1,350,1,228,0,0,0,0,0,6,128,87,52,210,121,38,182,20,2,0
2,414,1,57,0,0,1,1,2,30,87,88,105,130,102,4,68,27,3,2
3,351,1,201,0,0,0,0,0,3,56,83,48,237,30,46,119,17,2,0
4,69,1,138,0,0,1,0,0,8,118,98,52,174,109,46,140,16,3,0


In [3]:
df.columns

Index(['N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
       'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
       'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage', 'Status'],
      dtype='object')

In [4]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
df[['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos','SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']] = scaler.fit_transform(df[['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos','SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']]) 


In [26]:
df.to_csv('../data/train_processed_scaled.csv', index=False)

In [5]:
df_train, df_val = train_test_split(df, test_size=0.20, random_state=1, stratify=df.Status)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

y_train = df_train.Status.values
y_val = df_val.Status.values

del df_train['Status']
del df_val['Status']

In [6]:
len(df_train), len(df_val), len(y_train), len(y_val)

(6324, 1581, 6324, 1581)

In [7]:
columnas = ['N_Days', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders',
       'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos',
       'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']
resultable = []
for i in range(len(columnas)):
    model = LogisticRegression(solver='lbfgs', random_state=1)
    model.fit(df_train.iloc[:,[i]], y_train)

    y_pred = model.predict_proba(df_val.iloc[:,[i]])

    log_loss_metric = log_loss(y_val, y_pred)

    resultable.append([columnas[i], log_loss_metric])

In [8]:
resultable = pd.DataFrame(resultable, columns=['Feature', 'Logloss'])
resultable

Unnamed: 0,Feature,Logloss
0,N_Days,0.696699
1,Drug,0.776406
2,Age,0.759088
3,Sex,0.768152
4,Ascites,0.744798
5,Hepatomegaly,0.701176
6,Spiders,0.730433
7,Edema,0.71899
8,Bilirubin,0.651714
9,Cholesterol,0.759327


In [9]:
best_result_table = resultable.sort_values('Logloss', ascending=True)
best_result_table.reset_index(drop=True, inplace=True)
best_result_table

Unnamed: 0,Feature,Logloss
0,Bilirubin,0.651714
1,Copper,0.676299
2,N_Days,0.696699
3,Stage,0.700347
4,Hepatomegaly,0.701176
5,Prothrombin,0.704893
6,SGOT,0.712991
7,Edema,0.71899
8,Spiders,0.730433
9,Albumin,0.736426


In [10]:
featuresN = [best_result_table.Feature[0]]
limit = best_result_table.Logloss[0]

In [11]:
for i in range(1, len(best_result_table)):
    featuresN.append(best_result_table.iloc[i, 0])
    datosZ = df_train.loc[:, featuresN]
    modeloZ = LogisticRegression(solver='lbfgs', random_state=1)
    modeloZ.fit(datosZ, y_train)
    
    prediccionesZ = modeloZ.predict_proba(df_val.loc[:, featuresN])
    
    loglossZ = log_loss(y_val, prediccionesZ)

    print(featuresN, loglossZ)
    
    if loglossZ < limit:
        limit = loglossZ
    else:
        del featuresN[-1]

print(len(featuresN))
print(limit)

['Bilirubin', 'Copper'] 0.6265492872346909
['Bilirubin', 'Copper', 'N_Days'] 0.6078800547200114
['Bilirubin', 'Copper', 'N_Days', 'Stage'] 0.5789898132728415
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly'] 0.573000680733139
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin'] 0.556419176196112
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT'] 0.5502498572996756
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema'] 0.5430806860769729
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Spiders'] 0.5444208742331078
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Albumin'] 0.5431398367317275
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Ascites'] 0.5442880137732745
['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Platele

In [12]:
from sklearn.feature_extraction import DictVectorizer
import xgboost as xgb

In [13]:
train_dicts = df_train[['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Platelets', 'Age', 'Cholesterol', 'Drug']].fillna(0).to_dict(orient='records')

In [14]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [15]:
dv.feature_names_

['Age',
 'Bilirubin',
 'Cholesterol',
 'Copper',
 'Drug',
 'Edema',
 'Hepatomegaly',
 'N_Days',
 'Platelets',
 'Prothrombin',
 'SGOT',
 'Stage']

In [16]:
val_dicts = df_val[['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Platelets', 'Age', 'Cholesterol', 'Drug']].fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [17]:
features = dv.feature_names_

In [18]:
dtrain = xgb.DMatrix(X_train, label = y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [19]:
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    'min_child_weight':10,

    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'logloss',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

model = xgb.train(xgb_params, dtrain, num_boost_round = 300 )

In [20]:
y_pred = model.predict(dval)

In [21]:
import numpy as np

In [22]:
probabilities = model.predict(dval, output_margin=True)
probabilities = np.exp(probabilities) / np.sum(np.exp(probabilities), axis=1, keepdims=True)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [24]:
print('AUC',roc_auc_score(y_val, probabilities, multi_class='ovr'))
print('logloss',log_loss(y_val, probabilities))
print('Accuracy',accuracy_score(y_val, y_pred))

AUC 0.8911806785367388
logloss 0.46193818231202044
Accuracy 0.8153067678684377


## Final model

- Standarizing the features
    - With StandardScaler
- Using the best features
    - ['Bilirubin', 'Copper', 'N_Days', 'Stage', 'Hepatomegaly', 'Prothrombin', 'SGOT', 'Edema', 'Platelets', 'Age', 'Cholesterol', 'Drug']
- Using XGBoost
    - Using the best parameters for XGBoost 
        - eta = 0.1
        - max_depth = 3
        - min_child_weight = 10
        - num_boost_round = 300
        - objective = 'multi:softmax'