# Multiclass Prediction of Cirrhosis Outcomes

### Importing libraries for the project

In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below -- consider narrowing the filter.
warnings.filterwarnings("ignore")

### Loading the data

In [3]:
df_train = pd.read_csv('data/train.csv')   # Synthetic data
df_original = pd.read_csv('data/cirrhosis.csv')  # Original dataset

### Concatenate both datasets

In [4]:
# Stack the rows of both frames (axis=0) into a single working frame
df = pd.concat(objs=[df_train, df_original], axis=0)

In [5]:
# Remove both identifier columns ('id' and 'ID'); the result is rebound to
# `df` rather than mutating the frame in place.
df = df.drop(columns=['id', 'ID'])

In [6]:
# Replace the index left over from the concat with a fresh 0..n-1 range
df = df.reset_index(drop=True)

In [7]:
# Drop every row that has at least one null value
df = df.dropna(axis=0, how='any')

In [8]:
def process_day_columns(input_df):
    """Derive month/year features from the day-count columns.

    The input frame is copied first, so ``input_df`` itself is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with ``N_Days`` and ``Age`` columns. ``Age`` is presumably
        expressed in days (it is floor-divided by 365.25) -- confirm against
        the data dictionary.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where ``Age`` holds whole years and a new
        ``N_Months`` column holds the number of calendar-month boundaries
        crossed between 1986-07-01 and 1986-07-01 + ``N_Days``.
    """
    from datetime import datetime

    # Fixed reference date that the N_Days offsets are added to
    origin = datetime(1986, 7, 1)

    out = input_df.copy()

    # Temporary column: calendar date reached after N_Days (removed before returning)
    out['date'] = origin + pd.to_timedelta(out['N_Days'], unit='D')
    reached = out['date'].dt

    # Month difference computed from calendar year/month components
    out['N_Months'] = 12 * (reached.year - origin.year) + (reached.month - origin.month)

    # Convert Age to whole years
    out['Age'] = out['Age'].floordiv(365.25)

    return out.drop(columns='date')


In [9]:
# Add N_Months and convert Age to years on a copy of the cleaned frame
df_proc = df.pipe(process_day_columns)

In [10]:
# Preview the first five processed rows
df_proc.head(5)

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status,N_Months
0,999,D-penicillamine,58.0,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D,32
1,2574,Placebo,52.0,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C,84
2,3428,Placebo,37.0,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D,112
3,2576,Placebo,50.0,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C,84
4,788,Placebo,45.0,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C,25


##### Making 'Stage' a category

In [11]:
# Store Stage with pandas' categorical dtype
df_proc['Stage'] = df_proc['Stage'].astype('category')

In [12]:
def feature_eng_num_to_cat(df):
    """Add "Y"/"N" flag columns telling whether each lab value is in range.

    Works on a copy; the frame passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Platelets``, ``Cholesterol``,
        ``Tryglicerides``, ``SGOT``, ``Copper``, ``Bilirubin`` and ``Albumin``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with seven extra ``normal_*`` columns appended, each
        holding "Y" when the rule below is satisfied and "N" otherwise.
    """
    # (flag column, source column, predicate that yields "Y")
    range_checks = (
        # Written as `not v < 150` (not `v >= 150`) so a NaN value still yields "Y".
        # NOTE(review): only a lower bound is applied to Platelets -- confirm
        # that no upper bound is intended.
        ('normal_patelets', 'Platelets', lambda v: not v < 150),
        ('normal_cholesterol', 'Cholesterol', lambda v: v < 201),
        ('normal_tryglicerides', 'Tryglicerides', lambda v: v < 151),
        ('normal_SGOT', 'SGOT', lambda v: v < 41),
        ('normal_copper', 'Copper', lambda v: 62 <= v <= 140),
        ('normal_bilirubin', 'Bilirubin', lambda v: 0.2 <= v <= 1.2),
        ('normal_albumin', 'Albumin', lambda v: 3.4 <= v <= 5.4),
    )

    flagged = df.copy()
    for flag_column, lab_column, in_range in range_checks:
        # Bind the predicate as a default argument so each lambda keeps its own rule
        flagged[flag_column] = flagged[lab_column].map(
            lambda value, ok=in_range: "Y" if ok(value) else "N"
        )

    return flagged

In [13]:
# Append the "Y"/"N" lab-range flag columns
df_new_features = df_proc.pipe(feature_eng_num_to_cat)

In [14]:
# Preview the first five rows including the new flag columns
df_new_features.head(5)

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,...,Stage,Status,N_Months,normal_patelets,normal_cholesterol,normal_tryglicerides,normal_SGOT,normal_copper,normal_bilirubin,normal_albumin
0,999,D-penicillamine,58.0,M,N,N,N,N,2.3,316.0,...,3.0,D,32,Y,N,Y,N,N,N,N
1,2574,Placebo,52.0,F,N,N,N,N,0.9,364.0,...,3.0,C,84,Y,N,Y,N,Y,Y,Y
2,3428,Placebo,37.0,F,N,Y,Y,Y,3.3,299.0,...,4.0,D,112,Y,N,Y,N,Y,N,Y
3,2576,Placebo,50.0,F,N,N,N,N,0.6,256.0,...,3.0,C,84,Y,N,Y,N,N,Y,Y
4,788,Placebo,45.0,F,N,Y,N,N,1.1,346.0,...,4.0,C,25,Y,N,Y,N,Y,Y,Y


## Preparing dataset

### Converting Target variable to numerical

In [15]:
# Encode the target labels as integers: D -> 0, C -> 1, CL -> 2.
# NOTE(review): this overwrites Status in place; running the cell a second time
# would map the already-encoded integers to NaN because they are not keys here.
df_new_features['Status'] = df_new_features['Status'].map(
    {"D": 0, "C": 1, "CL": 2}
)

### Split the data and standardize

In [16]:
# Target vector and feature frame (everything except the target column)
y = df_new_features['Status']
X = df_new_features.drop(columns=['Status'])

In [17]:
# 80/20 hold-out split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

### DictVectorizer

In [19]:
# sparse=False asks the vectorizer for dense arrays instead of SciPy sparse matrices
dv = DictVectorizer(sparse=False)

In [20]:
# One {column: value} record per row, for both partitions
train_dict, test_dict = (
    part.to_dict(orient='records') for part in (X_train, X_test)
)

# Learn the feature mapping on the training records only, then reuse it for test.
# NOTE(review): X_train / X_test are rebound from DataFrames to the vectorizer's
# output here, so this cell cannot be re-run without re-running the split.
X_train = dv.fit_transform(train_dict)
X_test = dv.transform(test_dict)

### Scaler

In [22]:
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
from sklearn.preprocessing import RobustScaler

# Scaler instance; it is fitted in the next cell
sc = RobustScaler()

In [23]:
# Fit the scaler on the training matrix only, then apply it to both partitions.
# NOTE(review): the inputs are overwritten, so re-running this cell would scale
# already-scaled data.
X_train = sc.fit(X_train).transform(X_train)
X_test = sc.transform(X_test)

In [25]:
# Keyword arguments for XGBClassifier (presumably the outcome of a
# hyperparameter search that is not part of this notebook -- confirm).
best_params = dict(
    booster='gbtree',
    max_depth=11,
    learning_rate=0.04216709720284281,
    n_estimators=504,
    min_child_weight=1,
    subsample=0.3437341948452076,
    colsample_bylevel=0.9831950290382536,
    colsample_bytree=0.1720049167961935,
    colsample_bynode=0.7420031757532206,
    reg_alpha=0.9242740053249385,
    reg_lambda=0.9419384081768526,
    eval_metric='mlogloss',
)


In [26]:
# Train the classifier with the parameters above (fit returns the estimator itself)
xgb = XGBClassifier(**best_params).fit(X_train, y_train)

# Per-class probabilities for the hold-out set, one column per class
xgb_pred = xgb.predict_proba(X_test)

In [27]:
# Multiclass log loss on the hold-out set
log_loss(y_true=y_test, y_pred=xgb_pred)

0.42492633381525924

In [28]:
# One-vs-rest ROC AUC on the hold-out set
roc_auc_score(y_true=y_test, y_score=xgb_pred, multi_class='ovr')

0.8929783934335574

### Export model, dv and scaler

In [29]:
import pickle

In [35]:
# Persist the fitted model, vectorizer and scaler, in that order
for artifact_path, artifact in (
    ('xgb_model.bin', xgb),
    ('dv_model.bin', dv),
    ('scaler.bin', sc),
):
    with open(artifact_path, 'wb') as f_out:
        pickle.dump(artifact, f_out)

### Loading model and testing it

In [36]:
# Reload the three artifacts written by the export cell.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by a trusted source (here, the cell above).
with open('xgb_model.bin', 'rb') as f_in:
    xgb_model = pickle.load(f_in)

with open('dv_model.bin', 'rb') as f_in:
    dv_model = pickle.load(f_in)

with open('scaler.bin', 'rb') as f_in:
    scaler_model = pickle.load(f_in)

In [32]:
# Single example record with the raw feature columns (no Status).
# Age is presumably in days, as in the training data -- confirm.
patient = {
    # Follow-up, treatment and demographics
    'N_Days': 3839, 'Drug': 'D-penicillamine', 'Age': 19724, 'Sex': 'F',
    # Clinical signs
    'Ascites': 'N', 'Hepatomegaly': 'Y', 'Spiders': 'N', 'Edema': 'N',
    # Lab measurements
    'Bilirubin': 1.2, 'Cholesterol': 546.0, 'Albumin': 3.37,
    'Copper': 65.0, 'Alk_Phos': 1636.0, 'SGOT': 151.9,
    'Tryglicerides': 90.0, 'Platelets': 430.0, 'Prothrombin': 10.6,
    # Disease stage
    'Stage': 2.0,
}

In [33]:
# Wrap the record in a one-row frame so the preprocessing functions can be reused.
# NOTE(review): this rebinds `X`, which earlier held the training feature frame.
X = pd.DataFrame(data=[patient])

In [34]:
# Display the one-row patient frame before preprocessing
X

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,3839,D-penicillamine,19724,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0


In [40]:
# Apply the same preprocessing steps, in the same order, as for the training data
df_test = (
    X.pipe(process_day_columns)
    .astype({'Stage': 'category'})
    .pipe(feature_eng_num_to_cat)
)

In [41]:
# Display the patient row after preprocessing and feature engineering
df_test

Unnamed: 0,N_Days,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,...,Prothrombin,Stage,N_Months,normal_patelets,normal_cholesterol,normal_tryglicerides,normal_SGOT,normal_copper,normal_bilirubin,normal_albumin
0,3839,D-penicillamine,54.0,F,N,Y,N,N,1.2,546.0,...,10.6,2.0,126,Y,N,Y,N,Y,Y,N


In [43]:
# One {column: value} record per row, matching the vectorizer's input format
df_test_dict = df_test.to_dict('records')

# Transform only: the reloaded vectorizer keeps the mapping learned at training time.
# NOTE(review): `df_test` is rebound from a DataFrame to the vectorizer output here.
df_test = dv_model.transform(df_test_dict)

In [44]:
# Apply the reloaded scaler (transform only, no refit) to the vectorized row
df_test = scaler_model.transform(df_test)

In [48]:
# Per-class probabilities for the patient row
proba = xgb_model.predict_proba(df_test)

# Predicted class label for the same row
prediction = xgb_model.predict(df_test)

In [49]:
# Predicted class code (Status was encoded as D -> 0, C -> 1, CL -> 2)
prediction

array([0])

In [50]:
# Predicted probability for each class, one column per class
proba

array([[0.576682  , 0.4033763 , 0.01994173]], dtype=float32)