# Cirrhosis Prediction

## Importing required dataset

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from statistics import mode

## Getting dataset

In [2]:
df = pd.read_csv("data/proj75/cirrhosis.csv")
df.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


## Cleaning dataset

In [3]:
df.isna().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [6]:
df["Cholesterol"].fillna(np.mean(df["Cholesterol"]), inplace=True)
df["Copper"].fillna(np.mean(df["Copper"]), inplace=True)
df["Alk_Phos"].fillna(np.mean(df["Alk_Phos"]), inplace=True)
df["SGOT"].fillna(np.mean(df["SGOT"]), inplace=True)
df["Tryglicerides"].fillna(np.mean(df["Tryglicerides"]), inplace=True)
df["Platelets"].fillna(np.mean(df["Platelets"]), inplace=True)
df["Prothrombin"].fillna(np.mean(df["Prothrombin"]), inplace=True)

In [7]:
df.isna().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol        0
Albumin            0
Copper             0
Alk_Phos           0
SGOT               0
Tryglicerides      0
Platelets          0
Prothrombin        0
Stage              6
dtype: int64

In [9]:
df["Drug"].fillna(mode(df["Drug"]), inplace=True)
df["Ascites"].fillna(mode(df["Ascites"]), inplace=True)
df["Hepatomegaly"].fillna(mode(df["Hepatomegaly"]), inplace=True)
df["Spiders"].fillna(mode(df["Spiders"]), inplace=True)

In [10]:
df.isna().sum()

ID               0
N_Days           0
Status           0
Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            6
dtype: int64

In [29]:
df["Stage"].fillna(mode(df["Stage"]), inplace=True)

In [11]:
labels = {}
for column in df.keys():
    if pd.api.types.is_object_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        labels[column] = le.classes_
        

In [12]:
labels

{'Status': array(['C', 'CL', 'D'], dtype=object),
 'Drug': array(['D-penicillamine', 'Placebo'], dtype=object),
 'Sex': array(['F', 'M'], dtype=object),
 'Ascites': array(['N', 'Y'], dtype=object),
 'Hepatomegaly': array(['N', 'Y'], dtype=object),
 'Spiders': array(['N', 'Y'], dtype=object),
 'Edema': array(['N', 'S', 'Y'], dtype=object)}

## Pre-Processing

In [30]:
X = df.drop("Stage", axis=1)
y = df["Stage"].astype("int64") - 1

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Training model

In [32]:
xclf = XGBClassifier()
xclf.fit(X_train.values, y_train.values)
xclf.score(X_test.values, y_test.values)

0.42857142857142855

In [33]:
rclf = RandomForestClassifier()
rclf.fit(X_train, y_train)
rclf.score(X_test, y_test)

0.4166666666666667