# Cirrhosis Disease Prediction

## Data Loading

In [1]:
# Importing dataset via kaggle

!kaggle datasets download -d fedesoriano/cirrhosis-prediction-dataset

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/cirrhosis-prediction-dataset
License(s): copyright-authors
Downloading cirrhosis-prediction-dataset.zip to /home/bhxveshhh/ML/Cirrhosis Disease Prediction
  0%|                                               | 0.00/11.0k [00:00<?, ?B/s]
100%|██████████████████████████████████████| 11.0k/11.0k [00:00<00:00, 44.0MB/s]


In [2]:
# Exporting dataset from zipfile

import zipfile
zip_ref = zipfile.ZipFile('/home/bhxveshhh/ML/Cirrhosis Disease Prediction/cirrhosis-prediction-dataset.zip', 'r')
zip_ref.extractall('/home/bhxveshhh/ML/Cirrhosis Disease Prediction')
zip_ref.close()

## Exploratory Data Analysis

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [45]:
df = pd.read_csv('cirrhosis.csv')

In [46]:
df.head(5)

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [47]:
df.shape

(418, 20)

In [48]:
df.size

8360

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   ID             418 non-null    int64  
 1   N_Days         418 non-null    int64  
 2   Status         418 non-null    object 
 3   Drug           312 non-null    object 
 4   Age            418 non-null    int64  
 5   Sex            418 non-null    object 
 6   Ascites        312 non-null    object 
 7   Hepatomegaly   312 non-null    object 
 8   Spiders        312 non-null    object 
 9   Edema          418 non-null    object 
 10  Bilirubin      418 non-null    float64
 11  Cholesterol    284 non-null    float64
 12  Albumin        418 non-null    float64
 13  Copper         310 non-null    float64
 14  Alk_Phos       312 non-null    float64
 15  SGOT           312 non-null    float64
 16  Tryglicerides  282 non-null    float64
 17  Platelets      407 non-null    float64
 18  Prothrombi

In [50]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ID,418.0,209.5,120.810458,1.0,105.25,209.5,313.75,418.0
N_Days,418.0,1917.782297,1104.672992,41.0,1092.75,1730.0,2613.5,4795.0
Age,418.0,18533.351675,3815.845055,9598.0,15644.5,18628.0,21272.5,28650.0
Bilirubin,418.0,3.220813,4.407506,0.3,0.8,1.4,3.4,28.0
Cholesterol,284.0,369.510563,231.944545,120.0,249.5,309.5,400.0,1775.0
Albumin,418.0,3.49744,0.424972,1.96,3.2425,3.53,3.77,4.64
Copper,310.0,97.648387,85.61392,4.0,41.25,73.0,123.0,588.0
Alk_Phos,312.0,1982.655769,2140.388824,289.0,871.5,1259.0,1980.0,13862.4
SGOT,312.0,122.556346,56.699525,26.35,80.6,114.7,151.9,457.25
Tryglicerides,282.0,124.702128,65.148639,33.0,84.25,108.0,151.0,598.0


In [51]:
df.isnull().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [52]:
df.duplicated().sum()

np.int64(0)

In [53]:
df.nunique()

ID               418
N_Days           399
Status             3
Drug               2
Age              344
Sex                2
Ascites            2
Hepatomegaly       2
Spiders            2
Edema              3
Bilirubin         98
Cholesterol      201
Albumin          154
Copper           158
Alk_Phos         295
SGOT             179
Tryglicerides    146
Platelets        243
Prothrombin       48
Stage              4
dtype: int64

## Data Preprocessing

In [54]:
# Fill missing values For numerical columns with median 

df.select_dtypes(include=(['int64', 'float64'])).isna().sum()
df_num_col = df.select_dtypes(include=(['int64', 'float64'])).columns
for c in df_num_col:
    df[c].fillna(df[c].median(), inplace=True)

In [55]:
# Fill missing values For numerical columns with median 

df_cat_col = df.select_dtypes(include=('object')).columns
for c in df_cat_col:
    df[c].fillna(df[c].mode().values[0], inplace=True)

In [56]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df['Sex'] = df['Sex'].replace({'M':0, 'F':1})                               
df['Ascites'] = df['Ascites'].replace({'N':0, 'Y':1})                       
df['Drug'] = df['Drug'].replace({'D-penicillamine':0, 'Placebo':1})          
df['Hepatomegaly'] = df['Hepatomegaly'].replace({'N':0, 'Y':1})             
df['Spiders'] = df['Spiders'].replace({'N':0, 'Y':1})                
df['Edema'] = df['Edema'].replace({'N':0, 'Y':1, 'S':-1})              
df['Status'] = df['Status'].replace({'C':0, 'CL':1, 'D':-1})        
df['Stage'] = le.fit_transform(df['Stage'])

In [57]:
X = df.drop(['Status', 'N_Days', 'Stage'], axis=1)
y = df['Stage']

In [58]:
# Train Test Split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Building

In [59]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [60]:
logistic_clf = LogisticRegression()
ridge_clf = RidgeClassifier()
xgboost_clf = XGBClassifier()
random_forest_clf = RandomForestClassifier()
ada_boost_clf = AdaBoostClassifier()
grad_boost_clf = GradientBoostingClassifier()
bagging_clf = BaggingClassifier()
decision_tree_clf = DecisionTreeClassifier()
svm_clf = SVC()

In [61]:
model_li = [logistic_clf, ridge_clf, xgboost_clf, random_forest_clf, 
            ada_boost_clf, grad_boost_clf, bagging_clf, decision_tree_clf, svm_clf]
scores = []
for model in model_li:
    model.fit(X_train, y_train)
    scores.append(accuracy_score(y_test, model.predict(X_test)))

print(scores)

[0.47619047619047616, 0.47619047619047616, 0.5119047619047619, 0.4880952380952381, 0.5, 0.5357142857142857, 0.5, 0.36904761904761907, 0.42857142857142855]
