In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report,ConfusionMatrixDisplay,precision_score,f1_score
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [4]:
df=pd.read_csv('data/Liver_Disease_data.csv')

In [5]:
df

Unnamed: 0,Age,Gender,BMI,AlcoholConsumption,Smoking,GeneticRisk,PhysicalActivity,Diabetes,Hypertension,LiverFunctionTest,Diagnosis
0,58,0,35.857584,17.272828,0,1,0.658940,0,0,42.734240,1
1,71,1,30.732470,2.201266,0,1,1.670557,1,0,67.309822,1
2,48,0,19.971407,18.500944,0,0,9.928308,0,0,63.738956,0
3,34,1,16.615417,12.632870,0,0,5.630129,0,0,64.555873,1
4,62,1,16.065830,1.087815,0,1,3.566218,1,0,77.868689,1
...,...,...,...,...,...,...,...,...,...,...,...
1695,42,0,38.498295,14.384688,0,1,0.992289,0,0,97.933961,1
1696,40,0,27.600094,5.431009,0,0,8.390492,0,0,70.260528,1
1697,38,0,38.730017,6.324302,1,2,9.314222,0,1,56.053370,1
1698,67,0,35.820798,16.899417,0,2,3.224453,0,0,26.300875,1


In [6]:
X=df.drop(columns='Diagnosis',axis=1)
y=df["Diagnosis"]

In [14]:
cat_feat=X[["Age","Gender","Smoking","GeneticRisk","Diabetes","Hypertension"]].columns
num_feat=X[["BMI","AlcoholConsumption","PhysicalActivity","LiverFunctionTest"]].columns

In [16]:
num_transformer=StandardScaler()
cat_transformer=OneHotEncoder()


transformer=ColumnTransformer(
    [
        ("cat_transformer",cat_transformer,cat_feat),
        ("num_transformer",num_transformer,num_feat)
    ]
)

In [17]:
X=transformer.fit_transform(X)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((1360, 76), (340, 76))

In [28]:
def get_prediction(y_true,y_pred):
    accuracy=accuracy_score(y_true,y_pred)
    #confusionMTRX=ConfusionMatrixDisplay(y_true,y_pred)
    classification_rpt=classification_report(y_true,y_pred)
    precision=precision_score(y_true,y_pred)
    f1=f1_score(y_true,y_pred)
    
    return accuracy,classification_rpt,precision,f1
    

In [36]:
models={
    "Logistic Regression": LogisticRegression(),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(), 
    "CatBoosting Classifier": CatBoostClassifier(verbose=False),
    "AdaBoost Classifier": AdaBoostClassifier()
}

model_list=[]
acc_list=[]
f1_list=[]
precision_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)


    accuracy,classification_rpt,precision,f1=get_prediction(y_train,y_train_pred)
    accuracy,classification_rpt,precision,f1=get_prediction(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- accuracy: {:.4f}".format(accuracy))
    #print("- confusionMTRX: {:.4f}".format(confusionMTRX))
    #print(f"- classification Score: {classification_rpt}:.4")
    print("- precision Score: {:.4f}".format(precision))
    print("- f1 Score: {:.4f}".format(f1))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- accuracy: {:.4f}".format(accuracy))
    #print("- confusionMTRX: {:.4f}".format(confusionMTRX))
    #print("- classification Score: {:.4f}".format(classification_rpt))
    print("- precision Score: {:.4f}".format(precision))
    print("- f1 Score: {:.4f}".format(f1))

    acc_list.append(accuracy)
    f1_list.append(f1)
    precision_list.append(precision)

    print('='*35)
    print('\n')



pd.DataFrame(list(zip(model_list, acc_list,precision_list,f1_list)), columns=['Model Name', 'Accuracy_Score','Precision_score','F1_score']).sort_values(by=["Accuracy_Score"],ascending=False)

Logistic Regression
Model performance for Training set
- accuracy: 0.8235
- precision Score: 0.8073
- f1 Score: 0.8378
----------------------------------
Model performance for Test set
- accuracy: 0.8235
- precision Score: 0.8073
- f1 Score: 0.8378


K-Neighbors Classifier
Model performance for Training set
- accuracy: 0.8176
- precision Score: 0.8118
- f1 Score: 0.8297
----------------------------------
Model performance for Test set
- accuracy: 0.8176
- precision Score: 0.8118
- f1 Score: 0.8297


Decision Tree
Model performance for Training set
- accuracy: 0.7882
- precision Score: 0.7732
- f1 Score: 0.8065
----------------------------------
Model performance for Test set
- accuracy: 0.7882
- precision Score: 0.7732
- f1 Score: 0.8065


Random Forest Classifier
Model performance for Training set
- accuracy: 0.8588
- precision Score: 0.8155
- f1 Score: 0.8750
----------------------------------
Model performance for Test set
- accuracy: 0.8588
- precision Score: 0.8155
- f1 Score: 0.8



AdaBoost Classifier
Model performance for Training set
- accuracy: 0.8647
- precision Score: 0.8438
- f1 Score: 0.8757
----------------------------------
Model performance for Test set
- accuracy: 0.8647
- precision Score: 0.8438
- f1 Score: 0.8757




Unnamed: 0,Model Name,Accuracy_Score,Precision_score,F1_score
5,CatBoosting Classifier,0.891176,0.854271,0.901857
6,AdaBoost Classifier,0.864706,0.84375,0.875676
3,Random Forest Classifier,0.858824,0.815534,0.875
4,XGBClassifier,0.852941,0.826531,0.86631
0,Logistic Regression,0.823529,0.807292,0.837838
1,K-Neighbors Classifier,0.817647,0.811828,0.82967
2,Decision Tree,0.788235,0.773196,0.806452
