In [16]:
import os
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plot
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Hide only deprecation-style notices. Runtime warnings raised by the libraries
# (scikit-learn's ConvergenceWarning, for instance, is a UserWarning subclass)
# stay visible so a problematic fit is not silently accepted.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read data

In [17]:
# Load the liver-patient records from the CSV file in the working directory.
data1 = pd.read_csv(filepath_or_buffer="indian_liver_patient_dataset.csv")

In [7]:
# Display the loaded frame; the recorded output below elides the middle rows.
data1

Unnamed: 0.1,Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,32,Male,0.7,0.2,165,31,29,6.1,3.0,0.96,2
496,496,24,Male,1.0,0.2,189,52,31,8.0,4.8,1.50,1
497,497,67,Male,2.2,1.1,198,42,39,7.2,3.0,0.70,1
498,498,68,Male,1.8,0.5,151,18,22,6.5,4.0,1.60,1


# Data Engineering

In [9]:
# Show the first five records of the loaded frame.
data1.head(n=5)

Unnamed: 0.1,Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [10]:
# Column dtypes and non-null counts of the loaded frame.
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  500 non-null    int64  
 1   Age                         500 non-null    int64  
 2   Gender                      500 non-null    object 
 3   Total_Bilirubin             500 non-null    float64
 4   Direct_Bilirubin            500 non-null    float64
 5   Alkaline_Phosphotase        500 non-null    int64  
 6   Alamine_Aminotransferase    500 non-null    int64  
 7   Aspartate_Aminotransferase  500 non-null    int64  
 8   Total_Protiens              500 non-null    float64
 9   Albumin                     500 non-null    float64
 10  Albumin_and_Globulin_Ratio  496 non-null    float64
 11  Liver_Problem               500 non-null    int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 47.0+ KB


In [11]:
# Element-wise missing-value mask: True wherever a cell of the frame is missing.
data1.isnull()

Unnamed: 0.1,Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
495,False,False,False,False,False,False,False,False,False,False,False,False
496,False,False,False,False,False,False,False,False,False,False,False,False
497,False,False,False,False,False,False,False,False,False,False,False,False
498,False,False,False,False,False,False,False,False,False,False,False,False


In [12]:
# Count the missing entries in each column (sum the boolean mask down the rows).
data1.isnull().sum(axis=0)

Unnamed: 0                    0
Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Liver_Problem                 0
dtype: int64

In [26]:
# Remove the "Unnamed: 0" column; in the recorded preview its values simply
# repeat the row index, so it carries no information of its own.
df = data1.drop(columns=["Unnamed: 0"])

In [27]:
# Sanity check: the frame should have one column fewer than the loaded data.
df.shape

(500, 11)

In [28]:
# Class balance of the target column.
data1.Liver_Problem.value_counts()

1    350
2    150
Name: Liver_Problem, dtype: int64

# Model

In [29]:
# Separate the predictors (every column except the target) from the target.
X, y = df.drop(columns=["Liver_Problem"]), df["Liver_Problem"]

In [30]:
# Column-wise preprocessing. The output columns follow the order of the
# transformers below, with the untouched (passthrough) columns appended last.
ct = ColumnTransformer(
    transformers=[
        # Replace missing Albumin_and_Globulin_Ratio values with the column mean.
        (
            "impute_missing_values_on_Albumin_and_Globulin_Ratio",
            SimpleImputer(missing_values=np.nan, strategy="mean"),
            ["Albumin_and_Globulin_Ratio"],
        ),
        # Despite the step name, this step one-hot encodes Gender.
        ("LabelEncoder_for_gender", OneHotEncoder(), ["Gender"]),
    ],
    remainder="passthrough",
)

In [31]:
X = ct.fit_transform(X)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [33]:
# Logistic-regression classifier with the library's default hyperparameters.
# NOTE(review): the preprocessing step does not scale the numeric features;
# confirm the solver converges within its default iteration budget.
model = LogisticRegression()

In [34]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

In [35]:
# Predicted class labels for the held-out split.
pred = model.predict(X_test)

# Model accuracy

In [36]:
# Pin the label order so the matrix is always 2x2 and ravel() yields exactly
# four counts, even if the test labels or predictions contain a single class.
# With labels=[1, 2], class 1 is treated as "negative" and class 2 as "positive".
# NOTE(review): confirm which Liver_Problem code denotes disease; if it is 1,
# the "positive" counts printed here describe the non-disease class.
tn, fp, fn, tp = confusion_matrix(y_test, pred, labels=[1, 2]).ravel()
print("True Positive", tp)
print("True Negative", tn)
print("False Positive", fp)
print("False Negative", fn)

True Positive 7
True Negative 60
False Positive 6
False Negative 27


In [37]:
# Fraction of held-out rows classified correctly.
# NOTE(review): in the recorded run 66 of the 100 test rows belong to class 1
# (true negatives + false positives), so always predicting class 1 would score
# 0.66; accuracy alone says little about this model.
accuracy_score(y_test, pred)

0.67