In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
df=pd.read_csv('/content/indian_liver_patient.csv')
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [None]:
df.shape

(583, 11)

In [None]:
df.isna().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [None]:
df['Albumin_and_Globulin_Ratio'].fillna(df['Albumin_and_Globulin_Ratio'].mean(), inplace=True)
#df.dropna(inplace=True)

In [None]:
df.duplicated().sum()

13

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [None]:
df['Dataset'] = df['Dataset'].replace([2], 0)

In [None]:
df['Dataset'].value_counts()

1    406
0    164
Name: Dataset, dtype: int64

In [None]:
from sklearn.utils import resample

#Separate majority and minority classes
majority_class = df[df['Dataset'] == 1]
minority_class = df[df['Dataset'] == 0]

# Upsample minority class
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class), random_state=42)

# Combine majority class with upsampled minority class
balanced_df = pd.concat([majority_class, minority_upsampled])

# Check the class distribution
print(balanced_df['Dataset'].value_counts())


1    406
0    406
Name: Dataset, dtype: int64


In [None]:
X=balanced_df.drop(['Dataset'],axis=1)
y=balanced_df['Dataset']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=42)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier(n_estimators=15)
rf.fit(X_train,y_train)

In [None]:
rf_pred=rf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Accuracy:",accuracy_score(y_test,rf_pred))

Accuracy: 0.8404907975460123


In [None]:
confusion_matrix(y_test,rf_pred)

array([[74,  9],
       [17, 63]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,rf_pred))

              precision    recall  f1-score   support

           0       0.81      0.89      0.85        83
           1       0.88      0.79      0.83        80

    accuracy                           0.84       163
   macro avg       0.84      0.84      0.84       163
weighted avg       0.84      0.84      0.84       163



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test,rf_pred,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test,rf_pred))

Mean squared error: 0.399386031871406
Mean absolute error: 0.15950920245398773


## Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada=AdaBoostClassifier(n_estimators=100, random_state=0)
ada.fit(X_train,y_train)

In [None]:
ada_pred=ada.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
print("Accuracy:",accuracy_score(y_test,ada_pred))

Accuracy: 0.7914110429447853


In [None]:
confusion_matrix(y_test,ada_pred)

array([[72, 11],
       [23, 57]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,ada_pred))

              precision    recall  f1-score   support

           0       0.76      0.87      0.81        83
           1       0.84      0.71      0.77        80

    accuracy                           0.79       163
   macro avg       0.80      0.79      0.79       163
weighted avg       0.80      0.79      0.79       163



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test,ada_pred,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test,ada_pred))

Mean squared error: 0.45671540050146625
Mean absolute error: 0.2085889570552147


## Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb=GradientBoostingClassifier()
gb.fit(X_train,y_train)

In [None]:
gb_pred=gb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,gb_pred)

0.8282208588957055

In [None]:
confusion_matrix(y_test,gb_pred)

array([[76,  7],
       [21, 59]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,gb_pred))

              precision    recall  f1-score   support

           0       0.78      0.92      0.84        83
           1       0.89      0.74      0.81        80

    accuracy                           0.83       163
   macro avg       0.84      0.83      0.83       163
weighted avg       0.84      0.83      0.83       163



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test,gb_pred,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test,gb_pred))

Mean squared error: 0.41446247249213586
Mean absolute error: 0.17177914110429449


## Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier




# Create base classifiers
lr_classifier = LogisticRegression(max_iter=10000)
svm_classifier = SVC()  # Set probability=True for soft voting
knn_classifier = KNeighborsClassifier(n_neighbors=15)
nb_classifier = GaussianNB()
dt_classifier = DecisionTreeClassifier()

# Create a Bagging Classifier with the base classifiers
bagging_lr = BaggingClassifier(estimator=lr_classifier, n_estimators=10, random_state=42)
bagging_svm = BaggingClassifier(estimator=svm_classifier, n_estimators=10, random_state=42)
bagging_knn = BaggingClassifier(estimator=knn_classifier, n_estimators=10, random_state=42)
bagging_nb = BaggingClassifier(estimator=nb_classifier, n_estimators=10, random_state=42)
bagging_dt = BaggingClassifier(estimator=dt_classifier, n_estimators=10, random_state=42)

# Fit the Bagging Classifiers on the training data
bagging_lr.fit(X_train, y_train)
bagging_svm.fit(X_train, y_train)
bagging_knn.fit(X_train, y_train)
bagging_nb.fit(X_train, y_train)
bagging_dt.fit(X_train, y_train)

# Make predictions using the Bagging Classifiers
y_pred_lr = bagging_lr.predict(X_test)
y_pred_svm = bagging_svm.predict(X_test)
y_pred_knn = bagging_knn.predict(X_test)
y_pred_nb = bagging_nb.predict(X_test)
y_pred_dt = bagging_dt.predict(X_test)

# Calculate and print accuracy scores for each Bagging Classifier

print("Bagging Logistic Regression Score:",accuracy_score(y_test, y_pred_lr))
print("Bagging SVM Score:", accuracy_score(y_test, y_pred_svm))
print("Bagging K-Nearest Neighbors Score:", accuracy_score(y_test, y_pred_knn))
print("Bagging Naive Bayes Score:", accuracy_score(y_test, y_pred_nb))
print("Bagging Decision Tree Score:", accuracy_score(y_test, y_pred_dt))

lr=accuracy_score(y_test, y_pred_lr)
svm= accuracy_score(y_test, y_pred_svm)
knn=accuracy_score(y_test, y_pred_knn)
nb=accuracy_score(y_test, y_pred_nb)
dt=accuracy_score(y_test, y_pred_dt)

print("Maximum Accuracy:",max(lr,svm,knn,nb,dt))

Bagging Logistic Regression Score: 0.7361963190184049
Bagging SVM Score: 0.6748466257668712
Bagging K-Nearest Neighbors Score: 0.7116564417177914
Bagging Naive Bayes Score: 0.7177914110429447
Bagging Decision Tree Score: 0.8404907975460123
Maximum Accuracy: 0.8404907975460123


In [None]:
confusion_matrix(y_test,y_pred_dt)

array([[75,  8],
       [18, 62]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_dt))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85        83
           1       0.89      0.78      0.83        80

    accuracy                           0.84       163
   macro avg       0.85      0.84      0.84       163
weighted avg       0.85      0.84      0.84       163



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test,y_pred_dt,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test,y_pred_dt))

Mean squared error: 0.399386031871406
Mean absolute error: 0.15950920245398773


## Boosting

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Define the base classifiers
base_classifiers = [
    ('lr', LogisticRegression(max_iter=10000)),
    ('svm', SVC()),
    ('knn', KNeighborsClassifier(n_neighbors=15)),
    ('nb', GaussianNB()),
    ('dt', DecisionTreeClassifier())

]

# Define the meta-classifier (you can choose any classifier)
#meta_classifier = GaussianNB()

# Create the stacking classifier
stacking_classifier_lr = StackingClassifier(estimators=base_classifiers, final_estimator=LogisticRegression(max_iter=10000))
stacking_classifier_svm = StackingClassifier(estimators=base_classifiers, final_estimator=SVC(probability=True))
stacking_classifier_knn = StackingClassifier(estimators=base_classifiers, final_estimator=KNeighborsClassifier(n_neighbors=15))
stacking_classifier_nb = StackingClassifier(estimators=base_classifiers, final_estimator=GaussianNB())
stacking_classifier_dt = StackingClassifier(estimators=base_classifiers, final_estimator=DecisionTreeClassifier())

# Train the stacking classifier on the training data
stacking_classifier_lr.fit(X_train, y_train)
stacking_classifier_svm.fit(X_train, y_train)
stacking_classifier_knn.fit(X_train, y_train)
stacking_classifier_nb.fit(X_train, y_train)
stacking_classifier_dt.fit(X_train, y_train)

# Make predictions on the test data
#stacking_predictions = stacking_classifier.predict(X_test)
y_pred_lr = stacking_classifier_lr.predict(X_test)
y_pred_svm = stacking_classifier_svm.predict(X_test)
y_pred_knn = stacking_classifier_knn.predict(X_test)
y_pred_nb = stacking_classifier_nb.predict(X_test)
y_pred_dt = stacking_classifier_dt.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score
print("Stacking Logistic Regression Score:", accuracy_score(y_test, y_pred_lr))
print("Stacking SVM Score:", accuracy_score(y_test, y_pred_svm))
print("Stacking K-Nearest Neighbors Score:", accuracy_score(y_test, y_pred_knn))
print("Stacking Naive Bayes Score:", accuracy_score(y_test, y_pred_nb))
print("Stacking Decision Tree Score:", accuracy_score(y_test, y_pred_dt))
lr=accuracy_score(y_test, y_pred_lr)
svm= accuracy_score(y_test, y_pred_svm)
knn=accuracy_score(y_test, y_pred_knn)
nb=accuracy_score(y_test, y_pred_nb)
dt=accuracy_score(y_test, y_pred_dt)

print("Maximum Accuracy:",max(lr,svm,knn,nb,dt))

Stacking Logistic Regression Score: 0.8466257668711656
Stacking SVM Score: 0.8220858895705522
Stacking K-Nearest Neighbors Score: 0.8282208588957055
Stacking Naive Bayes Score: 0.8220858895705522
Stacking Decision Tree Score: 0.7484662576687117
Maximum Accuracy: 0.8466257668711656


In [None]:
confusion_matrix(y_test,y_pred_lr)

array([[74,  9],
       [16, 64]])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.82      0.89      0.86        83
           1       0.88      0.80      0.84        80

    accuracy                           0.85       163
   macro avg       0.85      0.85      0.85       163
weighted avg       0.85      0.85      0.85       163



In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test,y_pred_lr,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test,y_pred_lr))

Mean squared error: 0.39163022499397865
Mean absolute error: 0.15337423312883436


## Deep Learning

In [None]:
from sklearn.neural_network import MLPClassifier
dl=MLPClassifier()
dl.fit(X_train, y_train)

In [None]:
dl_pred = dl.predict(X_test)
print("Accuracy:",accuracy_score(y_test,dl_pred))

Accuracy: 0.7116564417177914


In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix


In [None]:
confusion_matrix(y_test, dl_pred)

array([[80,  3],
       [44, 36]])

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print("Mean squared error:",mean_squared_error(y_test, dl_pred,squared=False))
print("Mean absolute error:",mean_absolute_error(y_test, dl_pred))

Mean squared error: 0.5369763107272132
Mean absolute error: 0.2883435582822086


## Making Predictions

In [None]:
df

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.90,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.00,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.40,1
...,...,...,...,...,...,...,...,...,...,...,...
578,60,1,0.5,0.1,500,20,34,5.9,1.6,0.37,0
579,40,1,0.6,0.1,98,35,31,6.0,3.2,1.10,1
580,52,1,0.8,0.2,245,48,49,6.4,3.2,1.00,1
581,31,1,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [None]:
import pickle
pickle.dump(stacking_classifier_lr,open('liver.pkl','wb'))

In [None]:
model=pickle.load(open('liver.pkl','rb'))

In [None]:
model.predict((np.array([38,1,1.0,0.3,216,21,24,7.3,4.4,1.50]).reshape(1,-1)))



array([0])

In [None]:
model.predict((np.array([31,1,1.3,0.5,184,29,32,6.8,3.4,1.00]).reshape(1,-1)))



array([1])