<a href="https://colab.research.google.com/github/Devansharma/Health-App/blob/main/Liver_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load the liver-patient records; the path is relative to the notebook's working directory.
csv_path = "liver_patient.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Print pandas' structural summary of the frame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Summary statistics, transposed so that each feature occupies one row.
df.describe().transpose()

In [None]:
# List the column labels, then count the missing values in each column.
print(df.columns)
df.isna().sum()

In [None]:
# Handle Missing Values
# Fill the gaps in the albumin/globulin ratio with the column mean.
# NOTE(review): the imputer is fitted on the whole frame, i.e. before the
# train/test split performed later in this notebook — consider fitting on the
# training rows only.
ratio_col = ['Albumin_and_Globulin_Ratio']
imputer = SimpleImputer(strategy='mean')
df[ratio_col] = imputer.fit_transform(df[ratio_col])

In [None]:
# Class balance of the target. Per the label convention used in this notebook,
# Dataset == 1 means liver disease and Dataset == 2 means no liver disease.
sns.countplot(data=df, x='Dataset', label='Count')

# Look the counts up by label instead of unpacking value_counts() positionally:
# value_counts() orders its result by frequency, so positional unpacking would
# silently swap the two numbers if class 2 were the larger one, and would raise
# if the column did not contain exactly two distinct values.
class_counts = df['Dataset'].value_counts()
LD = class_counts.get(1, 0)
NLD = class_counts.get(2, 0)
print('Number of patients diagnosed with liver disease: {}'.format(LD))
print('Number of patients not diagnosed with liver disease: {}'.format(NLD))

In [None]:
# Point plot of Age against Gender, split by diagnosis.
# `sns.factorplot` was renamed to `sns.catplot` in seaborn 0.9 and has since
# been removed. factorplot drew a point plot by default whereas catplot
# defaults to a strip plot, so kind="point" is requested explicitly.
sns.catplot(x="Age", y="Gender", hue="Dataset", data=df, kind="point")

In [None]:
# Age histograms on a Gender (rows) x Dataset (columns) grid.
g = sns.FacetGrid(
    data=df,
    row="Gender",
    col="Dataset",
    margin_titles=True,
).map(plt.hist, "Age")
# Leave head-room above the facets for the figure-level title.
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle('Disease by Gender and Age');

In [None]:
# Direct vs. total bilirubin on a Dataset (rows) x Gender (columns) grid.
g = sns.FacetGrid(
    data=df,
    row="Dataset",
    col="Gender",
    margin_titles=True,
).map(plt.scatter, "Direct_Bilirubin", "Total_Bilirubin", edgecolor="w")
g.fig.subplots_adjust(top=0.9)

In [None]:
#df["Albumin_and_Globulin_Ratio"] = df.Albumin_and_Globulin_Ratio.fillna(df['Albumin_and_Globulin_Ratio'].mean())

In [None]:
# Feature frame for the correlation analysis: everything except the Gender
# column and the target.
X = df.drop(columns=['Gender', 'Dataset'])
# Target labels: 1 for liver disease; 2 for no liver disease
y = df['Dataset']

In [None]:
# Pairwise correlation of the features in X, drawn as an annotated heatmap.
liver_corr = X.corr()
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    liver_corr,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 15},
    cmap='coolwarm',
    cbar=True,
    square=True,
    ax=ax,
)
ax.set_title('Correlation between features');

In [None]:
# Joint distribution of total proteins and albumin with a fitted regression line.
# x and y are passed by keyword: recent seaborn releases take `data` as the
# first positional parameter and make x/y keyword-only, so the positional form
# raises a TypeError there.
sns.jointplot(x="Total_Protiens", y="Albumin", data=df, kind="reg")

In [15]:
# Standardise the numeric features in place with StandardScaler.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed later in this notebook, so test rows influence the scaling
# statistics — consider fitting on the training rows only.
numeric_cols = [
    'Age',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Alkaline_Phosphotase',
    'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase',
    'Total_Protiens',
    'Albumin',
    'Albumin_and_Globulin_Ratio',
]
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [19]:
# Preview the first five rows after scaling.
# NOTE(review): the stored output below already shows Gender_Female/Gender_Male,
# which are only created by the get_dummies cell that follows; on a fresh
# top-to-bottom run this cell would still show the single Gender column.
df.head(n=5)

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset,Gender_Female,Gender_Male
0,1.252098,-0.418878,-0.493964,-0.426715,-0.354665,-0.318393,0.29212,0.198969,-0.147898,1,1,0
1,1.066637,1.225171,1.430423,1.682629,-0.091599,-0.034333,0.937566,0.073157,-0.650697,1,0,1
2,1.066637,0.644919,0.931508,0.821588,-0.113522,-0.145186,0.476533,0.198969,-0.179323,1,0,1
3,0.819356,-0.370523,-0.387054,-0.447314,-0.365626,-0.311465,0.29212,0.324781,0.166351,1,0,1
4,1.684839,0.096902,0.183135,-0.393756,-0.294379,-0.176363,0.753153,-0.93334,-1.719144,1,0,1


In [18]:
# Encode Categorical Variables
# Replace the Gender column with one indicator column per category.
# This cell is not re-runnable as-is: once Gender has been replaced, a second
# run cannot find the column.
df = pd.get_dummies(data=df, columns=['Gender'])

In [20]:
# Split the Dataset
from sklearn.model_selection import train_test_split

# Everything except the target is a feature.
y = df['Dataset']
X = df.drop(columns='Dataset')

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Display the preprocessed dataset
# Print the first rows of the training features, then of the training labels.
for train_part in (X_train, y_train):
    print(train_part.head())

          Age  Total_Bilirubin  Direct_Bilirubin  Alkaline_Phosphotase  \
77   1.437558        -0.434996         -0.493964              5.476975   
581 -0.849789        -0.322169         -0.351417             -0.439074   
210 -1.035250        -0.402760         -0.422690             -0.414356   
192  0.942997        -0.160988         -0.315780             -0.076531   
449  0.201154        -0.386642         -0.458327             -0.484392   

     Alamine_Aminotransferase  Aspartate_Aminotransferase  Total_Protiens  \
77                   0.078298                    0.059200       -1.736427   
581                 -0.283418                   -0.269895        0.292120   
210                 -0.332743                   -0.332250       -2.197461   
192                 -0.009391                   -0.204076        0.107706   
449                 -0.299860                   -0.287216       -0.261120   

      Albumin  Albumin_and_Globulin_Ratio  Gender_Female  Gender_Male  
77  -1.310776       

In [40]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class weights inversely proportional to class frequency.
# random_state is pinned so the fitted forest, and every metric computed from
# it in the following cells, is identical on each run. Without it the stored
# accuracy and confusion-matrix outputs can come from different fits and
# disagree with each other.
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced')

In [41]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [42]:
# Accuracy of the fitted model on the held-out test rows, as a percentage
# rounded to two decimals.
accuracy_pct = round(accuracy_score(y_test, model.predict(X_test)) * 100, 2)
print(f"Accuracy is {accuracy_pct}")

Accuracy is 76.07


In [37]:
# Confusion matrix on the test rows (rows: true label, columns: predicted label).
test_predictions = model.predict(X_test)
confusion_matrix(y_test, test_predictions)

array([[78,  9],
       [18, 12]], dtype=int64)

In [None]:
scoring = 'accuracy'

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Candidate classifiers, all with default hyper-parameters.
models = [
    ('CART', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
]

# The fold definition does not depend on the estimator, so build it once.
kfold = KFold(n_splits=10)

# evaluate each model in turn
results = []
names = []

# The loop variable is deliberately NOT called `model`: that name holds the
# fitted RandomForestClassifier from the earlier cell, and rebinding it here
# would leave an unfitted estimator behind, breaking any re-run of the
# accuracy / confusion-matrix cells.
for name, estimator in models:
    cv_results = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "For %s Model:Mean accuracy is %f (Std accuracy is %f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)