In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from scipy import stats

In [2]:
df=pd.read_csv('hepatitis.csv')

In [3]:
print(df.shape)
df.head()

(142, 20)


Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [4]:
df.replace('?', np.nan, inplace=True)

df.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [5]:
for col in df.columns:
    if df[col].min() < 0:
        df = df[df[col] >= 0]

df.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [6]:
df.dropna(inplace=True)

df.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,61,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,61,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,61,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [7]:
Q1 = df.quantile(0.25)   # 25th percentile
Q3 = df.quantile(0.75)   # 75th percentile
IQR = Q3 - Q1            # Interquartile Range (middle 50% of data)

# Outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows where any column has a value outside the IQR range
df = df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

df.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,200,4.0,61,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1
6,2,23,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,85,3.81,61,1
7,2,39,1,2,2,1,2,2,2,1,2,2,2,2,0.7,105,48,4.4,61,1
8,2,30,1,2,2,2,2,2,2,2,2,2,2,2,1.0,105,120,3.9,61,1


In [8]:
X = df.drop(columns='histology')  # Assuming 'target' column exists
y = df['histology']

In [9]:
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime
0,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,105.0,200.0,4.0,61.0
1,2.0,34.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.9,95.0,28.0,4.0,75.0
2,2.0,23.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,105.0,85.0,3.81,61.0
3,2.0,39.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.7,105.0,48.0,4.4,61.0
4,2.0,30.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,105.0,120.0,3.9,61.0


In [10]:
# 3. Skewness resolution (using PowerTransformer for centering and normality)
pt = PowerTransformer()
X = pd.DataFrame(pt.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime
0,0.0,-0.159413,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,1.824351,-0.168986,-0.165732
1,0.0,-0.159413,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.044528,0.22717,-1.215702,-0.168986,2.054075
2,0.0,-1.565917,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,0.393738,-0.605057,-0.165732
3,0.0,0.337229,0.0,0.677003,0.0,-1.224745,0.306186,0.0,0.0,-2.198484,0.0,0.171499,0.0,0.0,-0.987107,0.6256,-0.46836,0.842848,-0.165732
4,0.0,-0.611308,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,0.94962,-0.401996,-0.165732


In [11]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime
0,0.0,-0.159413,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,1.824351,-0.168986,-0.165732
1,0.0,-0.159413,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.044528,0.22717,-1.215702,-0.168986,2.054075
2,0.0,-1.565917,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,0.393738,-0.605057,-0.165732
3,0.0,0.337229,0.0,0.677003,0.0,-1.224745,0.306186,0.0,0.0,-2.198484,0.0,0.171499,0.0,0.0,-0.987107,0.6256,-0.46836,0.842848,-0.165732
4,0.0,-0.611308,0.0,0.677003,0.0,0.816497,0.306186,0.0,0.0,0.454859,0.0,0.171499,0.0,0.0,0.426347,0.6256,0.94962,-0.401996,-0.165732


In [12]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# 1. Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

acc_lr

0.8571428571428571

In [14]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           1       0.83      1.00      0.91         5
           2       1.00      0.50      0.67         2

    accuracy                           0.86         7
   macro avg       0.92      0.75      0.79         7
weighted avg       0.88      0.86      0.84         7



In [15]:
# 2. k-NN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

acc_knn

0.7142857142857143

In [16]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           1       0.71      1.00      0.83         5
           2       0.00      0.00      0.00         2

    accuracy                           0.71         7
   macro avg       0.36      0.50      0.42         7
weighted avg       0.51      0.71      0.60         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# 3. Naive-Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

acc_nb

0.2857142857142857

In [18]:
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         5
           2       0.29      1.00      0.44         2

    accuracy                           0.29         7
   macro avg       0.14      0.50      0.22         7
weighted avg       0.08      0.29      0.13         7



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
