In [1]:
'''
Perform the following operations using Python on Hepatitis dataset.  
q. Data cleaning(Remove NA, ?, Negative values etc.)  
r. Error correcting(Outlier detection and removal)  
s. Data transformation  
t. Build Data model using regression and Naïve Bayes methods for prediction class 
DIE, LIVE and compare accuracy Prediction. 
'''

'\nPerform the following operations using Python on Hepatitis dataset.  \nq. Data cleaning(Remove NA, ?, Negative values etc.)  \nr. Error correcting(Outlier detection and removal)  \ns. Data transformation  \nt. Build Data model using regression and Naïve Bayes methods for prediction class \nDIE, LIVE and compare accuracy Prediction. \n'

In [26]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Load dataset
data = pd.read_csv('DSBDALExam DataSets/Hepatitis/hepatitis.csv', header=None)

In [28]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,2,30,2,1,2,2,2,2,1,2,2,2,2,2,1.0,85,18,4.0,?,1
1,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
2,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
3,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1


In [29]:
# Add column names
columns = [
    'Class', 'Age', 'Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia',
    'Liver_big', 'Liver_firm', 'Spleen_palpable', 'Spiders', 'Ascites', 'Varices',
    'Bilirubin', 'Alk_Phosphate', 'SGOT', 'Albumin', 'Protime', 'Histology'
]
data.columns = columns

In [30]:
# Convert '?' to NaN
data.replace('?', np.nan, inplace=True)

# Manually define categorical and numerical columns
categorical_cols = ['Sex', 'Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia',
                    'Liver_big', 'Liver_firm', 'Spleen_palpable', 'Spiders',
                    'Ascites', 'Varices', 'Histology']

numerical_cols = ['Age', 'Bilirubin', 'Alk_Phosphate', 'SGOT', 'Albumin', 'Protime']

# Convert numerical columns to float
for col in numerical_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Impute numerical columns with mean
imputer_num = SimpleImputer(strategy='mean')
data[numerical_cols] = imputer_num.fit_transform(data[numerical_cols])

# Impute categorical columns with most frequent
imputer_cat = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer_cat.fit_transform(data[categorical_cols])

# Convert categorical columns to numeric (Label Encoding)
label_enc = LabelEncoder()
for col in categorical_cols + ['Sex']:
    data[col] = label_enc.fit_transform(data[col])

# Convert 'Class' to 0 and 1
data['Class'] = label_enc.fit_transform(data['Class'])


In [31]:
# Handle outliers (e.g., via IQR)
def remove_outliers(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df

data = remove_outliers(data, ['Bilirubin', 'Alk_Phosphate', 'SGOT', 'Albumin', 'Protime'])

In [32]:
# Encode categorical variables (Class, Sex, Yes/No columns)
label_enc = LabelEncoder()
data['Class'] = label_enc.fit_transform(data['Class'])  # LIVE=1, DIE=0
data['Sex'] = label_enc.fit_transform(data['Sex'])

binary_cols = ['Steroid', 'Antivirals', 'Fatigue', 'Malaise', 'Anorexia',
               'Liver_big', 'Liver_firm', 'Spleen_palpable', 'Spiders',
               'Ascites', 'Varices', 'Histology']

for col in binary_cols:
    data[col] = label_enc.fit_transform(data[col])

# Split data
X = data.drop('Class', axis=1)
y = data['Class']

# Normalize numerical columns
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)


In [33]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

In [34]:
# Accuracy comparison
print("Logistic Regression Accuracy:", accuracy_score(y_test, lr_pred))
print("Naïve Bayes Accuracy:", accuracy_score(y_test, nb_pred))

# Detailed report
print("\nLogistic Regression Report:\n", classification_report(y_test, lr_pred, target_names=['DIE', 'LIVE']))
print("Naive Bayes Report:\n", classification_report(y_test, nb_pred, target_names=['DIE', 'LIVE']))

Logistic Regression Accuracy: 0.8666666666666667
Naïve Bayes Accuracy: 0.6666666666666666

Logistic Regression Report:
               precision    recall  f1-score   support

         DIE       0.00      0.00      0.00         1
        LIVE       0.93      0.93      0.93        14

    accuracy                           0.87        15
   macro avg       0.46      0.46      0.46        15
weighted avg       0.87      0.87      0.87        15

Naive Bayes Report:
               precision    recall  f1-score   support

         DIE       0.00      0.00      0.00         1
        LIVE       0.91      0.71      0.80        14

    accuracy                           0.67        15
   macro avg       0.45      0.36      0.40        15
weighted avg       0.85      0.67      0.75        15

