In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
# load the dataset
df = pd.read_csv('hepatitis.csv', na_values='?')
df.head()

Unnamed: 0,ID,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,...,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,1,2,30,2,1.0,2,2.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,2,50,1,1.0,2,1.0,2.0,2.0,1.0,...,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,3,2,78,1,2.0,2,1.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,4,2,31,1,,1,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,5,2,34,1,2.0,2,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          155 non-null    int64  
 1   target      155 non-null    int64  
 2   age         155 non-null    int64  
 3   gender      155 non-null    int64  
 4   steroid     154 non-null    float64
 5   antivirals  155 non-null    int64  
 6   fatigue     154 non-null    float64
 7   malaise     154 non-null    float64
 8   anorexia    154 non-null    float64
 9   liverBig    145 non-null    float64
 10  liverFirm   144 non-null    float64
 11  spleen      150 non-null    float64
 12  spiders     150 non-null    float64
 13  ascites     150 non-null    float64
 14  varices     150 non-null    float64
 15  bili        149 non-null    float64
 16  alk         126 non-null    float64
 17  sgot        151 non-null    float64
 18  albu        139 non-null    float64
 19  protime     88 non-null     f

In [4]:
df.describe()

Unnamed: 0,ID,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,...,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
count,155.0,155.0,155.0,155.0,154.0,155.0,154.0,154.0,154.0,145.0,...,150.0,150.0,150.0,150.0,149.0,126.0,151.0,139.0,88.0,155.0
mean,78.0,1.793548,41.2,1.103226,1.506494,1.845161,1.350649,1.603896,1.792208,1.827586,...,1.8,1.66,1.866667,1.88,1.427517,105.325397,85.89404,3.817266,61.852273,1.451613
std,44.888751,0.40607,12.565878,0.30524,0.501589,0.362923,0.47873,0.490682,0.407051,0.379049,...,0.40134,0.475296,0.341073,0.32605,1.212149,51.508109,89.65089,0.651523,22.875244,0.499266
min,1.0,1.0,7.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.3,26.0,14.0,2.1,0.0,1.0
25%,39.5,2.0,32.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,...,2.0,1.0,2.0,2.0,0.7,74.25,31.5,3.4,46.0,1.0
50%,78.0,2.0,39.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.0,85.0,58.0,4.0,61.0,1.0
75%,116.5,2.0,50.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,1.5,132.25,100.5,4.2,76.25,2.0
max,155.0,2.0,78.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,8.0,295.0,648.0,6.4,100.0,2.0


In [5]:
df.target.value_counts()

target
2    123
1     32
Name: count, dtype: int64

In [6]:
# drop the id column
df.drop('ID', axis=1, inplace=True)
df.head()

Unnamed: 0,target,age,gender,steroid,antivirals,fatigue,malaise,anorexia,liverBig,liverFirm,spleen,spiders,ascites,varices,bili,alk,sgot,albu,protime,histology
0,2,30,2,1.0,2,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0,85.0,18.0,4.0,,1
1,2,50,1,1.0,2,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.9,135.0,42.0,3.5,,1
2,2,78,1,2.0,2,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,96.0,32.0,4.0,,1
3,2,31,1,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.7,46.0,52.0,4.0,80.0,1
4,2,34,1,2.0,2,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,,200.0,4.0,,1


In [7]:
# check the missing values
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 target         0
age            0
gender         0
steroid        1
antivirals     0
fatigue        1
malaise        1
anorexia       1
liverBig      10
liverFirm     11
spleen         5
spiders        5
ascites        5
varices        5
bili           6
alk           29
sgot           4
albu          16
protime       67
histology      0
dtype: int64


In [8]:
# drop the missing value by using Simple imputer
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer = imputer.fit(df)
imputer.statistics_
df1 = imputer.transform(df)
data = pd.DataFrame(df1, columns= df.columns)
# check the missing values in the dataset   
print("Missing values in the dataset:\n", data.isnull().sum().sort_values(ascending=False))


Missing values in the dataset:
 target        0
age           0
protime       0
albu          0
sgot          0
alk           0
bili          0
varices       0
ascites       0
spiders       0
spleen        0
liverFirm     0
liverBig      0
anorexia      0
malaise       0
fatigue       0
antivirals    0
steroid       0
gender        0
histology     0
dtype: int64


In [9]:
# split the data into X and y
X = data.drop('target', axis=1)
y = data['target']

In [10]:
# split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Call the model
model = SVC(kernel='rbf')

In [12]:
# Train the model
model.fit(X_train, y_train)

In [13]:
# Predict with the model
y_pred = model.predict(X_test)
y_pred

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])

In [14]:
# evalute the model
# accuracy check

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy:  0.7741935483870968
Confusion Matrix:
 [[ 0  7]
 [ 0 24]]
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         7
         2.0       0.77      1.00      0.87        24

    accuracy                           0.77        31
   macro avg       0.39      0.50      0.44        31
weighted avg       0.60      0.77      0.68        31

