In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [123]:
# Read the HCV lab dataset from disk.
data = pd.read_csv('../data/hcvdat0.csv')
# Discard the exported index column if the CSV carries one; errors="ignore"
# keeps this a no-op when the column is absent.
data = data.drop(columns=["Unnamed: 0"], errors="ignore")
# Peek at the last rows to sanity-check the load.
data.tail()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
610,3=Cirrhosis,62,f,32.0,416.6,5.9,110.3,50.0,5.57,6.3,55.7,650.9,68.5
611,3=Cirrhosis,64,f,24.0,102.8,2.9,44.4,20.0,1.54,3.02,63.0,35.9,71.3
612,3=Cirrhosis,64,f,29.0,87.3,3.5,99.0,48.0,1.66,3.63,66.7,64.2,82.0
613,3=Cirrhosis,46,f,33.0,,39.0,62.0,20.0,3.56,4.2,52.0,50.0,71.0
614,3=Cirrhosis,59,f,36.0,,100.0,80.0,12.0,9.07,5.3,67.0,34.0,68.0


In [124]:
# List the dtype of every column to see which are numeric and which are text.
data.dtypes

Category     object
Age           int64
Sex          object
ALB         float64
ALP         float64
ALT         float64
AST         float64
BIL         float64
CHE         float64
CHOL        float64
CREA        float64
GGT         float64
PROT        float64
dtype: object

In [125]:
# Show the dataset dimensions as a (rows, columns) tuple.
print(data.shape)

(615, 13)


In [126]:
# Summarize each column's dtype and non-null count to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  615 non-null    object 
 1   Age       615 non-null    int64  
 2   Sex       615 non-null    object 
 3   ALB       614 non-null    float64
 4   ALP       597 non-null    float64
 5   ALT       614 non-null    float64
 6   AST       615 non-null    float64
 7   BIL       615 non-null    float64
 8   CHE       615 non-null    float64
 9   CHOL      605 non-null    float64
 10  CREA      615 non-null    float64
 11  GGT       615 non-null    float64
 12  PROT      614 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 62.6+ KB


In [127]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Age,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
count,615.0,614.0,597.0,614.0,615.0,615.0,615.0,605.0,615.0,615.0,614.0
mean,47.40813,41.620195,68.28392,28.450814,34.786341,11.396748,8.196634,5.368099,81.287805,39.533171,72.044137
std,10.055105,5.780629,26.028315,25.469689,33.09069,19.67315,2.205657,1.132728,49.756166,54.661071,5.402636
min,19.0,14.9,11.3,0.9,10.6,0.8,1.42,1.43,8.0,4.5,44.8
25%,39.0,38.8,52.5,16.4,21.6,5.3,6.935,4.61,67.0,15.7,69.3
50%,47.0,41.95,66.2,23.0,25.9,7.3,8.26,5.3,77.0,23.3,72.2
75%,54.0,45.2,80.1,33.075,32.9,11.2,9.59,6.06,88.0,40.2,75.4
max,77.0,82.2,416.6,325.3,324.0,254.0,16.41,9.67,1079.1,650.9,90.0


In [128]:
from ydata_profiling import ProfileReport

# Build a ydata-profiling report over the whole frame and write it to an HTML
# file in the current working directory.
profile = ProfileReport(data, title="hcv data Report")
profile.to_file("hcv_data_report.html")

In [129]:
# Partition the column names by dtype: numeric features vs. text (object) columns.
num_cols = data.select_dtypes(include='number').columns
cat_cols = data.select_dtypes(include='object').columns

# Show both groups, one per line.
print(num_cols, cat_cols, sep='\n')

Index(['Age', 'ALB', 'ALP', 'ALT', 'AST', 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT',
       'PROT'],
      dtype='object')
Index(['Category', 'Sex'], dtype='object')


In [130]:
# Bin Age into four ordered bands; intervals are right-closed (pd.cut default),
# so e.g. age 18 falls in 'child' and age 45 in 'middle-aged'.
age_bins = [-np.inf, 18, 30, 45, np.inf]
age_labels = ['child', 'young', 'middle-aged', 'aged']
data['AgeCat'] = pd.cut(data['Age'], bins=age_bins, labels=age_labels)
data.head()

Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT,AgeCat
0,0=Blood Donor,32,m,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0,middle-aged
1,0=Blood Donor,32,m,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5,middle-aged
2,0=Blood Donor,32,m,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3,middle-aged
3,0=Blood Donor,32,m,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7,middle-aged
4,0=Blood Donor,32,m,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7,middle-aged


In [131]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, with a fixed seed for reproducibility.
# The split is stratified on the 'AgeCat' age band so both parts share the same
# age distribution.
# NOTE(review): stratification uses the age band, not the 'Category' column that
# is later used as the target -- confirm this is intended.
train, test = train_test_split(data, test_size=0.2, random_state=7, stratify=data['AgeCat'])

In [132]:
# Compare the age-band mix (in percent) of the training and test parts to
# confirm the stratified split kept them aligned.
for split in (train, test):
    print(split['AgeCat'].value_counts(normalize=True) * 100)

AgeCat
aged           57.317073
middle-aged    41.463415
young           1.219512
child           0.000000
Name: proportion, dtype: float64
AgeCat
aged           57.723577
middle-aged    41.463415
young           0.813008
child           0.000000
Name: proportion, dtype: float64


In [133]:
# Carve a validation set (20%) out of the training portion, stratified on the
# same age band and with the same seed as the train/test split.
train_set, val_set = train_test_split(train, test_size=0.2, random_state=7, stratify=train['AgeCat'])

# 'AgeCat' is a helper column used for stratification, so remove it before
# modelling. Reassign instead of dropping in place: mutating the frames returned
# by the split with inplace=True can emit SettingWithCopyWarning on some pandas
# versions, and `axis=1` is redundant once `columns=` is given.
train_set = train_set.drop(columns=['AgeCat'])
val_set = val_set.drop(columns=['AgeCat'])

In [134]:
# Separate the 'Category' label from the feature columns for both the training
# and the validation frames.
X_train, y_train = train_set.drop(columns=['Category']), train_set['Category']
X_val, y_val = val_set.drop(columns=['Category']), val_set['Category']

In [135]:
# Recompute the numeric / text column lists on the feature frame only, so they
# no longer include the 'Category' target that was split off from X_train.
num_cols = X_train.select_dtypes(include='number').columns
cat_cols = X_train.select_dtypes(include='object').columns

In [136]:
from sklearn.impute import SimpleImputer

# Numeric gaps are filled with the column mean, text gaps with the most
# frequent value.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on the training features only, then reuse the learned
# statistics on the validation features so nothing leaks from validation.
for cols, imputer in ((num_cols, num_imputer), (cat_cols, cat_imputer)):
    X_train[cols] = imputer.fit_transform(X_train[cols])
    X_val[cols] = imputer.transform(X_val[cols])

In [137]:
# Tukey-fence outlier check on Age over the full dataset.
# Quartiles (25th and 75th percentile) and the interquartile range.
Q1, Q3 = data['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 * IQR outside the quartiles.
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose Age lies strictly outside the fences.
is_outlier = data['Age'].lt(lower_bound) | data['Age'].gt(upper_bound)
outliers = data.loc[is_outlier]
print(outliers)


          Category  Age Sex   ALB   ALP   ALT   AST   BIL   CHE  CHOL   CREA  \
317  0=Blood Donor   77   m  52.2  52.2  12.0  23.5  10.9  5.51  4.41  103.0   

      GGT  PROT AgeCat  
317  25.8  67.2   aged  


In [138]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Standardize numeric features and map text categories to ordinal codes.
scaler = StandardScaler()
encoder = OrdinalEncoder()

# Each transformer is fitted on the training features and then applied,
# unchanged, to the validation features.
for cols, transformer in ((num_cols, scaler), (cat_cols, encoder)):
    X_train[cols] = transformer.fit_transform(X_train[cols])
    X_val[cols] = transformer.transform(X_val[cols])

In [139]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier using scikit-learn's default settings.
log_reg = LogisticRegression()

# Fit on the training features against the 'Category' labels; as the last
# expression, the fitted estimator is also displayed as the cell output.
log_reg.fit(X_train, y_train)

In [140]:
# Mean accuracy of the logistic-regression baseline on the validation split.
log_reg.score(X_val, y_val)

0.9090909090909091

In [141]:
# Predicted class labels for the validation features.
# NOTE(review): `pred` is not used by any later cell visible here -- confirm it
# is still needed.
pred = log_reg.predict(X_val)

In [144]:
# Random-forest baseline on the same preprocessed features.
# The forest is randomized (bootstrap samples, feature sub-sampling), so pin
# random_state -- using the seed already used for the data splits -- to make the
# reported validation accuracy reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=7)
rf.fit(X_train, y_train)
# Mean accuracy on the validation split.
print(rf.score(X_val, y_val))



0.9292929292929293


In [145]:
# Gradient-boosting baseline on the same preprocessed features.
# random_state seeds the per-tree estimators and the feature permutation at each
# split; pin it -- using the seed already used for the data splits -- so the
# reported validation accuracy is reproducible across kernel restarts.
gb = GradientBoostingClassifier(random_state=7)
gb.fit(X_train, y_train)
# Mean accuracy on the validation split.
print(gb.score(X_val, y_val))

0.9191919191919192
