In [7]:
import matplotlib.pyplot as plt 
import seaborn as sns 
import pandas as pd
import numpy as np
from scipy import stats

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.model_selection import StratifiedKFold



from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [8]:
raw_train = pd.read_csv('train.csv', index_col=0)
test_data = pd.read_csv('test.csv', index_col=0)

In [9]:
# check for missing values
print(raw_train.isnull().sum())
raw_train.shape

index                         0
AFP (ng/mL)                   0
ALP (U/L)                     0
ALT (U/L)                     0
AST (U/L)                     0
Age                           0
Albumin (g/dL)                0
Alcohol_Use (yes/no)          0
Bilirubin (mg/dL)             0
CRP (mg/L)                    0
Diabetes (yes/no)             0
Fibroscan (kPa)               0
GGT (U/L)                     0
Gender                        0
Hemoglobin (g/dL)             0
IL-6 (pg/mL)                  0
Obesity (yes/no)              0
PT/INR                        0
Platelets (10^9/L)            0
RBC (10^12/L)                 0
Serum_Ammonia (μmol/L)        0
Serum_Copper (μg/dL)          0
Serum_Creatinine (mg/dL)      0
Serum_Iron (μg/dL)            0
Serum_Lactate (mmol/L)        0
Serum_Urea (mg/dL)            0
Serum_Zinc (μg/dL)            0
TIBC (μg/dL)                  0
Transferrin_Saturation (%)    0
WBC (10^9/L)                  0
pH                            0
Diagnosi

(703, 32)

In [10]:
#raw_train.head()
print(raw_train['Diagnosis'].value_counts())

Diagnosis
Healthy                      250
Hepatitis                    113
Autoimmune Liver Diseases     79
Drug-induced Liver Injury     73
Cirrhosis                     71
Fatty Liver Disease           62
Liver Cancer                  55
Name: count, dtype: int64


In [11]:
print(test_data.isnull().sum())
raw_train.shape

index                         0
AFP (ng/mL)                   0
ALP (U/L)                     0
ALT (U/L)                     0
AST (U/L)                     0
Age                           0
Albumin (g/dL)                0
Alcohol_Use (yes/no)          0
Bilirubin (mg/dL)             0
CRP (mg/L)                    0
Diabetes (yes/no)             0
Fibroscan (kPa)               0
GGT (U/L)                     0
Gender                        0
Hemoglobin (g/dL)             0
IL-6 (pg/mL)                  0
Obesity (yes/no)              0
PT/INR                        0
Platelets (10^9/L)            0
RBC (10^12/L)                 0
Serum_Ammonia (μmol/L)        0
Serum_Copper (μg/dL)          0
Serum_Creatinine (mg/dL)      0
Serum_Iron (μg/dL)            0
Serum_Lactate (mmol/L)        0
Serum_Urea (mg/dL)            0
Serum_Zinc (μg/dL)            0
TIBC (μg/dL)                  0
Transferrin_Saturation (%)    0
WBC (10^9/L)                  0
pH                            0
dtype: i

(703, 32)

In [15]:
# convert the categorical train data to numerical data
df = raw_train
columns_to_encode = ['Alcohol_Use (yes/no)','Diabetes (yes/no)', 'Gender','Obesity (yes/no)', 'Diagnosis']
le = LabelEncoder()
for col in columns_to_encode:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])
df

Unnamed: 0,index,AFP (ng/mL),ALP (U/L),ALT (U/L),AST (U/L),Age,Albumin (g/dL),Alcohol_Use (yes/no),Bilirubin (mg/dL),CRP (mg/L),...,Serum_Creatinine (mg/dL),Serum_Iron (μg/dL),Serum_Lactate (mmol/L),Serum_Urea (mg/dL),Serum_Zinc (μg/dL),TIBC (μg/dL),Transferrin_Saturation (%),WBC (10^9/L),pH,Diagnosis
0,1124,13.571425,1653.138489,40.405592,45.598427,4,4.477126,1,0.541997,1.002121,...,2.569954,97.242495,2.923385,26.696110,96.124260,315.439318,30.827639,4.180007,6.163600,4
1,1188,13.649342,1940.518305,21.336986,34.064095,55,3.190724,1,1.199063,0.582746,...,3.024642,128.597676,2.719966,23.021218,50.554650,268.492437,47.896200,4.590995,5.244266,1
2,530,10.898943,1557.369920,29.665496,16.044488,30,4.506351,0,0.740952,1.670375,...,2.219844,98.026938,3.267757,20.123808,82.900681,312.107620,31.408057,4.524580,5.769592,4
3,686,13.872275,1273.840525,142.418649,64.204257,0,3.665655,1,1.939879,1.220646,...,1.704282,63.105133,1.937625,36.436855,72.360010,182.506010,34.577016,4.234483,5.982228,2
4,296,10.102457,1461.622515,22.437303,23.940205,59,4.005109,0,0.769535,0.139863,...,2.656926,89.966713,1.979801,20.036074,63.216136,212.261519,42.384844,4.306892,4.837404,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698,664,13.086307,1663.149063,94.844911,81.525503,82,3.893254,1,1.514709,4.102990,...,2.510571,71.561815,3.255364,13.806666,74.751383,331.761651,21.570249,4.239866,6.525677,5
699,1227,12.425079,1580.772607,19.152473,29.765881,8,4.434494,1,2.497997,3.827726,...,3.267194,139.135093,3.741368,24.133150,51.577230,266.947184,52.120832,4.374619,6.071335,1
700,1019,12.887508,1164.383286,116.825787,71.610295,14,4.002324,0,1.760204,2.456043,...,3.056218,141.278410,2.945887,13.363649,95.869655,260.085180,54.320054,4.369082,5.059313,5
701,767,8.947665,1178.449705,7.818343,46.739457,2,3.866326,0,0.552524,4.907847,...,2.322946,112.506385,2.180009,15.892570,72.641767,344.926350,32.617510,4.249171,6.546267,4


In [16]:
# Separate features and target variable
train = df.copy()
X = train.drop(columns=['Diagnosis'])
y = train['Diagnosis']

# Define preprocessing steps
numeric_features = X.select_dtypes(include=['float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model pipelines
# Pipeline A: Regression Analysis with Linear Regression
pipeline_A = Pipeline(steps=[('preprocessor', preprocessor),
                             ('regressor', LinearRegression())])

# Pipeline B: Multi-class Classification Analysis with Random Forest Regressor
pipeline_B = Pipeline(steps=[('preprocessor', preprocessor),
                             ('regressor', RandomForestRegressor())])



In [17]:
pipelines = [pipeline_A, pipeline_B]

for pipe in pipelines:
    scores = cross_val_score(pipe, X, y, cv=5, scoring='neg_mean_absolute_error')
    print(f"Model: {pipe.named_steps['regressor'].__class__.__name__}")
    print("Mean MAE:", -scores.mean())

Model: LinearRegression
Mean MAE: 1.0923438505895717
Model: RandomForestRegressor
Mean MAE: 0.7999239108409321


In [18]:
# Hyperparameters for Pipeline B (Random Forest Regressor)
param_grid_B = {
    'regressor__n_estimators': [50, 100, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4]
}

# Grid search for best hyperparameters
grid_search_B = GridSearchCV(pipeline_B, param_grid_B, cv=5, scoring='neg_mean_absolute_error')
grid_search_B.fit(X, y)

print("Best MAE:", -grid_search_B.best_score_)
print("Best parameters:", grid_search_B.best_params_)


Best MAE: 0.7928181135434968
Best parameters: {'regressor__max_depth': 20, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 50}
