In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import xgboost as xgb
import joblib

# Load dataset
df = pd.read_csv("Liver_data.csv")

# Handle missing values
df['Albumin_and_Globulin_Ratio'].fillna(df['Albumin_and_Globulin_Ratio'].mean(), inplace=True)

# Encode categorical feature (Gender)
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])  # Male = 1, Female = 0

# Prepare features and target
X = df.drop('output', axis=1)
y = df['output'] - 1  # Make sure labels are 0 and 1

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Hyperparameter grid
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

# Grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

# Best model and its accuracy
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Tuned Model Accuracy: {accuracy:.2f}")
print("Best Parameters:", grid_search.best_params_)

# Save the tuned model
joblib.dump(best_model, "xgboost_liver_model_tuned.pkl")
print("Tuned model saved as xgboost_liver_model_tuned.pkl")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Albumin_and_Globulin_Ratio'].fillna(df['Albumin_and_Globulin_Ratio'].mean(), inplace=True)


Fitting 3 folds for each of 432 candidates, totalling 1296 fits
Tuned Model Accuracy: 0.75
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'subsample': 0.7}
Tuned model saved as xgboost_liver_model_tuned.pkl


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest, f_classif
import warnings
warnings.filterwarnings("ignore")

# Load dataset
data = pd.read_csv('Liver_data.csv')  # Replace with your actual dataset path

# Print columns to verify
print("Columns in dataset:", data.columns.tolist())

# Define target column
target_col = 'output'
print(f"Using '{target_col}' as target column.")

# Drop rows where target value is missing
data.dropna(subset=[target_col], inplace=True)

# Encode categorical column 'Gender' (if present) to numeric values
if 'Gender' in data.columns:
    le_gender = LabelEncoder()
    data['Gender'] = le_gender.fit_transform(data['Gender'])

# Now fill missing values using median (all columns should be numeric now)
imputer = SimpleImputer(strategy='median')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Remove outliers using the Interquartile Range (IQR) method
Q1 = data_imputed.quantile(0.25)
Q3 = data_imputed.quantile(0.75)
IQR = Q3 - Q1
data_clean = data_imputed[~((data_imputed < (Q1 - 1.5 * IQR)) | (data_imputed > (Q3 + 1.5 * IQR))).any(axis=1)]

# Separate features and target
X = data_clean.drop([target_col], axis=1)
y = data_clean[target_col]

# Remap target values to binary classes [0, 1]
# Assuming in the original dataset 'output' has values 1 and 2, with 2 as the positive class.
y = np.where(y == 2, 1, 0)

# Feature Selection using SelectKBest (keeping all features)
selector = SelectKBest(score_func=f_classif, k='all')
X_selected = selector.fit_transform(X, y)
X = pd.DataFrame(X_selected, columns=X.columns[selector.get_support(indices=True)])

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize models
rf = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train models
rf.fit(X_train, y_train)
ada.fit(X_train, y_train)
gb.fit(X_train, y_train)
xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = rf.predict(X_test)
y_pred_ada = ada.predict(X_test)
y_pred_gb = gb.predict(X_test)
y_pred_xgb = xgb.predict(X_test)

# Define a helper function to print evaluation metrics
def print_results(name, y_true, y_pred):
    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))

# Evaluation of each model
print_results("Random Forest", y_test, y_pred_rf)
print_results("AdaBoost", y_test, y_pred_ada)
print_results("Gradient Boosting", y_test, y_pred_gb)
print_results("XGBoost", y_test, y_pred_xgb)

# XGBoost Hyperparameter Tuning with RandomizedSearchCV
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_random = RandomizedSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    param_distributions=param_grid,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

xgb_random.fit(X_train, y_train)
best_xgb = xgb_random.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test)

print_results("Tuned XGBoost", y_test, y_pred_best_xgb)
print("Best Parameters from RandomizedSearchCV:", xgb_random.best_params_)

# Summary of Model Accuracies
print("\n=== Model Accuracy Comparison ===")
print(f"Random Forest:       {accuracy_score(y_test, y_pred_rf):.2f}")
print(f"AdaBoost:            {accuracy_score(y_test, y_pred_ada):.2f}")
print(f"Gradient Boosting:   {accuracy_score(y_test, y_pred_gb):.2f}")
print(f"XGBoost:             {accuracy_score(y_test, y_pred_xgb):.2f}")
print(f"Tuned XGBoost:       {accuracy_score(y_test, y_pred_best_xgb):.2f}")


Columns in dataset: ['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio', 'output']
Using 'output' as target column.

=== Random Forest ===
Accuracy: 0.603448275862069
Confusion Matrix:
 [[25  8]
 [15 10]]
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.76      0.68        33
           1       0.56      0.40      0.47        25

    accuracy                           0.60        58
   macro avg       0.59      0.58      0.58        58
weighted avg       0.60      0.60      0.59        58


=== AdaBoost ===
Accuracy: 0.6206896551724138
Confusion Matrix:
 [[21 12]
 [10 15]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.64      0.66        33
           1       0.56      0.60      0.58        25

    accuracy              