### Setup and Imports

In [None]:
!pip install -q xgboost imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier


### Load and Preprocess Data


In [None]:
# Read the training file and the separate test file into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/Training.csv', '/content/Testing.csv')
)

# Dimensions first, then a preview of the first rows of each frame.
print("Raw train shape:", train.shape)
print("Raw test shape: ", test.shape)
display(train.head(), test.head())


Raw train shape: (4920, 134)
Raw test shape:  (42, 133)


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


In [None]:
# Label column shared by both CSVs (see the `prognosis` column in the previews).
target = 'prognosis'


def _drop_unnamed(frame):
    """Return `frame` without pandas' auto-named 'Unnamed: N' columns.

    The training preview shows a trailing, empty 'Unnamed: 133' column that the
    test file does not have; left in place it would make `.astype(int)` fail on
    NaN and give train one more feature than test.
    """
    return frame.loc[:, ~frame.columns.str.startswith('Unnamed')]


train_clean = _drop_unnamed(train)
test_clean = _drop_unnamed(test)

# Features: every remaining column except the label, cast to int.
X = train_clean.drop(columns=[target]).astype(int)
# Strip stray whitespace so the same label compares equal in train and test.
y_raw = train_clean[target].astype(str).str.strip()

# Reindex the test features to the training column order so both frames expose
# the same feature set; a column missing from the test file raises KeyError here.
X_test = test_clean.drop(columns=[target]).astype(int)[X.columns]
y_test_raw = test_clean[target].astype(str).str.strip()

# Fit the encoder on training labels only and reuse it for the test labels;
# transform() raises ValueError if the test file contains an unseen label.
le = LabelEncoder()
y = le.fit_transform(y_raw)
y_test = le.transform(y_test_raw)

print("Classes (label encoder):", list(le.classes_))
print("X shape:", X.shape, "y shape:", y.shape)


NameError: name 'target' is not defined

In [None]:
# Count exact duplicate feature rows. Identical rows can land on both sides of
# any later train/validation split, so the number is worth knowing up front.
n_rows = len(X)
n_duplicates = int(X.duplicated().sum())
print(f"Rows in X: {n_rows} | unique: {n_rows - n_duplicates} | duplicates: {n_duplicates}")

# Per-class row counts, indexed by the encoded class id (ascending).
print("Train class distribution (counts):")
display(pd.Series(y).value_counts().sort_index())
print("Test class distribution (counts):")
display(pd.Series(y_test).value_counts().sort_index())


In [None]:
# Stratified 80/20 hold-out carved from the training data, seeded for repeatability.
# NOTE(review): none of the later cells visible here use X_tr / X_val / y_tr / y_val.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Internal split: train", X_tr.shape, "val", X_val.shape)


### Baseline Model


In [None]:
# Baseline: XGBoost with fixed, hand-picked hyperparameters (no tuning).
baseline_params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'eval_metric': 'mlogloss',
}
xgb = XGBClassifier(**baseline_params)

# Fit on all of X / y, then predict the separate test file.
xgb.fit(X, y)
y_pred = xgb.predict(X_test)

baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline XGBoost Test Accuracy:", baseline_accuracy)
print(classification_report(y_test, y_pred, target_names=le.classes_))


### Baseline Model Metrics

In [None]:
# precision/recall/F1 are not imported by the notebook's first cell, so the
# import stays here to keep this cell runnable on its own.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Encoded ids of every class the encoder knows, in the same order as
# le.classes_. Passing them explicitly keeps the report and the confusion
# matrix aligned with the class names even if a class is absent from the test
# labels and the predictions.
all_labels = le.transform(le.classes_)

# Weighted averages across classes. zero_division=0 yields the same 0.0 score
# as the default for a class with no predicted/true samples, without the warning.
weighted = {'average': 'weighted', 'zero_division': 0}

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

print("=== Baseline Model Evaluation ===")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1 Score : {f1:.4f}")

# Report rows use the class names instead of bare integer ids.
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    labels=all_labels,
    target_names=le.classes_,
    zero_division=0,
))

# Confusion matrix with named, readable axes (rows = true, columns = predicted).
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=all_labels),
    annot=False,
    cmap="Blues",
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
plt.show()


### Cross Validation

In [None]:
# 5-fold stratified CV (shuffled, seeded) of the baseline estimator on X / y.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

mean_acc = cv_scores.mean()
std_acc = cv_scores.std()
print("CV scores per fold:", cv_scores)
print("CV mean: {:.4f}, std: {:.4f}".format(mean_acc, std_acc))

# One bar per fold, with the mean accuracy drawn as a dashed reference line.
fig, ax = plt.subplots(figsize=(7, 4))
fold_numbers = range(1, len(cv_scores) + 1)
ax.bar(fold_numbers, cv_scores)
ax.axhline(mean_acc, color='r', linestyle='--', label=f'Mean {mean_acc:.4f}')
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.set_title('5-fold CV accuracies')
ax.legend()
plt.show()


### Hyperparameter Tuning


In [None]:
# Exhaustive search space: 3 * 3 * 3 * 2 * 2 * 3 = 324 candidate settings.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1, 0.3]
}

# Parallelise at one level only: GridSearchCV fans the fits out across cores
# (n_jobs=-1), so each XGBoost fit is pinned to a single thread. XGBoost's docs
# advise choosing one level to parallelise when it is combined with grid search.
xgb_base = XGBClassifier(random_state=42, eval_metric='mlogloss', n_jobs=1)

# 324 candidates x 3 folds = 972 fits; verbose=1 prints a summary instead of
# one log line per fit.
grid = GridSearchCV(xgb_base, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid.fit(X, y)

print("Best params:", grid.best_params_)
print("Best CV (grid) score:", grid.best_score_)
# With the default refit=True, this is the best configuration refitted on all of X / y.
best_xgb = grid.best_estimator_


### Tuned Model Metrics


In [None]:
# Score the tuned estimator on the separate test file.
y_pred_best = best_xgb.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_best)
print("Tuned XGBoost Test Accuracy:", tuned_accuracy)
print(classification_report(y_test, y_pred_best, target_names=le.classes_))

# Confusion matrix labelled with the class names on both axes.
cm = confusion_matrix(y_test, y_pred_best)
class_names = le.classes_
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    cm,
    cmap='Blues',
    annot=False,
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix (Tuned XGBoost)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
plt.show()


In [None]:
import joblib

# Persist the tuned XGBoost model.
joblib.dump(best_xgb, 'xgb_tuned_model.pkl')

# The model predicts encoded class ids; save the fitted LabelEncoder alongside
# it so those ids can be mapped back to class names when the model is reloaded.
joblib.dump(le, 'label_encoder.pkl')

saved_files = ['xgb_tuned_model.pkl', 'label_encoder.pkl']

# google.colab only exists inside Colab. Import it lazily so the cell still
# saves the artifacts (and does not raise) in any other Jupyter environment.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; files saved locally:", saved_files)
else:
    # Download the files to your local machine
    for saved_path in saved_files:
        files.download(saved_path)