# Merge + split dataset

In [None]:
import os
import shutil
import random

# Paths
source_dir = '/kaggle/working/merged_dataset'  # merged folder
dest_dir = '/kaggle/working/merged_dataset_split'  # new split folder
classes = ['Benign', 'Malignant']

# Create one folder per (split, class) pair; exist_ok keeps the cell re-runnable
for split in ['train', 'val', 'test']:
    for cls in classes:
        os.makedirs(os.path.join(dest_dir, split, cls), exist_ok=True)

# Split ratio
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15  # informational: the test split receives whatever remains after train and val

# A fixed seed plus a sorted listing makes the split deterministic. Without
# this, re-running the cell copies a *different* random split into the
# already-populated folders, so the same image can land in several splits.
split_seed = 42
rng = random.Random(split_seed)

# Split images
for cls in classes:
    cls_path = os.path.join(source_dir, cls)
    # os.listdir returns entries in arbitrary order and also lists
    # sub-directories, which shutil.copy cannot copy; keep regular files only.
    images = sorted(
        name for name in os.listdir(cls_path)
        if os.path.isfile(os.path.join(cls_path, name))
    )
    rng.shuffle(images)

    total = len(images)
    train_end = int(total * train_ratio)
    val_end = train_end + int(total * val_ratio)

    train_images = images[:train_end]
    val_images = images[train_end:val_end]
    test_images = images[val_end:]  # remainder, so no image is dropped by rounding

    # Copy every image into the folder of the split it was assigned to
    for split, split_images in (
        ('train', train_images),
        ('val', val_images),
        ('test', test_images),
    ):
        split_cls_dir = os.path.join(dest_dir, split, cls)
        for img in split_images:
            shutil.copy(os.path.join(cls_path, img), split_cls_dir)

print("✅ Dataset successfully split into Train, Validation, Test!")


# Overview of Dataset + Display train images

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import random
import cv2

# === Root folder that holds the train/val/test split ===
split_base_dir = '/kaggle/working/merged_dataset_split'  # <-- your split folder here

# Step 1: count the directory entries of every split/class folder
split_dirs = ['train', 'val', 'test']
classes = ['Benign', 'Malignant']

# counts[split] is a list of entry counts in the same order as `classes`
counts = {}
for split in split_dirs:
    counts[split] = [
        len(os.listdir(os.path.join(split_base_dir, split, cls)))
        for cls in classes
    ]

# Grouped bar chart: one group per class, one bar per split
x = np.arange(len(classes))  # centre of each class group
width = 0.25  # width of a single bar

fig, ax = plt.subplots(figsize=(10,6))
bar_specs = [
    (-width, 'train', 'Train'),
    (0, 'val', 'Validation'),
    (width, 'test', 'Test'),
]
rects1, rects2, rects3 = [
    ax.bar(x + shift, counts[split_key], width, label=legend_name)
    for shift, split_key, legend_name in bar_specs
]

# Axis labels, title, ticks and legend
ax.set_title('Number of Images per Class and Split')
ax.set_ylabel('Number of Images')
ax.set_xticks(x)
ax.set_xticklabels(classes)
ax.legend()

# Write each bar's count just above it
for bars in (rects1, rects2, rects3):
    for rect in bars:
        height = rect.get_height()
        bar_centre = rect.get_x() + rect.get_width() / 2
        ax.annotate(
            f'{height}',
            xy=(bar_centre, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

plt.show()

# ---------------------------------------------
# Step 2: Randomly show 5 train images from each class

def show_random_images(base_dir, classes, n_images=5):
    """Show a grid of randomly chosen training images, one row per class.

    Args:
        base_dir: Root of the split dataset; images are read from
            ``<base_dir>/train/<class>``.
        classes: Class folder names. Each class gets one row of the grid.
        n_images: Maximum number of images drawn per class. A class with
            fewer files simply fills fewer cells of its row.
    """
    classes = list(classes)
    n_rows = len(classes)

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # class or n_images == 1, so axes[i, j] indexing is always valid.
    fig, axes = plt.subplots(
        n_rows, n_images, figsize=(n_images*3, 3 * n_rows), squeeze=False
    )
    fig.suptitle('Random Train Images: Benign vs Malignant', fontsize=16)

    # Hide every axis up front so cells that stay empty render blank.
    for ax in axes.ravel():
        ax.axis('off')

    for i, cls in enumerate(classes):
        cls_dir = os.path.join(base_dir, 'train', cls)
        images = os.listdir(cls_dir)
        # random.sample raises ValueError when asked for more items than exist.
        selected_imgs = random.sample(images, min(n_images, len(images)))

        for j, img_name in enumerate(selected_imgs):
            img_path = os.path.join(cls_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals an unreadable / non-image file by
                # returning None instead of raising; leave the cell blank.
                continue
            # OpenCV loads BGR, matplotlib expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

            axes[i, j].imshow(img)
            axes[i, j].set_title(cls)

    plt.tight_layout()
    plt.show()

# Preview a random sample of training images for every class
show_random_images(base_dir=split_base_dir, classes=classes)


# Preprocessing

# Display Preprocessed images

# Augmentation

# Display Augmented Images

# ViT Feature Extractor

# Train Model + Evaluation + Visualization

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Load extracted ViT features and labels
X_vit = np.load('/kaggle/working/X_vit_features.npy')
y_vit = np.load('/kaggle/working/y_vit_labels.npy')

# Check shapes and balance of dataset
print(f"X_vit shape: {X_vit.shape}, y_vit shape: {y_vit.shape}")
print(f"Unique labels in y_vit: {np.unique(y_vit, return_counts=True)}")

# Split data (with stratify and fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X_vit, y_vit, test_size=0.2, random_state=42, stratify=y_vit
)

# Save test set for future evaluation (optional)
np.save('/kaggle/working/X_test.npy', X_test)
np.save('/kaggle/working/y_test.npy', y_test)

# Define Logistic Regression grid with class weights for imbalance handling
lr_params = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'class_weight': ['balanced', None]  # Added class weight to handle imbalance
}
lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), lr_params, cv=3, scoring='accuracy')
lr_grid.fit(X_train, y_train)
print("✅ Best Logistic Regression Params:", lr_grid.best_params_)

# Define XGBoost random grid with class weights for imbalance handling
xgb_params = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'scale_pos_weight': [1, 2, 5]  # For handling class imbalance in XGBoost
}
xgb_grid = RandomizedSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    xgb_params, cv=3, n_iter=10, scoring='accuracy', random_state=42
)
xgb_grid.fit(X_train, y_train)
print("✅ Best XGBoost Params:", xgb_grid.best_params_)

# Define MLP random grid.
# NOTE: MLPClassifier has no `class_weight` parameter; listing it in the search
# space makes every sampled candidate invalid and the whole search fails, so
# the MLP is tuned without imbalance weighting.
mlp_params = {
    'hidden_layer_sizes': [(128,), (256,), (256, 128)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001],
    'max_iter': [200, 500]
}
mlp_grid = RandomizedSearchCV(MLPClassifier(), mlp_params, cv=3, n_iter=10, scoring='accuracy', random_state=42)
mlp_grid.fit(X_train, y_train)
print("✅ Best MLP Params:", mlp_grid.best_params_)

# Create the ensemble model with VotingClassifier.
# Soft voting averages the members' predicted probabilities; it is required
# here because predict_proba (used below for ROC/AUC) is not available on a
# hard-voting ensemble. All three members implement predict_proba.
ensemble_model = VotingClassifier(
    estimators=[
        ('lr', lr_grid.best_estimator_),
        ('mlp', mlp_grid.best_estimator_),
        ('xgb', xgb_grid.best_estimator_)
    ], voting='soft'
)

# Train ensemble model (VotingClassifier refits clones of the tuned estimators)
ensemble_model.fit(X_train, y_train)

# Evaluate ensemble model
y_pred = ensemble_model.predict(X_test)
y_proba = ensemble_model.predict_proba(X_test)[:, 1]  # Probability of the second class, for ROC/AUC

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Benign', 'Malignant'], yticklabels=['Benign', 'Malignant'])
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Accuracy Score
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# F1 Score
print(f"F1 Score: {f1_score(y_test, y_pred)}")

# ROC Curve & AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc="lower right")
plt.show()

# Save the trained ensemble model for future use
joblib.dump(ensemble_model, '/kaggle/working/ensemble_model.joblib')
