In [None]:
import pandas as pd

In [None]:

# Read the metadata table; the exploratory cells that follow all work on `df`.
df = pd.read_csv('metadata.csv')

# Sanity check: (rows, columns) first, then a preview of the leading rows.
print("Dataset shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")


Null Value Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Columns left out of the missing-value overview. 'diagnostic' is not listed,
# so the target column stays in the plot.
columns_to_exclude = [
    'patient_id', 'lesion_id', 'age', 'background_mother', 'background_father',
    'gender', 'img_id', 'region', 'biopsed',
]

# errors="ignore" skips any listed name that is not actually present in df.
df_filtered = df.drop(columns=columns_to_exclude, errors="ignore")

# Figures are written under ./plots; make sure the directory is there.
os.makedirs("plots", exist_ok=True)

# One heatmap cell per (row, column) value, coloured by whether it is missing.
plt.figure(figsize=(10, 6))
sns.heatmap(df_filtered.isna(), cmap="viridis", cbar=False)
plt.title("Missing Values Heatmap (Filtered Dataset)")
plt.savefig("plots/missing_values_filtered.png")
plt.show()

In [None]:
# Keep only rows without any missing value, reporting the frame's shape on
# either side of the drop.
print("Dataset shape before dropping null values:", df.shape)
df = df.dropna(axis=0, how="any")
print("Dataset shape after dropping null values:", df.shape)

# Per-column count of nulls that remain after the drop.
print("\nRemaining null values per column:", df.isna().sum(), sep="\n")


In [None]:
# Columns of df stored as 64-bit integers or floats.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns

# One histogram (30 bins, KDE overlay) per numeric column; each figure is
# saved as plots/histogram_<column>.png before it is displayed.
for col in num_cols:
    plt.figure(figsize=(8, 5))
    sns.histplot(df[col], bins=30, kde=True)
    plt.title(f"Histogram of {col}")
    plt.savefig(f"plots/histogram_{col}.png")
    plt.show()

In [None]:
# Numeric (64-bit int/float) columns of df; the correlation-matrix cell reads
# `num_cols`.
#
# NOTE: this cell previously repeated the histogram loop of the preceding cell
# verbatim. Running it only re-rendered identical figures and overwrote the
# same plots/histogram_<column>.png files, so the loop is not repeated here.
num_cols = df.select_dtypes(include=["int64", "float64"]).columns

In [None]:
# Pairwise correlations between the numeric columns listed in `num_cols`
# (DataFrame.corr with its default method).
corr = df[num_cols].corr()

# Annotated heatmap of the matrix, saved to plots/ and then displayed.
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation Matrix of Numerical Features")
plt.savefig("plots/correlation_matrix.png")
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd

# Build a train/test split from `df` (rows with nulls were dropped earlier).
# Work on a copy so the exploratory frame is left untouched.
df_processed = df.copy()

# Identifier columns are removed before modelling, matching the `drop_cols`
# used by the model cells further down. Left in, they would be label-encoded
# into arbitrary integers and fed to the model as if they were measurements.
# errors="ignore" tolerates identifiers that are not present.
id_cols = ["patient_id", "lesion_id", "img_id"]
df_processed = df_processed.drop(columns=id_cols, errors="ignore")

# Encode categorical variables: every object/bool column except the target.
# errors='ignore' keeps this working even if 'diagnostic' is not object/bool.
categorical_cols = df_processed.select_dtypes(include=['object', 'bool']).columns
categorical_cols = categorical_cols.drop('diagnostic', errors='ignore')

label_encoders = {}  # column name -> fitted LabelEncoder, so mappings can be looked up later
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) gives the encoder one uniform type to sort (bools become 'True'/'False').
    df_processed[col] = le.fit_transform(df_processed[col].astype(str))
    label_encoders[col] = le

# Feature (X) and Target (y) separation
X = df_processed.drop(columns="diagnostic")
y = df_processed["diagnostic"]

# Encode target variable as integers; target_encoder.classes_ maps them back.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

# 70/30 split, stratified so both parts keep the class proportions of y.
# The split happens BEFORE scaling so the scaler never sees test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded
)

# Fit the scaler on the training rows only, then apply the same mean/std to
# the test rows. Fitting on the full table would leak test-set statistics
# into training and make the evaluation optimistic.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Kept for backward compatibility: the full feature table, scaled with the
# train-fitted scaler.
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)
print("Class distribution in training set:")
print(pd.Series(y_train).value_counts())
print("Class distribution in testing set:")
print(pd.Series(y_test).value_counts())
print("Feature columns:", list(X.columns))

Model Training


In [None]:
# Logistic Regression Model

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset (re-read from disk so this cell does not depend on the
# exploratory `df`, which had every row with a null removed).
data = pd.read_csv("metadata.csv")

# Drop identifier columns
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols)

# Remove rows with missing target
df = df.dropna(subset=["diagnostic"])

# Features and target
X = df.drop(columns=["diagnostic"])
y = df["diagnostic"]

# Encode categorical variables as integer codes.
# Note: astype(str) also stringifies missing values, so in these columns a
# missing entry becomes its own category instead of being imputed below.
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Encode target; target_encoder.classes_[i] is the label behind integer i.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

# Fill remaining missing values with the column median
X = X.fillna(X.median(numeric_only=True))

# Train-test split (80/20, stratified on the encoded target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Scale features: statistics come from the training rows only and are then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred = log_reg.predict(X_test_scaled)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# Plot Confusion Matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_encoder.classes_,
            yticklabels=target_encoder.classes_)
plt.title('Confusion Matrix - Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature Importance Plot
# coef_ has one row per class for a multiclass fit, so coef_[0] alone would
# rank features by their weight for the first class only. Averaging the
# absolute coefficients over all rows summarises every class; for a binary
# fit (a single row) this equals abs(coef_[0]).
feature_importance = abs(log_reg.coef_).mean(axis=0)
feature_names = X.columns

# Create DataFrame for easier plotting
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(data=importance_df.head(15), x='importance', y='feature')
plt.title('Top 15 Feature Importance - Logistic Regression')
plt.xlabel('Mean Absolute Coefficient Value (across classes)')
plt.tight_layout()
plt.show()


In [None]:
# Decision Tree Classifier with Diagram

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset
data = pd.read_csv("metadata.csv")

# Drop identifier columns
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols)

# Remove rows with missing target
df = df.dropna(subset=["diagnostic"])

# Features and target
X = df.drop(columns=["diagnostic"])
y = df["diagnostic"]

# Encode categorical variables as integer codes
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Encode target. The encoder is kept (rather than discarded) so the tree
# diagram can show the original diagnostic labels.
le_y = LabelEncoder()
y = le_y.fit_transform(y)

# Fill missing values with median
X = X.fillna(X.median(numeric_only=True))

# Train-test split (80/20, stratified)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42, max_depth=4)  # limit depth for clarity
dt_model.fit(X_train, y_train)

# Predictions
y_pred = dt_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)

# Plot Decision Tree
# - class_names must follow the order of dt_model.classes_; mapping each
#   encoded class back through le_y yields the real labels in that order
#   (the previous set(y) gave bare integers in set-iteration order).
# - feature_names is passed as a plain list because some scikit-learn
#   releases reject a pandas Index for this parameter.
plt.figure(figsize=(20,10))
plot_tree(dt_model,
          feature_names=list(X.columns),
          class_names=[str(le_y.classes_[c]) for c in dt_model.classes_],
          filled=True, rounded=True, fontsize=10)
plt.show()



In [None]:
# KNN Classifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Fresh copy of the raw table for this model.
data = pd.read_csv("metadata.csv")

# Strip the identifier columns, then discard rows that have no diagnostic label.
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols).dropna(subset=["diagnostic"])

# Target column vs. everything else.
y = df["diagnostic"]
X = df.drop(columns="diagnostic")

# Replace each object/bool column with integer codes (values are stringified first).
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Integer-encode the target as well.
y = LabelEncoder().fit_transform(y)

# Any remaining gaps are filled with the column median.
X = X.fillna(X.median(numeric_only=True))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# KNN is distance based, so features are standardised; the scaler learns its
# statistics from the training rows and is only applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit with k = 5 neighbours and score on the held-out rows.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
y_pred = knn_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy:", accuracy)


In [None]:
# Random Forest Classifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fresh copy of the raw table for this model.
data = pd.read_csv("metadata.csv")

# Strip the identifier columns, then discard rows that have no diagnostic label.
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols).dropna(subset=["diagnostic"])

# Target column vs. everything else.
y = df["diagnostic"]
X = df.drop(columns="diagnostic")

# Replace each object/bool column with integer codes (values are stringified first).
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Integer-encode the target; label_encoder.classes_ recovers the label names.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Any remaining gaps are filled with the column median.
X = X.fillna(X.median(numeric_only=True))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42
)

# 100-tree forest trained on the unscaled features.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Headline accuracy followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix: rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.title('Random Forest - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Importances reported by the fitted forest, sorted from largest to smallest.
features = X.columns
feature_importance = rf_model.feature_importances_
importance_df = pd.DataFrame({
    'feature': features,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

# Horizontal bars; the y axis is flipped so the largest importance is on top.
plt.figure(figsize=(12, 8))
bar_positions = range(len(importance_df))
plt.barh(bar_positions, importance_df['importance'])
plt.yticks(bar_positions, importance_df['feature'])
plt.xlabel('Feature Importance')
plt.title('Random Forest - Feature Importance')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()


In [None]:
# Support Vector Machine (SVM) with Diagram

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Fresh copy of the raw table for this model.
data = pd.read_csv("metadata.csv")

# Strip the identifier columns, then discard rows that have no diagnostic label.
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols).dropna(subset=["diagnostic"])

# Target column vs. everything else.
y = df["diagnostic"]
X = df.drop(columns="diagnostic")

# Replace each object/bool column with integer codes (values are stringified first).
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Integer-encode the target; le_y.classes_ recovers the label names.
le_y = LabelEncoder()
y = le_y.fit_transform(y)

# Any remaining gaps are filled with the column median.
X = X.fillna(X.median(numeric_only=True))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF-kernel SVM on the full scaled feature set.
svm_model = SVC(kernel="rbf")
svm_model.fit(X_train_scaled, y_train)
y_pred = svm_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)

# ---- Visualization with PCA (2D projection) ----
# The picture comes from a second SVM fitted on the first two principal
# components of the training data; it is a separate model from `svm_model`.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)

svm_2d = SVC(kernel="rbf")
svm_2d.fit(X_pca, y_train)

# 300 x 300 grid covering the projected points with a margin of 1 on each side.
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]
x_min, x_max = pc1.min() - 1, pc1.max() + 1
y_min, y_max = pc2.min() - 1, pc2.max() + 1
xx, yy = np.meshgrid(
    np.linspace(x_min, x_max, 300),
    np.linspace(y_min, y_max, 300),
)

# Predicted class for every grid node, reshaped back onto the grid.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = svm_2d.predict(grid_points).reshape(xx.shape)

# Filled regions show the 2-D model's prediction; dots are the training points.
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
plt.scatter(pc1, pc2, c=y_train, cmap=plt.cm.coolwarm, edgecolors="k")
plt.title("SVM Decision Boundary (PCA-reduced data)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()


In [None]:
# Final Confusion Matrix for Best Performing Model
#
# NOTE(review): this cell reads `model_predictions`, `model_accuracies` and
# `best_model_name`, but no cell visible in this notebook assigns them, and
# 'Gradient Boosting' is never trained here. On a fresh kernel the cell raises
# NameError until they are defined -- confirm where they are meant to come
# from. From the way they are used below, `model_predictions` presumably maps
# model name -> predicted labels for the current test split, and
# `model_accuracies` maps model name -> accuracy.

# Tick labels must come from the TARGET encoder. `le` is whatever encoder the
# last feature-encoding loop left behind, so its classes_ belong to a feature
# column; `le_y` is the encoder fitted on 'diagnostic' in the SVM cell.
class_labels = le_y.classes_

# Create a 2x2 subplot layout for confusion matrices
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Get all model predictions for confusion matrices
models_to_plot = ['Random Forest', 'Logistic Regression', 'SVM', 'Gradient Boosting']

for idx, model_name in enumerate(models_to_plot):
    row = idx // 2
    col = idx % 2

    if model_name not in model_predictions:
        # No predictions for this model: hide its panel instead of leaving
        # an empty set of axes in the figure.
        axes[row, col].axis('off')
        continue

    # Create confusion matrix
    cm = confusion_matrix(y_test, model_predictions[model_name])

    # Plot confusion matrix
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=class_labels, yticklabels=class_labels,
               ax=axes[row, col])

    axes[row, col].set_title(f'{model_name} - Confusion Matrix\nAccuracy: {model_accuracies[model_name]:.3f}',
                            fontsize=12, fontweight='bold')
    axes[row, col].set_xlabel('Predicted Label')
    axes[row, col].set_ylabel('True Label')

plt.tight_layout()
plt.show()

# Final Best Model Confusion Matrix (Larger)
plt.figure(figsize=(10, 8))
cm_best = confusion_matrix(y_test, model_predictions[best_model_name])

# Row-normalised version (share of each true class per predicted class).
# It is computed for reference only; the heatmap below shows raw counts.
cm_normalized = cm_best.astype('float') / cm_best.sum(axis=1)[:, np.newaxis]

# Plot the confusion matrix
sns.heatmap(cm_best, annot=True, fmt='d', cmap='Blues',
           xticklabels=class_labels, yticklabels=class_labels,
           cbar_kws={'label': 'Count'})

plt.title(f'Final Confusion Matrix - {best_model_name}\nAccuracy: {model_accuracies[best_model_name]:.3f}',
          fontsize=16, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.show()

# Print confusion matrix with class-wise metrics
print(f"\nFinal Confusion Matrix for {best_model_name}:")
print("="*60)
print(f"Classes: {class_labels}")
print(f"\nConfusion Matrix:")
print(cm_best)

# Calculate per-class metrics (one-vs-rest view of the multiclass matrix)
print(f"\nPer-Class Performance Metrics:")
print("-"*40)
for i, class_name in enumerate(class_labels):
    tp = cm_best[i, i]                  # predicted i and truly i
    fp = cm_best[:, i].sum() - tp       # predicted i but another class
    fn = cm_best[i, :].sum() - tp       # truly i but predicted otherwise
    tn = cm_best.sum() - tp - fp - fn   # everything not involving class i

    # Guard each ratio against an empty denominator.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    print(f"{class_name}:")
    print(f"  Precision: {precision:.3f}")
    print(f"  Recall: {recall:.3f}")
    print(f"  Specificity: {specificity:.3f}")
    print(f"  True Positives: {tp}")
    print(f"  False Positives: {fp}")
    print(f"  False Negatives: {fn}")
    print("")


In [None]:
# Compare 5 Models and Show Accuracy in Histogram with Annotations

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import matplotlib.pyplot as plt

# Fresh copy of the raw table for the comparison.
data = pd.read_csv("metadata.csv")

# Strip the identifier columns, then discard rows that have no diagnostic label.
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols).dropna(subset=["diagnostic"])

# Target column vs. everything else.
y = df["diagnostic"]
X = df.drop(columns="diagnostic")

# Replace each object/bool column with integer codes (values are stringified first).
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Integer-encode the target.
y = LabelEncoder().fit_transform(y)

# Any remaining gaps are filled with the column median.
X = X.fillna(X.median(numeric_only=True))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardised copies for the models that are trained on scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The five classifiers under comparison, keyed by display name.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# These are fitted/evaluated on the scaled matrices; the tree-based models use
# the raw features.
needs_scaling = {"Logistic Regression", "KNN", "SVM"}

# Fit every model and record its test accuracy.
results = {}
for name, model in models.items():
    if name in needs_scaling:
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        train_features, test_features = X_train, X_test
    model.fit(train_features, y_train)
    preds = model.predict(test_features)
    results[name] = accuracy_score(y_test, preds)

print("Model Accuracies:", results)

# Bar chart of the accuracies, one colour per model.
plt.figure(figsize=(8, 6))
bars = plt.bar(
    list(results.keys()),
    list(results.values()),
    color=['blue', 'green', 'orange', 'red', 'purple'],
)
plt.ylabel("Accuracy")
plt.title("Accuracy Comparison of 5 Models")
plt.ylim(0, 1)

# Write each accuracy (two decimals) just above its bar.
for bar in bars:
    yval = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width()/2
    plt.text(bar_centre, yval + 0.02, f"{yval:.2f}",
             ha='center', va='bottom', fontsize=10, fontweight="bold")

plt.show()


In [None]:
# Confusion Matrices for All 5 Models in One Figure

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import pandas as pd
import matplotlib.pyplot as plt

# Fresh copy of the raw table.
data = pd.read_csv("metadata.csv")

# Strip the identifier columns, then discard rows that have no diagnostic label.
drop_cols = ["patient_id", "lesion_id", "img_id"]
df = data.drop(columns=drop_cols).dropna(subset=["diagnostic"])

# Target column vs. everything else.
y = df["diagnostic"]
X = df.drop(columns="diagnostic")

# Replace each object/bool column with integer codes (values are stringified first).
for col in X.select_dtypes(include=["object", "bool"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Integer-encode the target; class_names[i] is the label behind integer i.
le_y = LabelEncoder()
y = le_y.fit_transform(y)
class_names = le_y.classes_

# Any remaining gaps are filled with the column median.
X = X.fillna(X.median(numeric_only=True))

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardised copies for the models that are trained on scaled inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The five classifiers, keyed by display name. Later cells reuse this dict
# together with the split and scaled matrices defined above.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# 2 x 3 grid, flattened so panel `idx` belongs to the idx-th model.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()

scaled_model_names = ("Logistic Regression", "KNN", "SVM")
for idx, (name, model) in enumerate(models.items()):
    # Scaled inputs for the three listed models, raw features for the rest.
    if name in scaled_model_names:
        fit_features, eval_features = X_train_scaled, X_test_scaled
    else:
        fit_features, eval_features = X_train, X_test
    model.fit(fit_features, y_train)
    preds = model.predict(eval_features)

    cm = confusion_matrix(y_test, preds)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=axes[idx], cmap="Blues", colorbar=False)
    axes[idx].set_title(f"{name}")

# Five models on six panels: remove the panel that was not used.
for unused_ax in axes[len(models):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()


In [None]:
# F1-Score Analysis for All Diagnostic Categories

# First, let's get F1-scores for each diagnostic category (not just weighted average)
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import numpy as np

# Per-class F1 for every model in `models`, shown as a heatmap, a grouped bar
# chart and printed summaries. Reads `models`, `class_names`, the train/test
# split and the scaled matrices defined by the confusion-matrix cell above.
print("F1-Score Analysis by Diagnostic Category")
print("=" * 50)

# Use the decoded diagnostic labels rather than the encoded integers, so the
# printout and both charts are readable. class_names[i] is the label that the
# target encoder mapped to integer i.
diagnostic_categories = [str(label) for label in class_names]
label_ids = np.arange(len(diagnostic_categories))
print(f"Diagnostic Categories: {diagnostic_categories}")

# Calculate F1-scores for each category for each model
f1_scores_by_category = {}

for name, model in models.items():
    if name in ["Logistic Regression", "KNN", "SVM"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # labels=label_ids pins one score per class, in class order, even if a
    # class happens to be absent from both y_test and y_pred.
    f1_per_category = f1_score(y_test, y_pred, labels=label_ids,
                               average=None, zero_division=0)
    f1_scores_by_category[name] = f1_per_category

    print(f"\n{name}:")
    for i, category in enumerate(diagnostic_categories):
        print(f"  {category}: {f1_per_category[i]:.4f}")

# Create visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 8))

# Plot 1: F1-Score Heatmap (rows = categories, columns = models)
f1_df = pd.DataFrame(f1_scores_by_category, index=diagnostic_categories)
im = ax1.imshow(f1_df.values, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)

# Add text annotations
for i in range(len(diagnostic_categories)):
    for j in range(len(f1_df.columns)):
        ax1.text(j, i, f'{f1_df.iloc[i, j]:.3f}',
                 ha="center", va="center", color="black", fontweight='bold')

ax1.set_xticks(range(len(f1_df.columns)))
ax1.set_yticks(range(len(diagnostic_categories)))
ax1.set_xticklabels(f1_df.columns, rotation=45, ha='right')
ax1.set_yticklabels(diagnostic_categories)
ax1.set_title('F1-Score Heatmap by Model and Diagnostic Category', fontsize=14, fontweight='bold')

# Add colorbar
cbar = plt.colorbar(im, ax=ax1, shrink=0.8)
cbar.set_label('F1-Score', rotation=270, labelpad=15)

# Plot 2: Bar Chart Comparison (one group per category, one bar per model)
x = np.arange(len(diagnostic_categories))
width = 0.15

colors = plt.cm.Set3(np.linspace(0, 1, len(models)))

for i, (model_name, f1_scores) in enumerate(f1_scores_by_category.items()):
    offset = width * i
    bars = ax2.bar(x + offset, f1_scores, width, label=model_name, color=colors[i], alpha=0.8)

    # Add value labels on bars
    for bar, score in zip(bars, f1_scores):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{score:.3f}', ha='center', va='bottom', fontsize=8)

ax2.set_xlabel('Diagnostic Categories', fontweight='bold')
ax2.set_ylabel('F1-Score', fontweight='bold')
ax2.set_title('F1-Score Comparison by Diagnostic Category', fontsize=14, fontweight='bold')
# Centre each tick under its group of bars, whatever the number of models
# (equals x + width * 2 for five models).
ax2.set_xticks(x + width * (len(f1_scores_by_category) - 1) / 2)
ax2.set_xticklabels(diagnostic_categories, rotation=45, ha='right')
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax2.set_ylim(0, 1.1)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics
print(f"\nF1-Score Summary Statistics:")
print("=" * 40)
f1_summary = pd.DataFrame(f1_scores_by_category, index=diagnostic_categories)
print(f"Average F1-Score per Model:")
# `model_name` (not `model`) is used as the loop variable so the estimator
# object left in `model` is not replaced by a string.
for model_name in f1_summary.columns:
    avg_f1 = f1_summary[model_name].mean()
    std_f1 = f1_summary[model_name].std()
    print(f"  {model_name}: {avg_f1:.4f} (±{std_f1:.4f})")

print(f"\nAverage F1-Score per Diagnostic Category:")
for category in f1_summary.index:
    avg_f1 = f1_summary.loc[category].mean()
    std_f1 = f1_summary.loc[category].std()
    print(f"  {category}: {avg_f1:.4f} (±{std_f1:.4f})")

# Find best performing model for each category
print(f"\nBest Performing Model per Category:")
for category in diagnostic_categories:
    best_model = f1_summary.loc[category].idxmax()
    best_score = f1_summary.loc[category].max()
    print(f"  {category}: {best_model} (F1={best_score:.4f})")


In [None]:
# System Analysis - Model Performance Metrics

# Calculate detailed performance metrics for all models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Accuracy, weighted precision/recall/F1 and error rate for every model in
# `models`, printed per model, tabulated, and drawn as grouped bars. Reads
# `models`, the train/test split and the scaled matrices from earlier cells.
print("System Performance Analysis")
print("=" * 60)

# Store results for comparison: model name -> {metric name -> value}
results = {}

for name, model in models.items():
    if name in ["Logistic Regression", "KNN", "SVM"]:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate metrics. zero_division=0 is passed to all three weighted
    # scores (it was previously set for precision only), so an undefined
    # per-class score is counted as 0 without emitting a warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # "Loss" here is simply the error rate (share of misclassified test rows).
    loss = 1 - accuracy

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Loss': loss
    }

    print(f"\n{name}:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  Loss:      {loss:.4f} ⚠️")

# Create comparison DataFrame (one row per model, one column per metric)
results_df = pd.DataFrame(results).T
print(f"\nModel Performance Comparison:")
print("-" * 50)
print(results_df.round(4))

# Find best and worst performing models
best_model = results_df['Accuracy'].idxmax()
best_accuracy = results_df['Accuracy'].max()
worst_model = results_df['Accuracy'].idxmin()
worst_accuracy = results_df['Accuracy'].min()
highest_loss = results_df['Loss'].max()

print(f"\nBest Performing Model: {best_model}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"\nWorst Performing Model: {worst_model}")
print(f"Worst Accuracy: {worst_accuracy:.4f}")
print(f"Highest Loss: {highest_loss:.4f} ⚠️")

# Highlight models with high loss (>10%)
print(f"\nModels with High Loss (>10%):")
print("-" * 30)
high_loss_models = results_df[results_df['Loss'] > 0.1]
if not high_loss_models.empty:
    for model_name in high_loss_models.index:
        loss_val = high_loss_models.loc[model_name, 'Loss']
        print(f"  {model_name}: {loss_val:.4f} ⚠️ HIGH LOSS")
else:
    print("  No models with loss >10% 👍")

# Visualize performance comparison including losses
plt.figure(figsize=(12, 8))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Loss']
colors = ['green', 'blue', 'orange', 'purple', 'red']
x = np.arange(len(results_df.index))
width = 0.15

for i, metric in enumerate(metrics):
    bars = plt.bar(x + i*width, results_df[metric], width, label=metric, alpha=0.8, color=colors[i])

    # Highlight high loss values
    if metric == 'Loss':
        for j, bar in enumerate(bars):
            if results_df[metric].iloc[j] > 0.1:
                bar.set_edgecolor('red')
                bar.set_linewidth(3)
                plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                        '⚠️', ha='center', va='bottom', fontsize=12, color='red')

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison - System Analysis (Losses Highlighted)')
# Centre each tick under its group of bars (equals x + width*2 for five metrics).
plt.xticks(x + width * (len(metrics) - 1) / 2, results_df.index, rotation=45)
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Analyze Random Forest Performance and Feature Importance

# Random Forest is particularly well-suited for this skin cancer detection task because:
# Rationale text shown above the analysis, one numbered point per line.
print("Why Random Forest is ideal for skin cancer detection:")
print("=" * 60)
for reason in (
    "1. Handles mixed data types well (numerical and categorical features)",
    "2. Robust to outliers and missing values",
    "3. Provides feature importance rankings",
    "4. Reduces overfitting through ensemble learning",
    "5. Works well with high-dimensional medical data",
    "6. No need for feature scaling",
    "7. Can capture non-linear relationships between features",
):
    print(reason)

# Reuse the forest stored in `models`; it is expected to have been fitted by
# an earlier cell, since this cell does not call fit itself.
rf_model = models["Random Forest"]

# Importance per feature column, largest first.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# Size of each class in the full encoded target `y`.
print(f"\nTarget Variable: diagnostic")
print("Classes:", class_names)
print(f"Total samples: {len(y)}")
print(f"Class distribution:")
for i, class_name in enumerate(class_names):
    count = (y == i).sum()
    percentage = (count / len(y)) * 100
    print(f"  {class_name}: {count} samples ({percentage:.1f}%)")

# Ten highest-ranked features as an aligned text table.
print(f"\nTop 10 Most Important Features for Random Forest:")
print("-" * 50)
top_ten = feature_importance.head(10)
for feature_name, importance_value in zip(top_ten['feature'], top_ten['importance']):
    print(f"{feature_name:<25}: {importance_value:.4f}")

# Horizontal bar chart of the top 15; the y axis is flipped so the largest is on top.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))
plt.figure(figsize=(12, 8))
plt.barh(bar_positions, top_features['importance'])
plt.yticks(bar_positions, top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Feature Importances - Random Forest Model')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Hard predictions and per-class probabilities on the held-out rows.
rf_preds = rf_model.predict(X_test)
rf_probs = rf_model.predict_proba(X_test)

# Accuracy plus the per-class report for the forest alone.
from sklearn.metrics import classification_report, accuracy_score
rf_accuracy = accuracy_score(y_test, rf_preds)
print(f"\nRandom Forest Model Performance:")
print(f"Accuracy: {rf_accuracy:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, rf_preds, target_names=class_names))
