In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                           f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Reshape
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset and separate features from labels
data = load_breast_cancer()
X = data.data
y = data.target

# Tabular view of the same data: one column per feature plus the label column
df = pd.DataFrame(X, columns=data.feature_names).assign(target=y)

print("Dataset Shape:", X.shape)
print("Class Distribution:")
# Count each class with a vectorised comparison instead of a Python-level sum
print("Benign (1):", (y == 1).sum())
print("Malignant (0):", (y == 0).sum())

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Test set shape: {X_test_scaled.shape}")

Dataset Shape: (569, 30)
Class Distribution:
Benign (1): 357
Malignant (0): 212
Training set shape: (455, 30)
Test set shape: (114, 30)


In [None]:
print("1. LOGISTIC REGRESSION")
print("=" * 50)

# fit() returns the estimator itself, so construction and training can be chained
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)

# Hard labels feed the threshold-based metrics; column 1 of predict_proba feeds AUC
lr_pred = lr_model.predict(X_test_scaled)
lr_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]

# Performance metrics: the four label-based scores share the same arguments
lr_accuracy, lr_precision, lr_recall, lr_f1 = (
    score_fn(y_test, lr_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
lr_auc = roc_auc_score(y_test, lr_pred_proba)

# One line per metric, emitted by a single print call
print(
    f"Accuracy: {lr_accuracy:.4f}",
    f"Precision: {lr_precision:.4f}",
    f"Recall: {lr_recall:.4f}",
    f"F1-Score: {lr_f1:.4f}",
    f"AUC-ROC: {lr_auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, lr_pred), sep="\n")


In [None]:
print("\n2. RANDOM FOREST")
print("=" * 50)

# Train a 100-tree forest on the scaled training split
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predicted labels, plus column 1 of the probability matrix for AUC
rf_pred = rf_model.predict(X_test_scaled)
rf_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Performance metrics
rf_accuracy, rf_precision = accuracy_score(y_test, rf_pred), precision_score(y_test, rf_pred)
rf_recall, rf_f1 = recall_score(y_test, rf_pred), f1_score(y_test, rf_pred)
rf_auc = roc_auc_score(y_test, rf_pred_proba)

print(
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1-Score: {rf_f1:.4f}",
    f"AUC-ROC: {rf_auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, rf_pred), sep="\n")

# Feature importance: pair each feature name with the forest's importance
# score and rank from most to least important
feature_importance = pd.DataFrame(
    dict(feature=data.feature_names, importance=rf_model.feature_importances_)
).sort_values(by='importance', ascending=False)

print("\nTop 10 Important Features:", feature_importance.head(10), sep="\n")

In [None]:
print("\n3. SUPPORT VECTOR MACHINE")
print("=" * 50)

# RBF-kernel SVM; probability=True is required for the predict_proba call below
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train_scaled, y_train)

svm_pred = svm_model.predict(X_test_scaled)
svm_pred_proba = svm_model.predict_proba(X_test_scaled)[:, 1]

# Performance metrics: label-based scores first, then the probability-based AUC
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    score_fn(y_test, svm_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
svm_auc = roc_auc_score(y_test, svm_pred_proba)

print(
    f"Accuracy: {svm_accuracy:.4f}",
    f"Precision: {svm_precision:.4f}",
    f"Recall: {svm_recall:.4f}",
    f"F1-Score: {svm_f1:.4f}",
    f"AUC-ROC: {svm_auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, svm_pred), sep="\n")
# Multi-Layer Perceptron (scikit-learn implementation)
print("\n4. MULTI-LAYER PERCEPTRON (NEURAL NETWORK)")
print("=" * 50)

# Two hidden layers of 100 and 50 units, trained for at most 1000 iterations
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=1000,
    random_state=42,
)
mlp_model.fit(X_train_scaled, y_train)

# Predicted labels, plus column 1 of the probability matrix for AUC
mlp_pred = mlp_model.predict(X_test_scaled)
mlp_pred_proba = mlp_model.predict_proba(X_test_scaled)[:, 1]

# Performance metrics
mlp_accuracy, mlp_precision = accuracy_score(y_test, mlp_pred), precision_score(y_test, mlp_pred)
mlp_recall, mlp_f1 = recall_score(y_test, mlp_pred), f1_score(y_test, mlp_pred)
mlp_auc = roc_auc_score(y_test, mlp_pred_proba)

print(
    f"Accuracy: {mlp_accuracy:.4f}",
    f"Precision: {mlp_precision:.4f}",
    f"Recall: {mlp_recall:.4f}",
    f"F1-Score: {mlp_f1:.4f}",
    f"AUC-ROC: {mlp_auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, mlp_pred), sep="\n")

In [None]:
# Deep Neural Network with TensorFlow
print("\n5. DEEP NEURAL NETWORK (TensorFlow)")
print("=" * 50)

# Seed Python's `random`, NumPy and TensorFlow together. tf.random.set_seed()
# only seeds TensorFlow's own generator, so anything Keras draws from the other
# two generators could still differ from run to run.
# NOTE(review): tf.keras.utils.set_random_seed needs TensorFlow 2.7+ - confirm
# the environment's version.
tf.keras.utils.set_random_seed(42)

# Build the model: three ReLU hidden layers (128 -> 64 -> 32), each followed by
# dropout, ending in a single sigmoid unit that outputs a score in [0, 1].
dnn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Train the model. validation_split=0.2 makes Keras hold out 20% of the
# training rows for validation; verbose=0 silences the per-epoch log.
history = dnn_model.fit(X_train_scaled, y_train,
                       epochs=100,
                       batch_size=32,
                       validation_split=0.2,
                       verbose=0)

# Make predictions: flatten the (n, 1) sigmoid output to 1-D scores, then
# threshold at 0.5 to obtain hard 0/1 labels.
dnn_pred_proba = dnn_model.predict(X_test_scaled).flatten()
dnn_pred = (dnn_pred_proba > 0.5).astype(int)

# Performance metrics (label-based scores use dnn_pred, AUC uses the raw scores)
dnn_accuracy = accuracy_score(y_test, dnn_pred)
dnn_precision = precision_score(y_test, dnn_pred)
dnn_recall = recall_score(y_test, dnn_pred)
dnn_f1 = f1_score(y_test, dnn_pred)
dnn_auc = roc_auc_score(y_test, dnn_pred_proba)

print(f"Accuracy: {dnn_accuracy:.4f}")
print(f"Precision: {dnn_precision:.4f}")
print(f"Recall: {dnn_recall:.4f}")
print(f"F1-Score: {dnn_f1:.4f}")
print(f"AUC-ROC: {dnn_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, dnn_pred))
# 1D CNN (adapted for tabular data)
print("\n6. 1D CONVOLUTIONAL NEURAL NETWORK")
print("=" * 50)

# Reshape data for CNN (treat features as sequence): add a trailing channel
# axis so each sample becomes (n_features, 1), matching Conv1D's input_shape.
X_train_cnn = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Seed Python's `random`, NumPy and TensorFlow together. tf.random.set_seed()
# only seeds TensorFlow's own generator, so anything Keras draws from the other
# two generators could still differ from run to run.
# NOTE(review): tf.keras.utils.set_random_seed needs TensorFlow 2.7+ - confirm
# the environment's version.
tf.keras.utils.set_random_seed(42)

# Build CNN model: two width-3 convolutions over the feature axis, a global
# max-pool that collapses that axis, then a small dense head with dropout and
# a single sigmoid output unit.
cnn_model = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_train_scaled.shape[1], 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(50, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

# Train the model. validation_split=0.2 makes Keras hold out 20% of the
# training rows for validation; verbose=0 silences the per-epoch log.
cnn_history = cnn_model.fit(X_train_cnn, y_train,
                           epochs=100,
                           batch_size=32,
                           validation_split=0.2,
                           verbose=0)

# Make predictions: flatten the (n, 1) sigmoid output to 1-D scores, then
# threshold at 0.5 to obtain hard 0/1 labels.
cnn_pred_proba = cnn_model.predict(X_test_cnn).flatten()
cnn_pred = (cnn_pred_proba > 0.5).astype(int)

# Performance metrics (label-based scores use cnn_pred, AUC uses the raw scores)
cnn_accuracy = accuracy_score(y_test, cnn_pred)
cnn_precision = precision_score(y_test, cnn_pred)
cnn_recall = recall_score(y_test, cnn_pred)
cnn_f1 = f1_score(y_test, cnn_pred)
cnn_auc = roc_auc_score(y_test, cnn_pred_proba)

print(f"Accuracy: {cnn_accuracy:.4f}")
print(f"Precision: {cnn_precision:.4f}")
print(f"Recall: {cnn_recall:.4f}")
print(f"F1-Score: {cnn_f1:.4f}")
print(f"AUC-ROC: {cnn_auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, cnn_pred))
# Create comparison dataframe: one row per model, one column per metric
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
model_rows = [
    ('Logistic Regression', lr_accuracy, lr_precision, lr_recall, lr_f1, lr_auc),
    ('Random Forest', rf_accuracy, rf_precision, rf_recall, rf_f1, rf_auc),
    ('SVM', svm_accuracy, svm_precision, svm_recall, svm_f1, svm_auc),
    ('MLP', mlp_accuracy, mlp_precision, mlp_recall, mlp_f1, mlp_auc),
    ('Deep NN', dnn_accuracy, dnn_precision, dnn_recall, dnn_f1, dnn_auc),
    ('1D CNN', cnn_accuracy, cnn_precision, cnn_recall, cnn_f1, cnn_auc),
]
results = pd.DataFrame(model_rows, columns=['Model'] + metric_columns)

banner = "="*80
print("\n" + banner)
print("MODEL COMPARISON RESULTS")
print(banner)
print(results.round(4))

# Find best model for each metric (idxmax returns the first row on ties)
print("\nBEST PERFORMING MODELS:")
print("-" * 30)
for metric in metric_columns:
    best_idx = results[metric].idxmax()
    best_row = results.loc[best_idx]
    best_model, best_score = best_row['Model'], best_row[metric]
    print(f"{metric}: {best_model} ({best_score:.4f})")

# Visualization: a 2x2 dashboard drawn through explicit Axes handles
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Row-major order matches subplot positions 1..4
ax_scores, ax_roc, ax_history, ax_importance = axes.ravel()

# Performance comparison: grouped bars, one group per model, one bar per metric
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
x = np.arange(len(results))
width = 0.15

for i, metric in enumerate(metrics):
    ax_scores.bar(x + i*width, results[metric], width, label=metric, alpha=0.8)

ax_scores.set_xlabel('Models')
ax_scores.set_ylabel('Score')
ax_scores.set_title('Model Performance Comparison')
# Ticks sit under the third of the five bars, i.e. the centre of each group
ax_scores.set_xticks(x + width*2)
ax_scores.set_xticklabels(results['Model'], rotation=45)
ax_scores.legend()
ax_scores.grid(True, alpha=0.3)

# ROC Curves: one curve per model from its test-set scores
models_data = [
    ('Logistic Regression', y_test, lr_pred_proba),
    ('Random Forest', y_test, rf_pred_proba),
    ('SVM', y_test, svm_pred_proba),
    ('MLP', y_test, mlp_pred_proba),
    ('Deep NN', y_test, dnn_pred_proba),
    ('1D CNN', y_test, cnn_pred_proba)
]

for name, y_true, y_prob in models_data:
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    auc = roc_auc_score(y_true, y_prob)
    ax_roc.plot(fpr, tpr, label=f'{name} (AUC={auc:.3f})', linewidth=2)

# Dashed diagonal as the chance-level reference
ax_roc.plot([0, 1], [0, 1], 'k--', alpha=0.5)
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves Comparison')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)

# Training history for deep learning models (train and validation accuracy)
history_curves = [
    (history.history['accuracy'], 'DNN Train Accuracy'),
    (history.history['val_accuracy'], 'DNN Val Accuracy'),
    (cnn_history.history['accuracy'], 'CNN Train Accuracy'),
    (cnn_history.history['val_accuracy'], 'CNN Val Accuracy'),
]
for curve, curve_label in history_curves:
    ax_history.plot(curve, label=curve_label, linewidth=2)

ax_history.set_xlabel('Epochs')
ax_history.set_ylabel('Accuracy')
ax_history.set_title('Deep Learning Models Training History')
ax_history.legend()
ax_history.grid(True, alpha=0.3)

# Feature importance (Random Forest): horizontal bars, largest at the top
top_features = feature_importance.head(10)
bar_positions = range(len(top_features))
ax_importance.barh(bar_positions, top_features['importance'])
ax_importance.set_yticks(bar_positions)
ax_importance.set_yticklabels(top_features['feature'])
ax_importance.set_xlabel('Importance')
ax_importance.set_title('Top 10 Feature Importance (Random Forest)')
ax_importance.invert_yaxis()

fig.tight_layout()
plt.show()

# Cross-validated accuracy for the scikit-learn models.
# No significance test is run in this section; `stats` is imported but unused
# here and is kept in case a later cell relies on it.
from scipy import stats
from sklearn.pipeline import make_pipeline

print("\n" + "="*50)
print("STATISTICAL ANALYSIS")
print("="*50)

# Cross-validation scores, keyed by model name (one array of 5 fold accuracies each)
cv_scores = {}
models_cv = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

for name, model in models_cv.items():
    # Scale inside the pipeline and cross-validate on the unscaled training
    # data, so each fold fits its scaler on that fold's training portion only.
    # Passing X_train_scaled would leak the validation folds' statistics,
    # because its scaler was fit on every training row.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores[name] = scores
    # The reported spread is two standard deviations of the fold accuracies
    print(f"{name}: CV Accuracy = {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

print("\nCross-Validation Results Summary:")
# One column per model, one row per fold
cv_df = pd.DataFrame(cv_scores)
print(cv_df.describe())