In [13]:

# -*- coding: utf-8 -*-
"""
Created on Sat Mar 30 15:25:48 2024
Updated for Supervised Learning: 1D CNN for Tabular Data with Hyperparameter Tuning,
Diagnostic Checks, Class Mapping, and Output Management in 'results' Folder.
Author: Fan Yang
"""

import os
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
import json
import kerastuner as kt  # Ensure keras-tuner is installed (pip install keras-tuner)
import matplotlib.pyplot as plt

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

# Define the results folder inside the project directory
results_dir = r"C:\Users\maily\Desktop\COMP263_Group_Project\results"
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

def load_and_prepare_data(csv_path):
    """Load dataset, drop non-informative columns, and map class values (2,4) to binary (0,1)."""
    df = pd.read_csv(csv_path)
    print("Data Head:\n", df.head())
    print("Columns:", df.columns.values)
    
    # Drop non-informative columns (e.g., ID)
    if 'ID' in df.columns:
        df = df.drop('ID', axis=1)
    
    # Map class labels from {2, 4} to {0, 1}
    if df['class'].nunique() == 2:
        df['class'] = df['class'].map({2: 0, 4: 1})
    
    return df

def feature_selection(df):
    """Select features and target from the dataframe."""
    feature_cols = ['thickness', 'size', 'shape', 'Marg', 'Epith', 'b1', 'nucleoli', 'Mitoses']
    X = df[feature_cols].values
    y = df['class'].values
    return X, y

def preprocess_data(X):
    """Standardize features and reshape for 1D CNN input."""
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Reshape to (samples, timesteps, channels) where timesteps equals number of features
    X_cnn = np.expand_dims(X_scaled, axis=2)
    return X_cnn, scaler

def build_hypermodel(hp):
    """Build a CNN model with hyperparameters to tune.
    
    Tunable parameters:
      - Number of filters and kernel sizes for two Conv1D layers.
      - Number of dense units.
      - Dropout rate.
      - Learning rate.
    """
    model = Sequential()
    
    # First Conv1D layer
    model.add(Conv1D(filters=hp.Int("filters_1", min_value=16, max_value=64, step=16),
                     kernel_size=hp.Choice("kernel_size_1", values=[3, 5]),
                     activation='relu',
                     input_shape=input_shape,
                     padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    
    # Second Conv1D layer
    model.add(Conv1D(filters=hp.Int("filters_2", min_value=32, max_value=128, step=32),
                     kernel_size=hp.Choice("kernel_size_2", values=[3, 5]),
                     activation='relu',
                     padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    
    model.add(Flatten())
    
    # Dense layer
    model.add(Dense(units=hp.Int("dense_units", min_value=32, max_value=128, step=32),
                    activation='relu'))
    model.add(Dropout(rate=hp.Float("dropout_rate", min_value=0.2, max_value=0.6, step=0.1)))
    
    # Output layer for binary classification
    model.add(Dense(1, activation='sigmoid'))
    
    # Choose optimizer learning rate
    lr = hp.Choice("learning_rate", values=[1e-3, 1e-4])
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

def save_model(model, model_save_path):
    """Save the trained model to a file with a valid extension (.keras or .h5)."""
    model.save(model_save_path)
    print("Model saved to", model_save_path)

# Main execution block
if __name__ == "__main__":
    csv_path = r"C:\Users\maily\Desktop\COMP263_Group_Project\breast_cancer.csv"
    # Save model in results folder with a valid extension (.keras)
    model_save_path = os.path.join(results_dir, "breast_cancer_cnn_model.keras")
    metrics_save_path = os.path.join(results_dir, "model_metrics.json")
    report_save_path = os.path.join(results_dir, "classification_report.txt")
    history_plot_path = os.path.join(results_dir, "training_history.png")
    
    # Load and prepare data
    df = load_and_prepare_data(csv_path)
    X, y = feature_selection(df)
    X_cnn, scaler = preprocess_data(X)
    
    # Split data into training and testing sets (stratify to maintain class balance)
    X_train, X_test, y_train, y_test = train_test_split(X_cnn, y, test_size=0.2, random_state=42, stratify=y)
    
    # Define global variable for input shape (used in hypermodel)
    input_shape = X_train.shape[1:]  # e.g., (8, 1)
    
    # Hyperparameter tuning with Keras Tuner
    tuner = kt.RandomSearch(
        build_hypermodel,
        objective='val_accuracy',
        max_trials=10,
        executions_per_trial=1,
        directory='kt_tuner_dir',
        project_name='breast_cancer_cnn'
    )
    
    tuner.search(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)
    
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print("Best hyperparameters:")
    print("filters_1:", best_hps.get("filters_1"))
    print("kernel_size_1:", best_hps.get("kernel_size_1"))
    print("filters_2:", best_hps.get("filters_2"))
    print("kernel_size_2:", best_hps.get("kernel_size_2"))
    print("dense_units:", best_hps.get("dense_units"))
    print("dropout_rate:", best_hps.get("dropout_rate"))
    print("learning_rate:", best_hps.get("learning_rate"))
    
    # Build the model with the best hyperparameters and train further if needed
    model = tuner.hypermodel.build(best_hps)
    model.summary()
    history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)
    
    # Plot training history
    plt.figure()
    plt.plot(history.history['accuracy'], label='train_accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.title('Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.savefig(history_plot_path)
    plt.close()
    print("Training history plot saved to", history_plot_path)
    
    # Evaluate the model
    y_pred_prob = model.predict(X_test)
    # Convert probabilities to binary predictions using threshold 0.5
    y_pred = (y_pred_prob > 0.5).astype("int32").reshape(-1)
    
    # Diagnostic: Print distribution of true and predicted labels
    unique_y_test, counts_y_test = np.unique(y_test, return_counts=True)
    unique_y_pred, counts_y_pred = np.unique(y_pred, return_counts=True)
    print("True label distribution:", dict(zip(unique_y_test, counts_y_test)))
    print("Predicted label distribution:", dict(zip(unique_y_pred, counts_y_pred)))
    
    # Generate classification report and save to file
    report = classification_report(y_test, y_pred, zero_division=0)
    print("Classification Report:\n", report)
    with open(report_save_path, 'w') as f:
        f.write(report)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    
    print("Test Accuracy:", accuracy)
    print("Test Precision:", precision)
    print("Test Recall:", recall)
    
    # Save the trained model
    save_model(model, model_save_path)
    
    # Save evaluation metrics and best hyperparameters to a JSON file
    evaluation_metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'best_hyperparameters': best_hps.values
    }
    
    with open(metrics_save_path, 'w') as f:
        json.dump(evaluation_metrics, f)
    
    print("Evaluation metrics saved to", metrics_save_path)


Data Head:
         ID  thickness  size  shape  Marg  Epith bare  b1  nucleoli  Mitoses  \
0  1000025          5     1      1     1      2    1   3         1        1   
1  1002945          5     4      4     5      7   10   3         2        1   
2  1015425          3     1      1     1      2    2   3         1        1   
3  1016277          6     8      8     1      3    4   3         7        1   
4  1017023          4     1      1     3      2    1   3         1        1   

   class  
0      2  
1      2  
2      2  
3      2  
4      2  
Columns: ['ID' 'thickness' 'size' 'shape' 'Marg' 'Epith' 'bare' 'b1' 'nucleoli'
 'Mitoses' 'class']
Reloading Tuner from kt_tuner_dir\breast_cancer_cnn\tuner0.json
Best hyperparameters:
filters_1: 32
kernel_size_1: 5
filters_2: 128
kernel_size_2: 5
dense_units: 32
dropout_rate: 0.2
learning_rate: 0.001


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.7849 - loss: 0.4794 - val_accuracy: 0.9643 - val_loss: 0.4818
Epoch 2/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9768 - loss: 0.0844 - val_accuracy: 0.9107 - val_loss: 0.4758
Epoch 3/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9782 - loss: 0.0758 - val_accuracy: 0.9464 - val_loss: 0.4440
Epoch 4/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9782 - loss: 0.0562 - val_accuracy: 0.9464 - val_loss: 0.4363
Epoch 5/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9887 - loss: 0.0421 - val_accuracy: 0.9464 - val_loss: 0.4119
Epoch 6/50
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9853 - loss: 0.0369 - val_accuracy: 0.9464 - val_loss: 0.3920
Epoch 7/50
[1m16/16[0m [32m━━━━━━━━━