In [35]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
import time
prog_start = time.time()
data = pd.read_csv('breastcancer.csv') 

features_selected = ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean', 
                     'concavity_mean', 'symmetry_mean', 'radius_se', 'concave points_se', 
                     'smoothness_worst', 'compactness_worst', 'concavity_worst', 
                     'symmetry_worst', 'fractal_dimension_worst'] # Features that were highly correlated when feature selection was done


X1 = data[features_selected]
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})  # Convert labels to binary
y1 = data['diagnosis']


X1 = X1.dropna()
y1 = y1.dropna()

X1 = X1.loc[y1.index] # make sure X & y line up together
RF_AUCs = []
timerf = []
for seed in range(1,201):
    rf_start = time.time()
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=seed)

    best_rf = RandomForestClassifier(
        bootstrap=False,
        max_depth=None,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=200,
        random_state=seed
    )
    # Fit the model
    best_rf.fit(X_train1, y_train1)

    # Predict and evaluate the model
    y_pred_best_rf = best_rf.predict(X_test1)

    #metric
    y_pred_probs_rf = best_rf.predict_proba(X_test1)[:,1]
    roc_auc_rf = roc_auc_score(y_test1, y_pred_probs_rf)
    timerf.append(time.time()-rf_start)
    RF_AUCs.append(roc_auc_rf)


######################################################-ANN-N############################################################
X2 = data[features_selected]
y2 = data['diagnosis']
ANN_AUCs = []
timea = []
for seed in range(1,201):
    # training and test sets
    a_start = time.time()
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=seed, stratify=y2)
    
    # scaler
    scaler = StandardScaler()
    X_train2 = scaler.fit_transform(X_train2)
    X_test2 = scaler.transform(X_test2)
    
    # ANN architechture
    def create_model(learning_rate=0.001, dropout_rate=0.3):
        model = Sequential([
            Input(shape=(X_train2.shape[1],)),  # Input layer
            Dense(64, activation='relu'),     # First hidden layer
            Dropout(dropout_rate),            # Dropout for regularization
            Dense(32, activation='relu'),     # Second hidden layer
            Dropout(dropout_rate),
            Dense(1, activation='sigmoid')    # Output layer for binary classification
        ])
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model
    
    # initial lr and training 
    learning_rate = 0.001
    dropout_rate = 0.3
    model = create_model(learning_rate=learning_rate, dropout_rate=dropout_rate)

    model.fit(
        X_train2, y_train2,
        epochs=40,           
        batch_size=32,       
        validation_split=0.2,     
        verbose=0
    )
    # metrics 
    y_pred_probs_ANN = model.predict(X_test2).flatten()  
    y_pred_ANN = (y_pred_probs_ANN > 0.5).astype(int)  
    # AUC-ROC
    roc_auc_ANN = roc_auc_score(y_test2, y_pred_probs_ANN)
    timea.append(time.time()-a_start)
    ANN_AUCs.append(roc_auc_ANN)
####################################-COMPARE-##################
total=0
rfwin=0
annwin=0
equal=0
for r, a in zip(RF_AUCs,ANN_AUCs):
    if r > a:
        rfwin += 1
    elif r == a:
        equal += 1
    else:
        annwin += 1
    print(f'RF vs. ANN: {r:.6f} vs {a:.6f}')
    total += 1

rfwinP = rfwin/total
annwinP = annwin/total
equalP = equal/total
print("how often a models AUC is better")
print(rfwinP)
print(annwinP)
print(equalP)
prog_total = time.time()-prog_start
print(f'average rf runtime: {sum(timerf)/len(timerf)}')
print(f'average ann runtime: {sum(timea)/len(timea)}')
print(f'total runtime: {prog_total}')



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9m