In [20]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import math
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
import time
import random
import tensorflow as tf

prog_start = time.time()
data = pd.read_csv('breastcancer.csv') 

features_selected = ['radius_mean', 'texture_mean', 'smoothness_mean', 'compactness_mean', 
                     'concavity_mean', 'symmetry_mean', 'radius_se', 'concave points_se', 
                     'smoothness_worst', 'compactness_worst', 'concavity_worst', 
                     'symmetry_worst', 'fractal_dimension_worst'] # Features that were highly correlated when feature selection was done


X1 = data[features_selected]
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0})  # Convert labels to binary
y1 = data['diagnosis']


X1 = X1.dropna()
y1 = y1.dropna()

X1 = X1.loc[y1.index] # make sure X & y line up together
RF_AUCs = []
timerf = [] 
ac_rf = []
for i in range(1,201):
    rf_start = time.time()
    random.seed(i)
    np.random.seed(i)
    tf.random.set_seed(i)
    #train test
    X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2)
    
    #define classifier with best parameters
    rf = RandomForestClassifier(
        bootstrap=False,
        max_depth=None,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=200
    )
    
    # Fit the model
    rf.fit(X_train1, y_train1)
    '''
    #Feature importance
    importance = rf.feature_importances_
    
    # Combine with feature names
    feature_importance_df = pd.DataFrame({
        'Feature': X1.columns,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)
    
    print(feature_importance_df)
    '''
    #metrics for Random Forest
    y_pred_probs_rf = rf.predict_proba(X_test1)[:,1] 
    roc_auc_rf = roc_auc_score(y_test1, y_pred_probs_rf)
    
    #roc and auc
    fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test1, y_pred_probs_rf)
    tpr_max_rf=tpr_rf.argmax()
    best_thresh_rf=thresholds_rf[tpr_max_rf]
    
    #apply threshold to probabilities
    y_pred_rf = (y_pred_probs_rf >= best_thresh_rf).astype(int) 
    timerf.append(time.time()-rf_start)
    #gives accuracy precision F1 of pred vs test
    ac_rf.append(accuracy_score(y_test1,y_pred_rf))
    RF_AUCs.append(roc_auc_rf)

######################################################-ANN-N############################################################
X2 = data[features_selected]
y2 = data['diagnosis']
ANN_AUCs = []
timea = []
ac_ann = []
for i in range(1,201):
    a_start = time.time() 
    print(i)
    random.seed(i)
    np.random.seed(i)
    tf.random.set_seed(i)
    # training and test sets
    X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, stratify=y2)
    
    # scaler
    scaler = StandardScaler()
    X_train2 = scaler.fit_transform(X_train2)
    X_test2 = scaler.transform(X_test2)
    
    # ANN architechture
    def create_model(learning_rate=0.001, dropout_rate=0.3):
        model = Sequential([
            Input(shape=(X_train2.shape[1],)),  # Input layer
            Dense(64, activation='relu'),     # First hidden layer
            Dropout(dropout_rate),            # Dropout for regularization
            Dense(32, activation='relu'),     # Second hidden layer
            Dropout(dropout_rate),
            Dense(1, activation='sigmoid')    # Output layer for binary classification
        ])
        model.compile(optimizer=Adam(learning_rate=learning_rate),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model
    
    # initial lr and training 
    learning_rate = 0.001
    dropout_rate = 0.3
    model = create_model(learning_rate=learning_rate, dropout_rate=dropout_rate)
    
    history = model.fit(
        X_train2, y_train2,
        epochs=60,           
        batch_size=32,       
        validation_split=0.2,     
        verbose=0
    )
    
    # metrics 
    #get probabilities from ANN
    y_pred_probs_ANN = model.predict(X_test2).flatten()  
    
    #use probs with different threshold to develop ROC curve
    fpr_ANN, tpr_ANN, thresholds_ANN = roc_curve(y_test2, y_pred_probs_ANN)
    #calculate AUC of ROC
    roc_auc_ANN = roc_auc_score(y_test2, y_pred_probs_ANN)
    
    #find best threshold and assign to model
    tpr_max_ANN=tpr_ANN.argmax()
    best_thresh_ANN=thresholds_ANN[tpr_max_ANN]
    
    #make binary predictions
    y_pred_ANN = (y_pred_probs_ANN >= best_thresh_ANN).astype(int)
    timea.append(time.time()-a_start)
    ac_ann.append(accuracy_score(y_test2,y_pred_ANN))
    ANN_AUCs.append(roc_auc_ANN)
    
####################################-COMPARE-##################
total=0
rfwin=0
annwin=0
equal=0
for r, a in zip(RF_AUCs,ANN_AUCs):
    if r > a:
        rfwin += 1
        print(f'RF is better {r}')
    elif r == a:
        equal += 1
        print("equal")
    else:
        annwin += 1
        print(f'ANN is better {a}')
    print(f'RF vs. ANN: {r} vs {a}')
    total += 1
print(f'Totals rfwins: {rfwin} \nequal {equal} \nannwins {annwin} \ntotal {total}')
rfwinP = rfwin/total
annwinP = annwin/total
equalP = equal/total
print("how often a models AUC is better")
print(f'rf winrate: {rfwinP}')
print(f'ann winrate: {annwinP}')
print(f'tie rate: {equalP}\n')

print(f'Averages and standard deviations')
print(f'average RF auc: {np.mean(RF_AUCs)} |std RF auc: {np.std(RF_AUCs)}')
print(f'average ANN auc: {np.mean(ANN_AUCs)} |std ANN auc: {np.std(ANN_AUCs)}')
rf_av = np.mean(ac_rf)
rf_std = np.std(ac_rf)
ann_av = np.mean(ac_ann)
ann_std = np.std(ac_ann)
print(f'Accuracy measurements:\naverage RF accuracy: {rf_av} |std RF accuracy{rf_std} \naverage ANN accuracy: {ann_av} |std ANN accuracy {ann_std}\n')
print(f'average rf runtime: {np.mean(timerf)} |std rf runtime: {np.std(timerf)}')
print(f'average ann runtime: {np.mean(timea)} |std ann runtime: {np.std(timea)}')
prog_total = time.time()-prog_start
print(f'total runtime: {prog_total}')

1
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
2
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
3
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
4
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
6
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
7
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
8
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
9
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
11
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 
12
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
13
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
14
[1m4/4[0m [32m━━━━━━━━━━━━━━