In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_regression
import datetime

In [2]:
# the supplied CSV data file is the raw_data directory
bc_data = pd.read_csv('/Users/angelique/Documents/GitHub/Capstone-2---Breast-Cancer-Tumor-Predictions/data.csv')

bc_data.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [5]:
df = bc_data.drop(columns=['id', 'Unnamed: 32'])

Based on the previous notebook (04 - Preprocessing and Training), I am dropping the same columns we previously dropped due to their high correlation with other features. This is done to prevent duplication of information and avoid multicollinearity, which can distort model performance and the interpretation of results.

In [8]:
# Drop perimeter_mean and area_mean
df.drop(['perimeter_mean', 'area_mean'], axis=1, inplace=True)

# Drop perimeter_worst, area_worst
df.drop(['perimeter_worst', 'area_worst'], axis=1, inplace=True)

# Drop perimeter_se, area_se
df.drop(['perimeter_se', 'area_se'], axis=1, inplace=True)

# Based on the high correlation I'm dropping some of the mean vs worst variables
df.drop(['radius_mean', 'texture_mean', 'concave points_worst', 'concavity_mean', 'texture_worst', 'fractal_dimension_worst'], axis=1, inplace=True)
df.shape

(569, 19)

In [10]:
df.columns

Index(['diagnosis', 'smoothness_mean', 'compactness_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'smoothness_se', 'compactness_se',
       'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'symmetry_worst'],
      dtype='object')

In [12]:
#Encode the diagnosis data numerically

df['diagnosis'] = df['diagnosis'].replace({'B':0, 'M':1})

To address the minority class of Malignant vs Benign, I'm using the SMOTE oversampling technique to even out the data

In [15]:
#Assign variables to x and y based on all variables vs diagnosis
x = df.drop(columns=['diagnosis'])
y = df['diagnosis']

In [17]:
from imblearn.over_sampling import SMOTE
s = SMOTE()
x,y = s.fit_resample(x,y)

In [19]:
from collections import Counter
print(Counter(y))

Counter({1: 357, 0: 357})


In [21]:
#Scale the data using standard Scaler
ss = StandardScaler()

X_scaled = ss.fit_transform(x)

In [23]:
# Use train_test_split to split the data for testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [25]:
#Check shape of the X_train and X_test
X_train.shape, X_test.shape

((499, 18), (215, 18))

In [27]:
#Check shape of y_train and y_test
y_train.shape, y_test.shape

((499,), (215,))

In [29]:
#Evaluate the model using accuracy, precision, recall and ROC scores under evaluate_model fuction
def evaluate_model(y_test, y_pred):
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"ROC AUC: {roc_auc_score(y_test, y_pred)}")

In [31]:
final_results = []

Logistic Regression - Testing

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

lr = LogisticRegression()

# Define the hyperparameter grid for tuning
param_grid = {
    'C': [0.1, 1, 10],            # Regularization strength
    'solver': ['liblinear', 'saga']  # Solver options
}

# Define the custom scoring function (e.g., F1 score)
custom_scorer = make_scorer(f1_score)

# GridSearchCV with custom scoring
grid_search = GridSearchCV(estimator=lr, param_grid=param_grid, scoring=custom_scorer, cv=5)
grid_search.fit(X_train, y_train)

# Output the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best F1 Score:", grid_search.best_score_)

Best Parameters: {'C': 10, 'solver': 'saga'}
Best F1 Score: 0.9704100513992004




In [36]:
# Best parameters found during the grid search
best_params = {'C': 10, 'solver': 'saga'}

# Create the Logistic Regression model with the best parameters
lr = LogisticRegression(
    C=best_params['C'],
    solver=best_params['solver'],
    max_iter=10000
)

In [38]:
#Train the model
lr.fit(X_train, y_train)

In [40]:
from sklearn.metrics import classification_report, confusion_matrix,recall_score , accuracy_score, precision_score, roc_auc_score

In [42]:
#Make a prediction
y_pred_lr = lr.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_lr)
final_results.append(accuracy)

In [44]:
evaluate_model(y_test, y_pred_lr)

Accuracy: 0.958139534883721
Precision: 0.9428571428571428
Recall: 0.9705882352941176
ROC AUC: 0.9587454450806873


Cross Validation of Logistic Regression

In [47]:
from sklearn.model_selection import StratifiedKFold
skf=StratifiedKFold(n_splits=5,shuffle=True,random_state=0)
skf.split(X_scaled,y)
print(skf)

StratifiedKFold(n_splits=5, random_state=0, shuffle=True)


In [49]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
y_pred1=cross_val_predict(lr,X_test,y_test,cv=skf)
score1=cross_val_score(lr,X_train,y_train,cv=skf)
print(y_pred1)
print(score1)
AA1=print(score1.mean()*100)
AA1

[0 0 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 1
 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 0
 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0
 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0]
[0.96       0.95       0.95       0.95       0.94949495]
95.18989898989899


Random Forest Classifier

In [54]:
from sklearn.ensemble import RandomForestClassifier


# Hyperparameter grid
param_dist = {
    'n_estimators': np.arange(100, 1001, 100),  # 100, 200, ..., 1000
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize RandomForestClassifier
rf = RandomForestClassifier()

# RandomizedSearchCV initialization
random_search = RandomizedSearchCV(estimator=rf, 
                                   param_distributions=param_dist, 
                                   n_iter=100, cv=5, 
                                   n_jobs=-1, verbose=2)

# Fit the model
random_search.fit(X_train, y_train)

# Output the best parameters and the best score
print("Best Parameters:", random_search.best_params_)  # note the underscore
print("Best F1 Score:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30, 'bootstrap': False}
Best F1 Score: 0.9559393939393939


In [56]:
# Best parameters found from GridSearchCV
best_params = {
    'n_estimators': 300,
    'min_samples_split': 5,
    'min_samples_leaf': 4,
    'max_features': 'log2',
    'max_depth': 30,
    'bootstrap': False
}

# Initialize the Random Forest classifier with the best parameters
rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    bootstrap=best_params['bootstrap']
)

In [58]:
# Fit the model
rf.fit(X_train,y_train)

In [60]:
y_pred_rf = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_rf)
final_results.append(accuracy)

In [62]:
evaluate_model(y_test, y_pred_rf)

Accuracy: 0.9441860465116279
Precision: 0.95
Recall: 0.9313725490196079
ROC AUC: 0.9435623807044943


Cross Validation of Random Forest Classifier

In [65]:
y_pred2=cross_val_predict(rf,X_test,y_test,cv=skf)
score2=cross_val_score(rf,X_train,y_train,cv=skf)
print(y_pred2)
print(score2)
AA2=print(score2.mean()*100)
AA2

[0 1 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 1 1 1
 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 0 1 0
 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0]
[0.95       0.95       0.99       0.92       0.96969697]
95.5939393939394


KNN 

In [68]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [70]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define the parameter grid for RandomizedSearchCV
param_dist = {
    'n_neighbors': randint(3, 20),  # Randomly select n_neighbors between 3 and 20
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=knn, param_distributions=param_dist, n_iter=100, cv=5, scoring='accuracy')

# Fit the model
random_search.fit(X_scaled, y)

# Print best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: {:.2f}".format(random_search.best_score_))

Best parameters found:  {'algorithm': 'kd_tree', 'n_neighbors': 9, 'p': 2, 'weights': 'distance'}
Best cross-validation score: 0.97


In [72]:
# Best parameters found from GridSearchCV
best_params = {
    'algorithm': 'kd_tree',
    'n_neighbors': 9,
    'p': 2,
    'weights': 'distance'
}

# Initialize the KNN classifier with the best parameters
knn = KNeighborsClassifier(
    algorithm=best_params['algorithm'],
    n_neighbors=best_params['n_neighbors'],
    p=best_params['p'],
    weights=best_params['weights']
)

In [74]:
#Fit the Model
knn.fit(X_train,y_train)

In [76]:
y_pred_knn = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_knn)
final_results.append(accuracy)

In [78]:
evaluate_model(y_test, y_pred_knn)

Accuracy: 0.9674418604651163
Precision: 0.9523809523809523
Recall: 0.9803921568627451
ROC AUC: 0.9680721846260629


KNN Cross Validation

In [81]:
y_pred3=cross_val_predict(knn,X_test,y_test,cv=skf)
score3=cross_val_score(knn,X_train,y_train,cv=skf)
print(y_pred3)
print(score3)
AA3=print(score3.mean()*100)
AA3

[0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1 1
 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0
 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0]
[0.97       0.96       0.98       0.91       0.94949495]
95.389898989899


Support Vector Classifier (SVC)

In [84]:
from sklearn.svm import SVC
svc= SVC()

# Define the parameter distribution
param_dist = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [2, 3, 4],
    'coef0': [0, 0.5, 1]
}

# Use RandomizedSearchCV instead of GridSearchCV (randomly sample from the grid)
random_search = RandomizedSearchCV(estimator=svc, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)

# Fit the random search model
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

Best parameters found:  {'kernel': 'linear', 'gamma': 'auto', 'degree': 2, 'coef0': 1, 'C': 0.1}
Best cross-validation score:  0.9699595959595959


In [86]:
# Best parameters found from RandomizedSearchCV
best_params = {
    'kernel': 'linear',
    'gamma': 'auto',
    'degree': 2,  
    'coef0': 1,
    'C': 0.1
}

# Initialize the SVC with the best parameters
svc = SVC(
    kernel=best_params['kernel'],
    gamma=best_params['gamma'],
    degree=best_params['degree'],
    coef0=best_params['coef0'],
    C=best_params['C']
)

In [88]:
svc.fit(X_train, y_train)

In [90]:
y_pred_svc = svc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_svc)
final_results.append(accuracy)

In [92]:
evaluate_model(y_test, y_pred_svc)

Accuracy: 0.9534883720930233
Precision: 0.9423076923076923
Recall: 0.9607843137254902
ROC AUC: 0.9538434842963736


Cross Validate SVC results

In [95]:
y_pred4 = cross_val_predict(svc,X_test,y_test,cv=skf)
score4 = cross_val_score(svc,X_train,y_train,cv=skf)
print(y_pred4)
print(score4)
AA4=print(score4.mean()*100)
AA4

[0 0 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 1 0 1 0 1 1 0 1 1 1
 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 1 0 1 0
 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 1 0]
[0.97       0.96       0.95       0.96       0.96969697]
96.19393939393939


In [97]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()


param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
    'random_state': [42]  # for reproducibility
}

# Use GridSearchCV 
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy', verbose=42)

# Fit the GridSearchCV model
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

Fitting 5 folds for each of 270 candidates, totalling 1350 fits
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   1.2s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=10, n_estimators=900; total time=   1.0s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estim

In [99]:
# Best parameters found from GridSearchCV
best_params = {
    'criterion': 'gini',
    'max_depth': 5,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,  
    'min_samples_split': 2,
    'random_state': 42
}

# Initialize the SVC with the best parameters
dt = DecisionTreeClassifier(
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    random_state=best_params['random_state']
)

In [101]:
#Train the model
dt.fit(X_train,y_train)

In [103]:
y_pred_dt = dt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dt)
final_results.append(accuracy)

In [105]:
evaluate_model(y_test, y_pred_dt)

Accuracy: 0.8790697674418605
Precision: 0.88
Recall: 0.8627450980392157
ROC AUC: 0.8782752038868645


Cross Validation of Decision Tree

In [108]:
y_pred5=cross_val_predict(dt,X_test,y_test,cv=skf)
score5=cross_val_score(dt,X_train,y_train,cv=skf)
print(y_pred5)
print(score5)
AA5=print(score5.mean()*100)
AA5

[0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 1 1 1
 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 1
 0 0 0 1 1 1 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 0 0 1 0 0 1 0 0 0
 0 1 1 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 0 1 0
 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 0 0 0 0 1 1 0 1 1 1 1 1 0 0 0 0 0
 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0]
[0.89       0.91       0.93       0.83       0.88888889]
88.97777777777777


In [110]:
cv_avg=[
    score1.mean()*100,
    score2.mean()*100,
    score3.mean()*100,
    score4.mean()*100,
    score5.mean()*100
]
precision_list=[
    precision_score(y_test, y_pred_lr),
    precision_score(y_test, y_pred_rf),
    precision_score(y_test, y_pred_knn),
    precision_score(y_test, y_pred_svc),
    precision_score(y_test, y_pred_dt)
]
recall_list=[
    recall_score(y_test, y_pred_lr),
    recall_score(y_test, y_pred_rf),
    recall_score(y_test, y_pred_knn),
    recall_score(y_test, y_pred_svc),
    recall_score(y_test, y_pred_dt)
]
print(cv_avg)
print(precision_list)
print(recall_list)
print(final_results)

[95.18989898989899, 95.5939393939394, 95.389898989899, 96.19393939393939, 88.97777777777777]
[0.9428571428571428, 0.95, 0.9523809523809523, 0.9423076923076923, 0.88]
[0.9705882352941176, 0.9313725490196079, 0.9803921568627451, 0.9607843137254902, 0.8627450980392157]
[0.958139534883721, 0.9441860465116279, 0.9674418604651163, 0.9534883720930233, 0.8790697674418605]


In [112]:
final_result = pd.DataFrame(zip(final_results, recall_list, precision_list, cv_avg), columns=['Accuracy', 'Recall', 'Precision', 'Cross Val Avg'], index=['Logistic Regression','Random Forest','KNN','SVM' ,'Decision Tree'])


In [114]:
final_result

Unnamed: 0,Accuracy,Recall,Precision,Cross Val Avg
Logistic Regression,0.95814,0.970588,0.942857,95.189899
Random Forest,0.944186,0.931373,0.95,95.593939
KNN,0.967442,0.980392,0.952381,95.389899
SVM,0.953488,0.960784,0.942308,96.193939
Decision Tree,0.87907,0.862745,0.88,88.977778


Considering how important Recall is in this model, since missing a malignant cancer tumor is much more important than the other metrics, I would use the KNN model since it has the highest recall score. Due to the nature of the consequences for a false negative, KNN seems to have the highest rate of accurately depicting True Positives while also having the highest overall accuracy score.