In [2]:
# Import packages and functions
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Import dataset
cancer = pd.read_csv('../data/raw/breast.csv')

# Changing the names of the columns to improve their understanding
new_columns = ['ID', 'Diagnosis']
for i in range (30):       
       if cancer.columns[i+2][-1:] == '1': 
               column_type = 'mean'
       elif cancer.columns[i+2][-1:] == '2': 
               column_type = 'se'
       else: 
               column_type = 'worst'
       temp = cancer.columns[i+2][:-1] + '_' + column_type
       new_columns.append(temp)

cancer.columns = new_columns
cancer.describe()

Unnamed: 0,ID,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,30371830.0,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,...,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,125020600.0,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,...,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,8670.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,...,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,869218.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,...,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,906024.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,...,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,8813129.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,...,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,911320500.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,...,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [4]:
# Standardize features using the z-score method
scaler = StandardScaler()

features = cancer.columns[2:]

standardized_cancer = scaler.fit_transform(cancer[features])

# Convert the standardized data back to a DataFrame
standardized_cancer = pd.DataFrame(standardized_cancer, columns=features)

standardized_cancer.describe()
y = (cancer['Diagnosis'] == 'M').astype(int)

In [5]:
# Split the dataset into test and training sets at 25-75 split.

X_train, X_test, Y_train, Y_test = train_test_split(standardized_cancer, 
                                                    y, stratify=y,
                                                    test_size=0.25, random_state=42)

# Verify shapes
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(426, 30) (143, 30)
(426,) (143,)


In [6]:
# As part of our analysis we will need a metric to decide which feature to select in case other metrics (e.g. accuracy produces equal results)
# Therefore, we suggest to use Random Forest model to select features based on their statistical significance relative to the target variable (great for continuous variables).
# Higher Importance values indicate greater importance, but they don't imply a direct effect size.

model = RandomForestClassifier()
model.fit(standardized_cancer, y)
feature_importances = model.feature_importances_
feature_importances

rf_importance_results = pd.DataFrame({
    'feature': standardized_cancer.columns,
    'importance': feature_importances
}).sort_values(by='importance', ascending=False)

rf_importance_results


Unnamed: 0,feature,importance
23,area_worst,0.143995
22,perimeter_worst,0.120904
27,concave_points_worst,0.116678
20,radius_worst,0.11013
7,concave_points_mean,0.082151
6,concavity_mean,0.048866
2,perimeter_mean,0.045577
0,radius_mean,0.040593
3,area_mean,0.039622
26,concavity_worst,0.038263


In [7]:
# 1) We suggest to follow the Forward Feature Selection method to make the decision on the most important features.
# 2) The general idea is to iteratively add features based on the KNN method performance (we'll be optimizing accuracy).
# 3) At the first step we'll run independently KNN for every feature to decide which 
# one of them produces KNN with the highest accuracy score (such feature will be selected as feature1).
# 4) At the second step we'll run independently KNN for feature1 and every other feature to decide which 
# combination of feature1 and another feature produces KNN with the highest accuracy score (this another feature is feature2).
# 5) We'll be doing this up to 15 features. We think it should be enough for our purpose 
# to limit the number of features at least two times as compared to the initial 30 features.
knn = KNeighborsClassifier()
parameter_grid = {
    "n_neighbors": range(1, 30),
}

cancer_tune_grid = GridSearchCV(
    estimator=knn,
    param_grid=parameter_grid,
    cv=10
)

# structure of the final DataFrame with the results. The results presented in every line will be for all features above
KNN_forward_results = pd.DataFrame(columns=
                                           ['feature',
                                            'opt_n_neighbors', 
                                            'accuracy', 
                                            'precision', 
                                            'recall', 
                                            'f1']
                                           )

KNN_interim_results = pd.DataFrame(columns=
                                           ['feature', 
                                            'opt_n_neighbors', 
                                            'accuracy', 
                                            'precision', 
                                            'recall', 
                                            'f1']
                                           )

stage = 1
feature_count = 0
while feature_count < 15: 
    
    if stage == 1:
        working_features = features	# list of features to used for identifing the feature that helps to produce the highest accuracy score
    else:
        working_features = pd.Index(list(set(working_features) - set(selected_features['feature'])))
        X_train_base = X_train[KNN_forward_results['feature'].tolist()] # "base" dataset to train the models (it will be iteratively expanded, see below)
        X_test_base = X_test[KNN_forward_results['feature'].tolist()] # "base" dataset to test the models (it will be iteratively expanded, see below)
        KNN_interim_results = KNN_interim_results.drop(KNN_interim_results.index)
        selected_features = selected_features.drop(selected_features.index)    

    for i in working_features:
        if stage == 1:
            X_train_exp = X_train[[i]]
            X_test_exp = X_test[[i]]
        else:
            X_train_exp = X_train_base.join(X_train[i], how='inner')
            X_test_exp = X_test_base.join(X_test[i], how='inner')

        cancer_tune_grid.fit(
            X_train_exp,
            Y_train
        )

        # Refit the KNN model with the best k value
        knn_best = KNeighborsClassifier(n_neighbors=cancer_tune_grid.best_params_['n_neighbors'])
        knn_best.fit(X_train_exp, Y_train)

        # Predict on the test set
        
        X_pred = X_test_exp
        Y_pred = knn_best.predict(X_pred)

        new_row = pd.DataFrame({
            'feature': [i],
            'opt_n_neighbors': [cancer_tune_grid.best_params_['n_neighbors']],
            'accuracy': [accuracy_score(Y_test, Y_pred)],
            'precision': [precision_score(Y_test,Y_pred,pos_label=1)],  # "M" is 1 in one-hot encoding
            'recall': [recall_score(Y_test,Y_pred,pos_label=1)],
            'f1': [f1_score(Y_test,Y_pred, average='weighted')] 
        })
        if KNN_interim_results.empty: 
            KNN_interim_results = new_row
        else:
            KNN_interim_results = pd.concat([KNN_interim_results, new_row], ignore_index=True)
    
    KNN_interim_results['accuracy_rank'] = KNN_interim_results['accuracy'].rank(ascending=False)

    KNN_interim_results = pd.merge(KNN_interim_results, rf_importance_results[['feature','importance']], on='feature', how='inner')
    
    interim_selection = KNN_interim_results[KNN_interim_results['accuracy_rank'] == min(KNN_interim_results['accuracy_rank'])]
    selected_features = interim_selection[interim_selection['importance'] == max(interim_selection['importance'])]    
    
    for j in range(len(selected_features)): # in case there could be 2 or more lines with the same accuracy scores and importance scores
        new_row = pd.DataFrame({
            'feature': [selected_features.iloc[j]['feature']],
            'opt_n_neighbors': [selected_features.iloc[j]['opt_n_neighbors']],
            'accuracy': [selected_features.iloc[j]['accuracy']],
            'precision': [selected_features.iloc[j]['precision']],
            'recall': [selected_features.iloc[j]['recall']],
            'f1': [selected_features.iloc[j]['f1']]
        })
        if KNN_forward_results.empty: 
            KNN_forward_results = new_row
        else:
            KNN_forward_results = pd.concat([KNN_forward_results, new_row], ignore_index=True)
    stage += 1
    feature_count = KNN_forward_results.shape[0]

KNN_forward_results = pd.merge(KNN_forward_results, rf_importance_results[['feature','importance']], on='feature', how='inner')
KNN_forward_results

Unnamed: 0,feature,opt_n_neighbors,accuracy,precision,recall,f1,importance
0,concave_points_worst,23,0.93007,0.957447,0.849057,0.9291,0.116678
1,area_se,17,0.958042,0.979592,0.90566,0.957675,0.037258
2,radius_se,11,0.972028,0.980392,0.943396,0.971913,0.024513
3,texture_mean,9,0.972028,1.0,0.924528,0.971784,0.014467
4,perimeter_se,7,0.979021,1.0,0.943396,0.978887,0.016282
5,radius_mean,5,0.972028,1.0,0.924528,0.971784,0.040593
6,texture_worst,28,0.972028,1.0,0.924528,0.971784,0.018949
7,fractal_dimension_worst,3,0.986014,1.0,0.962264,0.985956,0.007687
8,area_worst,5,0.986014,1.0,0.962264,0.985956,0.143995
9,fractal_dimension_mean,7,0.986014,1.0,0.962264,0.985956,0.002286


In [9]:
# Based on the results in the table above, adding any features 
# beyond the first eight does not improve any of the provided metrics, including the accuracy score

knn = KNeighborsClassifier()
parameter_grid = {
    "n_neighbors": range(1, 100), #attention!
}

cancer_tune_grid = GridSearchCV(
    estimator=knn,
    param_grid=parameter_grid,
    cv=10 #attention!
)

parementer_list = KNN_forward_results['feature'][:8].tolist()

cancer_tune_grid.fit(
    X_train[parementer_list],
    Y_train
)

knn_best = KNeighborsClassifier(n_neighbors=cancer_tune_grid.best_params_['n_neighbors'])
knn_best.fit(X_train[parementer_list], Y_train)

# Predict on the test set
X_pred = X_test[parementer_list]
Y_pred = knn_best.predict(X_pred)

print(parementer_list)
print(f"Number of N neighbors in KNN is {cancer_tune_grid.best_params_['n_neighbors']}")
print(f"Accuracy score is {accuracy_score(Y_test, Y_pred)}")
print(f"Precision score is {precision_score(Y_test,Y_pred,pos_label=1)}") # 'M' is 1 in one-hot encoding
print(f"Recall score is {recall_score(Y_test,Y_pred,pos_label=1)}")
print(f"F1 score is {f1_score(Y_test,Y_pred, average='weighted')}") 


['concave_points_worst', 'area_se', 'radius_se', 'texture_mean', 'perimeter_se', 'radius_mean', 'texture_worst', 'fractal_dimension_worst']
Number of N neighbors in KNN is 3
Accuracy score is 0.986013986013986
Precision score is 1.0
Recall score is 0.9622641509433962
F1 score is 0.9859563513409668
