# Breast Cancer Prediction Model using KNN



In [4]:
import pandas as pd          # For loading, cleaning, and handling datasets (DataFrame operations)
import numpy as np           # For numerical computations and array manipulation
import matplotlib.pyplot as plt  # For plotting graphs and visualizing results
from sklearn.preprocessing import LabelEncoder  # For converting categorical labels into numeric form


In [5]:
data =pd.read_csv("/content/Dataset - Dataset.csv")

In [7]:
le = LabelEncoder()
# Initialize LabelEncoder to convert categorical labels into numeric values

data['diagnosis'] = le.fit_transform(data['diagnosis'])
# Encode the 'diagnosis' column (M/B) into numerical form (0/1) for model processing


In [None]:
data.info()
data.drop(['id'],axis =1, inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    int64  
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [9]:
from sklearn.preprocessing import MinMaxScaler  # Normalize feature values to a 0–1 range

target_col = 'diagnosis'  # Target column for prediction

X = data.drop(target_col, axis=1)  # Feature set (all columns except target)
Y = data[target_col]  # Target values

cols = X.columns  # Save feature column names

scaler = MinMaxScaler()  # Initialize MinMax scaler
X_scaled = scaler.fit_transform(X)  # Fit and scale feature values

X_df = pd.DataFrame(X_scaled, columns=cols, index=X.index)  # Convert scaled data back to DataFrame
X_df.head()  # Preview first 5 rows


Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,0.000915,0.521037,0.022658,0.545989,0.363733,0.593753,0.792037,0.70314,0.731113,0.686364,...,0.620776,0.141525,0.66831,0.450698,0.601136,0.619292,0.56861,0.912027,0.598462,0.418864
1,0.000915,0.643144,0.272574,0.615783,0.501591,0.28988,0.181768,0.203608,0.348757,0.379798,...,0.606901,0.303571,0.539818,0.435214,0.347553,0.154563,0.192971,0.639175,0.23359,0.222878
2,0.092495,0.601496,0.39026,0.595743,0.449417,0.514309,0.431017,0.462512,0.635686,0.509596,...,0.556386,0.360075,0.508442,0.374508,0.48359,0.385375,0.359744,0.835052,0.403706,0.213433
3,0.092547,0.21009,0.360839,0.233501,0.102906,0.811321,0.811361,0.565604,0.522863,0.776263,...,0.24831,0.385928,0.241347,0.094008,0.915472,0.814012,0.548642,0.88488,1.0,0.773711
4,0.092559,0.629893,0.156578,0.630986,0.48929,0.430351,0.347893,0.463918,0.51839,0.378283,...,0.519744,0.123934,0.506948,0.341575,0.437364,0.172415,0.319489,0.558419,0.1575,0.142595


In [11]:
from sklearn.model_selection import train_test_split  # Split dataset into training and testing sets

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X_df, Y, test_size=0.2, random_state=42)
# 80/20 split with fixed random seed


In [None]:
Xtrain.shape
Xtest.shape

(114, 30)

In [None]:
Ytrain.shape

(455,)

In [12]:
from sklearn.cluster import KMeans  # Import KMeans for clustering
import pandas as pd  # Data handling with DataFrame support

kmeans = KMeans(n_clusters=2, random_state=42)  # Create KMeans model with 2 clusters
kmeans.fit(Xtrain)  # Fit the model using training features

Xtrain['cl'] = kmeans.labels_  # Add cluster labels to training data
Xtest['cl'] = kmeans.predict(Xtest)  # Predict and add cluster labels to test data

Xtrain  # Display training set with new cluster column


Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,cl
68,0.000934,0.096928,0.257694,0.103656,0.045387,0.487226,0.373965,0.733365,0.217445,0.530808,...,0.283316,0.075153,0.034285,0.508684,0.397018,1.000000,0.601375,0.524936,0.409681,1
181,0.000949,0.667755,0.570172,0.683505,0.495228,0.554934,0.809214,0.582709,0.743539,0.674242,...,0.571962,0.627970,0.467902,0.514627,0.709327,0.541534,0.997595,0.499310,0.481175,1
63,0.000933,0.103744,0.140345,0.106489,0.049799,0.221901,0.208975,0.140300,0.108350,0.646970,...,0.192164,0.075601,0.030697,0.179555,0.136324,0.111581,0.174811,0.338459,0.195855,0
248,0.097067,0.173648,0.524518,0.167369,0.086320,0.396678,0.162444,0.055740,0.080268,0.422727,...,0.617537,0.137308,0.066482,0.519910,0.109158,0.089856,0.210859,0.363493,0.173357,0
60,0.000933,0.150930,0.174839,0.143459,0.071432,0.548614,0.187811,0.025398,0.064115,0.850000,...,0.144723,0.096867,0.045075,0.371987,0.069244,0.017316,0.088625,0.392667,0.165027,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,0.000934,0.090255,0.166723,0.103656,0.042630,0.408053,0.410159,0.201640,0.142744,0.425253,...,0.097281,0.060511,0.024381,0.327082,0.209865,0.114537,0.164467,0.135817,0.349993,0
106,0.000938,0.220503,0.291512,0.216847,0.114104,0.555836,0.252500,0.165651,0.173211,0.374242,...,0.459488,0.174810,0.082703,0.644720,0.231598,0.229473,0.418557,0.244628,0.235668,0
270,0.009768,0.345923,0.240446,0.321401,0.207466,0.105263,0.022606,0.016987,0.031064,0.226263,...,0.230011,0.219284,0.122739,0.095754,0.022383,0.030879,0.114536,0.176030,0.040404,0
435,0.000987,0.331251,0.335137,0.327068,0.193425,0.481809,0.288080,0.263824,0.321223,0.307576,...,0.500533,0.316201,0.168133,0.595192,0.319692,0.325000,0.627835,0.318155,0.330972,1


In [13]:
from sklearn.neighbors import KNeighborsClassifier  # Import KNN classifier

knn = KNeighborsClassifier(n_neighbors=5)  # Create KNN model with k = 5

knn.fit(Xtrain, Ytrain)  # Train KNN using training data

Ypred = knn.predict(Xtest)  # Predict labels for test data


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [16]:
accuracy = accuracy_score(Ytest, Ypred)  # Model accuracy

precision = precision_score(Ytest, Ypred)  # Precision: correctness of positive predictions

recall = recall_score(Ytest, Ypred)  # Recall: ability to detect actual positives

f1 = f1_score(Ytest, Ypred)  # Harmonic mean of precision and recall

print("Accuracy :", accuracy)  # Display accuracy
print("Precision:", precision)  # Display precision
print("Recall   :", recall)  # Display recall
print("F1-score :", f1)  # Display F1-score


Accuracy : 0.9736842105263158
Precision: 0.9761904761904762
Recall   : 0.9534883720930233
F1-score : 0.9647058823529412
