In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')


# read data

In [2]:
df = pd.read_csv('data.csv')
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

(569, 33)

# clean data --> drop Unnamed

In [3]:
df=df.drop(['Unnamed: 32'],axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

# Converting string labels into numbers

In [4]:
df['diagnosis']=LabelEncoder().fit_transform(df['diagnosis'])
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    int32  
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,1,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,1,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,1,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,1,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,1,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,1,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,1,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


# Extracting Independent and dependent Variable

In [5]:
x=df.drop(['diagnosis'],axis=1)
y=df['diagnosis']
print(x)
print(y)

           id  radius_mean  texture_mean  perimeter_mean  area_mean  \
0      842302        17.99         10.38          122.80     1001.0   
1      842517        20.57         17.77          132.90     1326.0   
2    84300903        19.69         21.25          130.00     1203.0   
3    84348301        11.42         20.38           77.58      386.1   
4    84358402        20.29         14.34          135.10     1297.0   
..        ...          ...           ...             ...        ...   
564    926424        21.56         22.39          142.00     1479.0   
565    926682        20.13         28.25          131.20     1261.0   
566    926954        16.60         28.08          108.30      858.1   
567    927241        20.60         29.33          140.10     1265.0   
568     92751         7.76         24.54           47.92      181.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0            0.11840           0.27760         0.30010              

# feature Scaling

In [6]:
x=StandardScaler().fit_transform(x)
print(x)
print(y)

[[-0.23640517  1.09706398 -2.07333501 ...  2.29607613  2.75062224
   1.93701461]
 [-0.23640344  1.82982061 -0.35363241 ...  1.0870843  -0.24388967
   0.28118999]
 [ 0.43174109  1.57988811  0.45618695 ...  1.95500035  1.152255
   0.20139121]
 ...
 [-0.23572747  0.70228425  2.0455738  ...  0.41406869 -1.10454895
  -0.31840916]
 [-0.23572517  1.83834103  2.33645719 ...  2.28998549  1.91908301
   2.21963528]
 [-0.24240586 -1.80840125  1.22179204 ... -1.74506282 -0.04813821
  -0.75120669]]
0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int32


# Splitting the dataset into training and test set

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
print(x_train)

[[-0.23600968 -0.64678318 -0.42577149 ... -0.35335182  0.32395133
  -0.76893975]
 [-0.23596621 -0.82571213  0.13272462 ... -1.43718102  0.63294742
  -1.03770647]
 [-0.23572964  1.70485436  2.08513394 ...  0.73382724 -0.53185462
  -0.97397828]
 ...
 [ 6.97028059 -1.33239345 -0.22564372 ... -0.97581512 -0.72275273
  -0.14329518]
 [-0.2361426  -1.25173342 -0.24891439 ... -1.74506282 -1.60444316
  -1.01720262]
 [-0.23574078 -0.74334801  1.07984094 ... -0.27523937 -1.2760337
   0.1869831 ]]


# Fitting K-NN classifier to the training set  


In [28]:
# Convert x_train to a NumPy array
x_train = np.array(x_train)

classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)


# Predicting the train set result

In [29]:

y_pred = classifier.predict(x_train)

# Evaluate performance
print('confusion_matrix : \n ', confusion_matrix(y_train, y_pred))
print('accuracy_score : ', accuracy_score(y_train, y_pred))
print('precision : ', precision_score(y_train, y_pred, average='micro'))
print('recall : ', recall_score(y_train, y_pred, average='micro'))
print(classification_report(y_train, y_pred))

confusion_matrix : 
  [[263   4]
 [  8 151]]
accuracy_score :  0.971830985915493
precision :  0.971830985915493
recall :  0.971830985915493
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       267
           1       0.97      0.95      0.96       159

    accuracy                           0.97       426
   macro avg       0.97      0.97      0.97       426
weighted avg       0.97      0.97      0.97       426



# Predicting the test set result

In [30]:
x_test = np.array(x_test)
y_pred=classifier.predict(x_test)

print('confusion_matrix : \n ', confusion_matrix(y_test, y_pred))
print('accuracy_score : ',accuracy_score(y_test,y_pred))
print('precision : ',precision_score(y_test,y_pred,average='micro'))
print('recall : ',recall_score(y_test,y_pred,average='micro'))
print(classification_report(y_test,y_pred))

confusion_matrix : 
  [[89  1]
 [ 6 47]]
accuracy_score :  0.951048951048951
precision :  0.951048951048951
recall :  0.951048951048951
              precision    recall  f1-score   support

           0       0.94      0.99      0.96        90
           1       0.98      0.89      0.93        53

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



In [None]:
# Assuming df is your DataFrame
data_t = df.loc[:, ['texture_mean', 'radius_mean', 'diagnosis']]
y_t = data_t['diagnosis']
x_t = data_t.drop(['diagnosis'], axis=1)

model_5 = KNeighborsClassifier(n_neighbors=5)
model_5.fit(x_t, y_t)

plt.figure(figsize=(8, 6))
plt.scatter(x_t['texture_mean'], x_t['radius_mean'], c=np.where(y_t == 'M', 'r', 'g'), label='Data points')
plt.xlabel('texture_mean')
plt.ylabel('radius_mean')

# Creating a meshgrid to visualize the decision boundary
x_min, x_max = x_t['texture_mean'].min() - 1, x_t['texture_mean'].max() + 1
y_min, y_max = x_t['radius_mean'].min() - 1, x_t['radius_mean'].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))
Z = model_5.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Plot the decision boundary using meshgrid coordinates
plt.contourf(xx, yy, Z, alpha=0.5, cmap=plt.cm.RdYlGn)
plt.title('KNN Decision Boundary (k=5)')
plt.legend(['Malignant', 'Benign'])
plt.show()
