In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import warnings
from sklearn.model_selection import cross_val_score,cross_validate
# Blanket filter: no category or module is given, so every warning raised
# after this call is hidden, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score


### Import the dataset from sklearn

In [2]:
from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset object and print its description text.
data_cancer = load_breast_cancer()
print(data_cancer["DESCR"])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [3]:
# Display the label array of the loaded dataset.
data_cancer["target"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [4]:
# Feature matrix -> x, label vector -> y.
x, y = data_cancer["data"], data_cancer["target"]

In [5]:
# Hold out 20% of the samples as a test set.
# random_state pins the shuffle so the split, and every score computed from it
# in later cells, is identical on each Restart & Run All.
# NOTE: outputs saved before the seed was added came from an unseeded split;
# re-run the notebook to refresh them.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.20,random_state=42)

In [6]:
# k-nearest-neighbours classifier with constructor defaults, trained on the
# training split. The fit call stays last so the cell still displays the estimator.
model1 = KNeighborsClassifier()
model1.fit(X=x_train, y=y_train)

In [7]:
# Predicted labels for the held-out test features.
y_predic = model1.predict(X=x_test)

In [8]:
# Accuracy of the test-set predictions against the true test labels.
accuracy_score(y_true=y_test, y_pred=y_predic)

0.9385964912280702

In [9]:
# Cross-validate model1 on the training split using the default `cv` setting,
# print the per-fold scores, then display their mean.
score = cross_val_score(estimator=model1, X=x_train, y=y_train)
print(score)
np.mean(score)

[0.93406593 0.96703297 0.93406593 0.91208791 0.93406593]


0.9362637362637362

In [10]:
# Same cross-validation as before, but with three folds.
score = cross_val_score(
    estimator=model1,
    X=x_train,
    y=y_train,
    cv=3,
)
print(score)
np.mean(score)

[0.94736842 0.93421053 0.9205298 ]


0.9340362495643081

In [11]:
# Three-fold cross-validation that reports two metrics per fold.
# The returned dict also carries fit and score timings.
score_valid = cross_validate(
    estimator=model1,
    X=x_train,
    y=y_train,
    cv=3,
    scoring=['accuracy','average_precision'],
)
print(score_valid)

{'fit_time': array([0.00070977, 0.00071406, 0.00070858]), 'score_time': array([0.01645136, 0.01792312, 0.01620746]), 'test_accuracy': array([0.94736842, 0.93421053, 0.9205298 ]), 'test_average_precision': array([0.97768081, 0.93662235, 0.92136339])}


In [12]:
# Tabulate the cross_validate result: one row per fold, one column per key.
pd.DataFrame.from_dict(score_valid)

Unnamed: 0,fit_time,score_time,test_accuracy,test_average_precision
0,0.00071,0.016451,0.947368,0.977681
1,0.000714,0.017923,0.934211,0.936622
2,0.000709,0.016207,0.92053,0.921363


In [13]:
from sklearn.model_selection import GridSearchCV

### Try these values for `n_neighbors` using grid search

In [14]:
# Candidate neighbour counts: the odd values 1, 3, 5, 7, 9.
param_grid = {'n_neighbors': list(range(1, 10, 2))}

# Exhaustive search over the grid with five-fold cross-validation on the
# training split. The fit call stays last so the cell still displays the
# fitted search object.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X=x_train, y=y_train)

In [15]:
# Display the estimator that the fitted grid search selected as best.
grid_search.best_estimator_

#### Compare the best model's predictions with the test labels (test-set score)

In [16]:
# Score the fitted grid search on the held-out test split.
grid_search.score(X=x_test, y=y_test)

0.9385964912280702

In [17]:
# Full grid-search report as a table: one row per parameter candidate.
pd.DataFrame(data=grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000709,2.4e-05,0.00929,0.000887,1,{'n_neighbors': 1},0.901099,0.967033,0.912088,0.923077,0.846154,0.90989,0.038945,5
1,0.000692,2.3e-05,0.009235,0.00034,3,{'n_neighbors': 3},0.934066,0.945055,0.934066,0.912088,0.89011,0.923077,0.019658,4
2,0.00067,1.8e-05,0.00898,7.6e-05,5,{'n_neighbors': 5},0.934066,0.967033,0.934066,0.912088,0.934066,0.936264,0.017582,1
3,0.000704,2.2e-05,0.009024,3.5e-05,7,{'n_neighbors': 7},0.945055,0.956044,0.912088,0.923077,0.934066,0.934066,0.015541,2
4,0.000705,2.5e-05,0.010125,0.000295,9,{'n_neighbors': 9},0.945055,0.945055,0.89011,0.934066,0.923077,0.927473,0.020382,3
