# Finding an Observation’s Nearest Neighbors

In [144]:
from sklearn.neighbors import NearestNeighbors 
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import scipy as sp

In [150]:
X = load_iris().data
scaledX = StandardScaler().fit_transform(X) # the data needs to be scaled before using NearestNeighbors

In [5]:
nn = NearestNeighbors(n_neighbors=2) # metric = 'Manhattan', 'Euclidean', 'Minkowski'

In [10]:
ftnn = nn.fit(scaledX)

In [15]:
ob = [[1,1,1,1]]

In [17]:
distances, indix = ftnn.kneighbors(ob)

In [18]:
X[indix] 

array([[[6.7, 3.3, 5.7, 2.1],
        [6.5, 3.2, 5.1, 2. ]]])

In [19]:
distances

array([[0.49140089, 0.74294782]])

In [120]:
nn2 = NearestNeighbors(n_neighbors=3, metric='euclidean')

In [121]:
ftnn2 = nn2.fit(scaledX)

In [131]:
distance1, index1 = ftnn2.kneighbors(X[2].reshape(1,4))

In [132]:
index1

array([[131, 117, 109]])

In [133]:
graph = ftnn2.kneighbors_graph(X).toarray() # sparse converted to array, use toarray()

In [134]:
X.shape

(150, 4)

In [135]:
graph.shape

(150, 150)

In [136]:
for i, x in enumerate(graph):
    x[i] = 0

In [141]:
graph[2]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [149]:
print(sp.sparse.csr_matrix(graph[2]))

  (0, 109)	1.0
  (0, 117)	1.0
  (0, 131)	1.0


In [292]:
ftnn2.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 3,
 'p': 2,
 'radius': 1.0}

In [293]:
ftnn2.effective_metric_

'euclidean'

In [294]:
ftnn2.n_neighbors

3

# Creating a K-Nearest Neighbor Classifier

In [151]:
features, target = load_iris().data, load_iris().target

In [155]:
from sklearn.neighbors import KNeighborsClassifier

In [220]:
clf = KNeighborsClassifier(n_neighbors=5, n_jobs=-1, metric='manhattan', weights='distance')

In [221]:
scaled_features = StandardScaler().fit_transform(features)

In [222]:
ftclf = clf.fit(scaled_features, target)

In [223]:
ftclf.predict(scaled_features[[[134], [6]], :].reshape(2,4))

array([2, 0])

In [224]:
scaled_features.shape

(150, 4)

In [225]:
ftclf.predict_proba(scaled_features[[[134], [6]], :].reshape(2,4))

array([[0., 0., 1.],
       [1., 0., 0.]])

# Identifying the Best Neighborhood Size

In [228]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [231]:
KNN = KNeighborsClassifier(n_neighbors=10, metric='manhattan', n_jobs=-1, weights='distance')

In [238]:
pipe = Pipeline([('std', StandardScaler()),('clf',KNN)])

In [241]:
grid = GridSearchCV(pipe, {'clf__n_neighbors':[1,2,3,4,5,6,7,8,9,10]}, cv=5, verbose=0, scoring='accuracy')

In [243]:
ftgrid = grid.fit(load_iris().data, load_iris().target)

In [245]:
ftgrid.best_estimator_.get_params()['clf__n_neighbors']

3

In [296]:
ftgrid.best_score_

0.9533333333333334

In [297]:
ftgrid.best_index_

2

In [300]:
ftgrid.cv_results_

{'mean_fit_time': array([0.00167327, 0.00183773, 0.00125666, 0.0011076 , 0.00099297,
        0.00101786, 0.00094814, 0.00108266, 0.00106225, 0.00097542]),
 'std_fit_time': array([1.85029002e-04, 4.15090649e-04, 1.83436745e-04, 1.26293416e-04,
        3.94149684e-05, 1.69840226e-04, 1.99654172e-05, 1.13117996e-04,
        2.02684041e-04, 6.53930313e-05]),
 'mean_score_time': array([0.01038656, 0.00924029, 0.00703197, 0.00596385, 0.00586762,
        0.00609579, 0.00590615, 0.00595994, 0.00577035, 0.00614848]),
 'std_score_time': array([0.00203616, 0.00146477, 0.00097021, 0.00036978, 0.00023899,
        0.00067541, 0.000444  , 0.00035159, 0.00020756, 0.00049247]),
 'param_clf__n_neighbors': masked_array(data=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__n_neighbors': 1},
  {'clf__n_neighbors': 2},
  {'clf__n_neighbors': 3}

# Creating a Radius-Based Nearest Neighbor Classifier

In [249]:
from sklearn.neighbors import RadiusNeighborsClassifier

In [255]:
Rclf = RadiusNeighborsClassifier(radius=0.5, n_jobs=-1, outlier_label=1)

In [256]:
fitRclf = Rclf.fit(scaled_features, target)

In [261]:
fitRclf.predict([[1,1,1,1]])

array([2])

In [262]:
fitRclf.predict_proba([[1,1,1,1]])

array([[0., 0., 1.]])

In [263]:
fitRclf.classes_

array([0, 1, 2])

In [264]:
fitRclf.weights

'uniform'

In [265]:
fitRclf.effective_metric_

'euclidean'

In [268]:
print(fitRclf.metric_params)

None


In [276]:
fitRclf.radius

0.5

In [290]:
distance, index = fitRclf.radius_neighbors(scaled_features[0].reshape(1,4))
index

array([array([49,  7, 37, 11, 39, 40, 17, 26,  4, 27, 28,  0, 20, 36])],
      dtype=object)

In [287]:
print(fitRclf.radius_neighbors_graph(scaled_features[0].reshape(1,4)))

  (0, 49)	1.0
  (0, 7)	1.0
  (0, 37)	1.0
  (0, 11)	1.0
  (0, 39)	1.0
  (0, 40)	1.0
  (0, 17)	1.0
  (0, 26)	1.0
  (0, 4)	1.0
  (0, 27)	1.0
  (0, 28)	1.0
  (0, 0)	1.0
  (0, 20)	1.0
  (0, 36)	1.0


In [295]:
fitRclf.outlier_label_

[1]