In [2]:
from sklearn import neighbors
import numpy as np

data = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# default n_neighbors = 5, it has to be >= data.length
nbrs = neighbors.NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(data)
distances_to_data, indicesOf_data = nbrs.kneighbors(data)

# length of each row of indicesOf_data is equal to n_neighbors
print(indicesOf_data)  # indices of data. (i.e: if indices is [1 0 3], it means data[1], data[0] and data[3])
# length of each row of distances_to_data is equal to n_neighbors
print(distances_to_data)  # distance to each point. i.e: if indices is [1 0 3], it means:
# [{distance to data[1]}, {distance to data[0]}, {distance to data[3]}])


[[0 1 2]
 [1 0 2]
 [2 1 0]
 [3 4 5]
 [4 3 5]
 [5 4 3]]
[[0.         1.         2.23606798]
 [0.         1.         1.41421356]
 [0.         1.41421356 2.23606798]
 [0.         1.         2.23606798]
 [0.         1.         1.41421356]
 [0.         1.41421356 2.23606798]]


In [3]:
print(nbrs.kneighbors_graph(data))  # (A, B) $(A and B are neighbors or not)
# i.e: there is a row: (2, 1) 1.0 -> are data[2] and data[1] neighbors or not? 1 means yes, 0 means no
# (or return the possibility from 0.0 to 1.0)

  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (1, 1)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (2, 1)	1.0
  (2, 0)	1.0
  (3, 3)	1.0
  (3, 4)	1.0
  (3, 5)	1.0
  (4, 4)	1.0
  (4, 3)	1.0
  (4, 5)	1.0
  (5, 5)	1.0
  (5, 4)	1.0
  (5, 3)	1.0


In [4]:
# nbrs can use KD Tree rather than ball tree, which looks like this:
kdt = neighbors.KDTree(data,
                       leaf_size=30,  # only effects performance, won't affect result.
                       metric='euclidean')
print(kdt.query(data,
                k=6,  # number of neighbors, can be at most data.length
                return_distance=False))

[[0 1 2 3 4 5]
 [1 0 2 3 4 5]
 [2 1 0 3 4 5]
 [3 4 5 0 1 2]
 [4 3 5 0 1 2]
 [5 4 3 0 1 2]]


In [5]:
nc = neighbors.NearestCentroid()
from sklearn import svm

svc = svm.SVC()

data2 = np.array([np.arange(0, 9), np.arange(0, 9)])
data2 = data2.T
target2 = np.arange(0, 9)

nc.fit(data2, target2)
svc.fit(data2, target2)

print(nc.predict([[100000, -100]]))
print(svc.predict([[100000, -100]]))

[8]
[8]


In [6]:
knt_d = neighbors.KNeighborsTransformer(n_neighbors=1, mode='distance')
# prints: (point A, point B) $(distance)
print(knt_d.fit_transform(data2, target2))  # either (point itself, point itself), or (point A, point B)

  (0, 0)	0.0
  (0, 1)	1.4142135623730951
  (1, 1)	0.0
  (1, 0)	1.4142135623730951
  (2, 2)	0.0
  (2, 1)	1.4142135623730951
  (3, 3)	0.0
  (3, 2)	1.4142135623730951
  (4, 4)	0.0
  (4, 3)	1.4142135623730951
  (5, 5)	0.0
  (5, 4)	1.4142135623730951
  (6, 6)	0.0
  (6, 5)	1.4142135623730951
  (7, 7)	0.0
  (7, 6)	1.4142135623730951
  (8, 8)	0.0
  (8, 7)	1.4142135623730951


In [7]:
# prints: (point A, point B) $(is neighbor or not: 1 == yes, 0 == no). Just like the kneighbors_graph
knt_c = neighbors.KNeighborsTransformer(n_neighbors=1, mode='connectivity')
print(knt_c.fit_transform(data2, target2))

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (8, 8)	1.0


In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.manifold import Isomap

knt_d_i = make_pipeline(
    knt_d,
    # Isomap is a 降维 method, 用来提速
    Isomap(neighbors_algorithm='brute'),
    memory='./cache')

print(knt_d_i.fit_transform(data2, target2))

[[ 0.93530067  1.13984386]
 [-0.90978582  2.08680997]
 [-1.96860234 -1.74918355]
 [ 1.41836969 -0.46892847]
 [-1.20194288  0.37681371]
 [-0.10996824 -0.20864408]
 [-0.10996824 -0.20864408]
 [-0.10996824 -0.20864408]
 [ 2.05656538 -0.75942328]]


In [9]:
knt_c_i = make_pipeline(
    knt_c,
    # Isomap is a 降维 method, 用来提速
    Isomap(neighbors_algorithm='brute'),
    memory='./cache')

# 可得出来的是个什么鬼？？？
print(knt_c_i.fit_transform(data2, target2))

[[ 2.31503177e-02  8.05822628e-17]
 [-9.54463712e-01  1.41421356e+00]
 [-9.54463712e-01 -1.41421356e+00]
 [ 1.09874981e+00 -3.60822483e-16]
 [-8.73164628e-01 -5.55111512e-17]
 [ 2.31503177e-02  1.66533454e-16]
 [ 2.31503177e-02 -3.88578059e-16]
 [ 2.31503177e-02  5.55111512e-17]
 [ 1.59074097e+00 -1.66533454e-16]]


In [11]:

from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data_i, targeti = load_iris(return_X_y=True)
data_train, data_test, target_train, target_test = train_test_split(data_i, targeti, random_state=42,
                                                                    stratify=targeti, test_size=0.7)

nca = neighbors.NeighborhoodComponentsAnalysis(random_state=42)
knn = neighbors.KNeighborsClassifier(n_neighbors=3)

# 重新起一个，不然会打脸的 - -||
# nca_knn = Pipeline([('nca', nca), ('knn', knn)])

nca_knn = Pipeline([('nca', neighbors.NeighborhoodComponentsAnalysis(random_state=42)),
                    ('knn', neighbors.KNeighborsClassifier(n_neighbors=3))])
nca_knn.fit(data_train, target_train)
# nca_knn.fit(data2, target2)
knn.fit(data_train, target_train)
nc.fit(data_train, target_train)
svc.fit(data_train, target_train)

from sklearn.naive_bayes import GaussianNB
bay = GaussianNB()

bay.fit(data_train, target_train)

from sklearn import tree

dt_clf = tree.DecisionTreeClassifier()

dt_clf.fit(data_train, target_train)

dt_reg = tree.DecisionTreeRegressor()  # 极端不稳定
dt_reg.fit(data_train, target_train)


from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
# ensemble means "vote"
from sklearn.neighbors import KNeighborsClassifier

bagging = BaggingClassifier(max_samples=0.5,  # half rows
                            max_features=0.5,  # half columns
                            base_estimator=KNeighborsClassifier())

# better than Bagging: can define how many estimators, rather than unknown number of estimators
# worse than Bagging: cannot define the base_estimator

rndForest = RandomForestClassifier(n_estimators=10)

exTree = ExtraTreesClassifier(max_samples=0.5,  # half rows
                              max_features=0.5,  # half columns
                              n_estimators=10)

bagging.fit(data_train, target_train)
rndForest.fit(data_train, target_train)
exTree.fit(data_train, target_train)

print(nca_knn.score(data_test, target_test))
print(knn.score(data_test, target_test))
print(nc.score(data_test, target_test))
print(svc.score(data_test, target_test))
print(bay.score(data_test, target_test))
print(dt_clf.score(data_test, target_test))
print(dt_reg.score(data_test, target_test))
print(bagging.score(data_test, target_test))
print(rndForest.score(data_test, target_test))
print(exTree.score(data_test, target_test))


0.9619047619047619
0.9333333333333333
0.9238095238095239
0.9714285714285714
0.9523809523809523
0.9142857142857143
0.8714285714285714
0.9619047619047619
0.9047619047619048
0.9428571428571428
