### import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier

### read data

In [3]:
# Only the first N_ROWS rows are used, because fitting on the full dataset is
# too slow/heavy for this machine. Reading just those rows (instead of loading
# the whole file and slicing afterwards) keeps memory use low as well.
N_ROWS = 100  # set to None to load the full dataset
bioresp = pd.read_csv('bioresponse.csv', header=0, sep=',', nrows=N_ROWS)
bioresp.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Sanity check: (rows, columns) of the truncated frame.
bioresp.shape

(100, 1777)

### break data into train-test

In [6]:
# Target vector: the `Activity` column as a NumPy array.
y = bioresp['Activity'].values
# Feature matrix: every column after the first one (the first column is the
# target, `Activity`, per the frame preview above).
X = bioresp.iloc[:, 1:]

# Hold out 20% of the rows; the seed keeps the split reproducible.
# NOTE(review): this split is not stratified (unlike `cv` below), and
# X_test / y_test are not used in the cells shown here -- consider
# `stratify=y` and a final hold-out evaluation of the tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=154,
)

# Cross-validation scheme shared by both searches: 4 stratified random
# 80/20 re-splits of the training data.
cv = StratifiedShuffleSplit(
    n_splits=4,
    test_size=0.2,
    random_state=42,
)

### use KNeighborsClassifier

In [7]:
# Base estimator with default hyperparameters; the searches below tune it.
classifier = KNeighborsClassifier()

In [8]:
# List the estimator's hyperparameters (and their defaults) to decide what to tune.
classifier.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

#### grid

In [9]:
# Search space: 4 * 1 * 1 * 10 * 4 = 160 hyperparameter combinations.
# NOTE(review): `leaf_size` is documented as a speed/memory setting of the
# tree-based neighbour search -- confirm it can affect accuracy at all before
# spending a 10x larger grid on it.
parameters_grid = dict(
    n_neighbors=range(3, 7),   # number of neighbours: 3..6
    weights=['uniform'],
    algorithm=['auto'],
    leaf_size=range(25, 35),   # 25..34
    p=range(3, 7),             # Minkowski power parameter: 3..6
)

### set up grid

In [10]:
# Exhaustive search: every combination in `parameters_grid` is scored by
# accuracy on each of the splits produced by `cv`.
grid_cv = GridSearchCV(
    estimator=classifier,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=cv,
)

### fit model

In [11]:
%%time
# Run the exhaustive search on the training portion only; the cell is timed
# so it can be compared with the randomized search further down.
grid_cv.fit(X_train, y_train)

Wall time: 3min 56s


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=4, random_state=42, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'leaf_size': range(25, 35), 'algorithm': ['auto'], 'weights': ['uniform'], 'p': range(3, 7), 'n_neighbors': range(3, 7)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

### get results

In [12]:
# Print every evaluated combination with the mean and standard deviation of
# its cross-validated accuracy.
grid_results = grid_cv.cv_results_
for idx, candidate in enumerate(grid_results['params']):
    print(candidate)
    print('mean:', grid_results['mean_test_score'][idx])
    print('std:', grid_results['std_test_score'][idx], '\n')

{'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 25, 'p': 3, 'algorithm': 'auto'}
mean: 0.5625
std: 0.076546554462 

{'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 25, 'p': 4, 'algorithm': 'auto'}
mean: 0.515625
std: 0.102459976942 

{'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 25, 'p': 5, 'algorithm': 'auto'}
mean: 0.515625
std: 0.102459976942 

{'weights': 'uniform', 'n_neighbors': 3, 'leaf_size': 25, 'p': 6, 'algorithm': 'auto'}
mean: 0.515625
std: 0.102459976942 

{'weights': 'uniform', 'n_neighbors': 4, 'leaf_size': 25, 'p': 3, 'algorithm': 'auto'}
mean: 0.53125
std: 0.103644524699 

{'weights': 'uniform', 'n_neighbors': 4, 'leaf_size': 25, 'p': 4, 'algorithm': 'auto'}
mean: 0.53125
std: 0.103644524699 

{'weights': 'uniform', 'n_neighbors': 4, 'leaf_size': 25, 'p': 5, 'algorithm': 'auto'}
mean: 0.53125
std: 0.103644524699 

{'weights': 'uniform', 'n_neighbors': 4, 'leaf_size': 25, 'p': 6, 'algorithm': 'auto'}
mean: 0.53125
std: 0.103644524699 

{'weights': 'u

### best results(parameters)

In [13]:
# Best configuration found by the exhaustive search and its mean CV accuracy.
print(grid_cv.best_estimator_, grid_cv.best_score_, sep='\n')

KNeighborsClassifier(algorithm='auto', leaf_size=25, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=3,
           weights='uniform')
0.5625


### the same but with randomCV

In [14]:
# Same 160-combination search space as used for the exhaustive grid search,
# repeated here so both searches are compared on identical candidates.
parameters_grid = dict(
    n_neighbors=range(3, 7),   # number of neighbours: 3..6
    weights=['uniform'],
    algorithm=['auto'],
    leaf_size=range(25, 35),   # 25..34
    p=range(3, 7),             # Minkowski power parameter: 3..6
)

In [15]:
# Randomized search: instead of trying all combinations, sample only
# `n_iter` = 5 of them; the seed makes the sampled candidates reproducible.
randomized_grid_cv = RandomizedSearchCV(
    estimator=classifier,
    param_distributions=parameters_grid,
    n_iter=5,
    scoring='accuracy',
    cv=cv,
    random_state=42,
)

In [16]:
%%time
# Run the randomized search on the same training data and CV splits as the
# exhaustive search; timed for comparison.
randomized_grid_cv.fit(X_train, y_train)

Wall time: 7.99 s


RandomizedSearchCV(cv=StratifiedShuffleSplit(n_splits=4, random_state=42, test_size=0.2,
            train_size=None),
          error_score='raise',
          estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
          fit_params={}, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'leaf_size': range(25, 35), 'algorithm': ['auto'], 'weights': ['uniform'], 'p': range(3, 7), 'n_neighbors': range(3, 7)},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='accuracy', verbose=0)

#### results

In [17]:
# Print each sampled combination with the mean and standard deviation of its
# cross-validated accuracy.
random_results = randomized_grid_cv.cv_results_
for idx, candidate in enumerate(random_results['params']):
    print(candidate)
    print('mean:', random_results['mean_test_score'][idx])
    print('std:', random_results['std_test_score'][idx], '\n')

{'leaf_size': 31, 'algorithm': 'auto', 'n_neighbors': 5, 'weights': 'uniform', 'p': 4}
mean: 0.546875
std: 0.12001790231 

{'leaf_size': 31, 'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'uniform', 'p': 3}
mean: 0.53125
std: 0.128847050801 

{'leaf_size': 33, 'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'uniform', 'p': 4}
mean: 0.53125
std: 0.09375 

{'leaf_size': 28, 'algorithm': 'auto', 'n_neighbors': 4, 'weights': 'uniform', 'p': 6}
mean: 0.53125
std: 0.103644524699 

{'leaf_size': 30, 'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'uniform', 'p': 5}
mean: 0.546875
std: 0.111584819196 



#### best results(parameters)

In [18]:
# Best configuration among the sampled candidates and its mean CV accuracy.
print(randomized_grid_cv.best_estimator_, randomized_grid_cv.best_score_, sep='\n')

KNeighborsClassifier(algorithm='auto', leaf_size=31, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=4,
           weights='uniform')
0.546875


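Summary: as we can see, both approaches — the exhaustive grid search and the randomized search — give approximately equal results (best cross-validated accuracy 0.5625 vs. 0.546875). For large datasets such as this one, however, the randomized search is preferable: it evaluates only a random sample of points drawn uniformly from the parameter grid instead of every combination, which noticeably reduces the search time (about 8 s vs. about 4 min here).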