In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score  
from sklearn.metrics import recall_score  
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

## get the X_date, Y_target

In [2]:
def get_the_predicted_bugs(name, num1, num2):
    All_Data = pd.read_csv(name, usecols = [i for i in range(num1, num2)]).values
    return All_Data

In [3]:
data = get_the_predicted_bugs('modified_all_data.csv', 2, 45)
data

array([[ 0.,  0.,  0., ..., 10.,  7.,  0.],
       [ 0.,  0.,  0., ...,  5.,  2.,  0.],
       [ 1.,  1.,  0., ..., 12.,  9.,  0.],
       ...,
       [ 1.,  1.,  0., ..., 33., 19.,  0.],
       [ 1.,  0.,  0., ...,  6.,  3.,  0.],
       [ 1.,  1.,  0., ...,  6.,  5.,  0.]])

## K-nearest neighbors

## best accuracy_score  

In [4]:
data_random = np.random.permutation(data)
x = data_random[:, [i for i in range(data.shape[1] - 1)]]
y = data_random[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
knn_clf = KNeighborsClassifier()

In [5]:
param_grid = [
         {
             'weights':['uniform'],
             'n_neighbors':[i for i in range(1,11)]
         },
         {
             'weights':['distance'],
             'n_neighbors':[i for i in range(1,11)],
             'p':[i for i in range(1,6)]
         }
             ]
grid_search = GridSearchCV(knn_clf, param_grid, scoring = 'accuracy', n_jobs = -1, cv = 3)

In [6]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [7]:
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [8]:
grid_search.best_score_

0.8641425389755011

In [9]:
grid_search.best_params_

{'n_neighbors': 10, 'weights': 'uniform'}

In [10]:
knn_clf = grid_search.best_estimator_
y_pre = knn_clf.predict(x_test)
accuracy_score(y_test, y_pre), recall_score(y_test, y_pre)

(0.8547579298831386, 0.07058823529411765)

In [11]:
accuracy_score(y_test, [0] * len(y_test)), recall_score(y_test, [0] * len(y_test))

(0.8580968280467446, 0.0)

By K-nearest neighbors, the best accuracy_score we can get is about 0.855.

## best recall_score  

In [12]:
data_random = np.random.permutation(data)
x = data_random[:, [i for i in range(data.shape[1] - 1)]]
y = data_random[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
knn_clf = KNeighborsClassifier()

In [13]:
param_grid = [
         {
             'weights':['uniform'],
             'n_neighbors':[i for i in range(1,11)]
         },
         {
             'weights':['distance'],
             'n_neighbors':[i for i in range(1,11)],
             'p':[i for i in range(1,6)]
         }
             ]
grid_search = GridSearchCV(knn_clf, param_grid, scoring = 'recall', n_jobs = -1, cv = 3)

In [14]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'weights': ['uniform'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {'weights': ['distance'], 'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'p': [1, 2, 3, 4, 5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [15]:
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=1,
           weights='distance')

In [16]:
grid_search.best_score_

0.26019881579662124

In [17]:
grid_search.best_params_

{'n_neighbors': 1, 'p': 1, 'weights': 'distance'}

In [18]:
knn_clf = grid_search.best_estimator_
y_pre = knn_clf.predict(x_test)
accuracy_score(y_test, y_pre), recall_score(y_test, y_pre)

(0.8013355592654424, 0.2441860465116279)

In [19]:
accuracy_score(y_test, [0] * len(y_test)), recall_score(y_test, [0] * len(y_test))

(0.8564273789649416, 0.0)

By K-nearest neighbors, the best recall_score we can get is about 0.244.

By K-nearest neighbors, both the accuracy_score and the recall_score is not good enough.