In [88]:
import csv
import pandas as pd
import numpy as np

In [89]:
# Read data
df = pd.read_csv('mobile.csv', encoding="cp949")
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [90]:
# check data's shape
print('shape:',df.shape)

shape: (2000, 21)


In [91]:
# set target column
target = 'price_range'

# Separate label
X = df.drop(columns=[target])
y = df[target]

In [92]:
from sklearn.model_selection import train_test_split

# make test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1, stratify=y)

In [93]:
#Print training & test data proporttion
print("X_train set--------------------")
print("Shape:",X_train.shape)
print("Target:")
print(y_train.value_counts())
print()
      
print("X_test set info-----------------")
print("Shape:",X_test.shape)
print("Target:")
print(y_test.value_counts())
print()

X_train set--------------------
Shape: (1600, 20)
Target:
3    400
2    400
1    400
0    400
Name: price_range, dtype: int64

X_test set info-----------------
Shape: (400, 20)
Target:
3    100
2    100
1    100
0    100
Name: price_range, dtype: int64



In [94]:
from sklearn.neighbors import KNeighborsClassifier

#Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)

# Traing the data
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [101]:
# run model to predict
knn.predict(X_test[0:10])

array([0, 2, 2, 1, 2, 1, 3, 3, 2, 0], dtype=int64)

In [96]:
# check accuracy of test data (Hold-out method)
print('accuracy (hold-out):',knn.score(X_test, y_test))

accuracy (hold-out): 0.91


In [97]:
from sklearn.model_selection import cross_val_score

# train model with cv of 5
cv_scores = cross_val_score(knn, X, y, cv=5)

In [104]:
# check accuracy of test data (k-fold cross validation method)
print(cv_scores)
print('accuracy (cv_score_5_fold):', np.mean(cv_scores))

[0.92   0.92   0.9325 0.9175 0.9075]
accuracy (cv_score_5_fold): 0.9195


In [105]:
# Hypertune the model
from sklearn.model_selection import GridSearchCV

# create new KNN model
knn2 = KNeighborsClassifier()

# Define Knn parameters's range
param_grid = {'n_neighbors':np.arange(1,25)}

# use grid search
knn_grid = GridSearchCV(knn2, param_grid, cv=5)

knn_grid.fit(X,y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [106]:
print('best parameter:',knn_grid.best_params_)
print('best socre:', knn_grid.best_score_)

best parameter: {'n_neighbors': 11}
best socre: 0.9345
