In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Read and Understand data

In [2]:
# Load the customer table from the CSV file and preview its first five rows.

df = pd.read_csv('teleCust1000t.csv')
df.head()

Unnamed: 0,region,tenure,age,marital,address,income,ed,employ,retire,gender,reside,custcat
0,2,13,44,1,9,64.0,4,5,0.0,0,2,1
1,3,11,33,1,7,136.0,5,5,0.0,0,6,4
2,3,68,52,1,24,116.0,1,29,0.0,1,2,3
3,2,33,33,0,12,33.0,2,0,0.0,1,1,1
4,2,23,30,1,9,30.0,1,2,0.0,0,4,3


In [3]:
# Class balance: count how many rows fall into each `custcat` value
# (`custcat` is the last column, which is used below as the label `y`).

df['custcat'].value_counts()

3    281
1    266
4    236
2    217
Name: custcat, dtype: int64

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column,
# useful for spotting differences in scale between the features.

df.describe()

Unnamed: 0,region,tenure,age,marital,address,income,ed,employ,retire,gender,reside,custcat
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,2.022,35.526,41.684,0.495,11.551,77.535,2.671,10.987,0.047,0.517,2.331,2.487
std,0.8162,21.359812,12.558816,0.500225,10.086681,107.044165,1.222397,10.082087,0.211745,0.499961,1.435793,1.120306
min,1.0,1.0,18.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,1.0,1.0
25%,1.0,17.0,32.0,0.0,3.0,29.0,2.0,3.0,0.0,0.0,1.0,1.0
50%,2.0,34.0,40.0,0.0,9.0,47.0,3.0,8.0,0.0,1.0,2.0,3.0
75%,3.0,54.0,51.0,1.0,18.0,83.0,4.0,17.0,0.0,1.0,3.0,3.0
max,3.0,72.0,77.0,1.0,55.0,1668.0,5.0,47.0,1.0,1.0,8.0,4.0


In [5]:
# Feature matrix: every column except the last one, as a NumPy array.

x = df.iloc[:, :-1].to_numpy()
x[:5]

array([[  2.,  13.,  44.,   1.,   9.,  64.,   4.,   5.,   0.,   0.,   2.],
       [  3.,  11.,  33.,   1.,   7., 136.,   5.,   5.,   0.,   0.,   6.],
       [  3.,  68.,  52.,   1.,  24., 116.,   1.,  29.,   0.,   1.,   2.],
       [  2.,  33.,  33.,   0.,  12.,  33.,   2.,   0.,   0.,   1.,   1.],
       [  2.,  23.,  30.,   1.,   9.,  30.,   1.,   2.,   0.,   0.,   4.]])

In [6]:
# Standardize the features (StandardScaler: zero mean, unit variance per column).
#
# The raw matrix is rebuilt from `df` here instead of being read from `x`, so
# this cell is idempotent: re-running it refits the scaler on the original
# values rather than on output that was already scaled, and `scaler` stays
# valid for transforming new raw samples later in the notebook.
# NOTE: the scaler is fit on every row, including the rows that are later held
# out as the test set, so test-set statistics influence the scaling.

x_raw = df[df.columns[:-1]].values
scaler = preprocessing.StandardScaler().fit(x_raw)
x = scaler.transform(x_raw)
x[:5]

array([[-0.02696767, -1.055125  ,  0.18450456,  1.0100505 , -0.25303431,
        -0.12650641,  1.0877526 , -0.5941226 , -0.22207644, -1.03459817,
        -0.23065004],
       [ 1.19883553, -1.14880563, -0.69181243,  1.0100505 , -0.4514148 ,
         0.54644972,  1.9062271 , -0.5941226 , -0.22207644, -1.03459817,
         2.55666158],
       [ 1.19883553,  1.52109247,  0.82182601,  1.0100505 ,  1.23481934,
         0.35951747, -1.36767088,  1.78752803, -0.22207644,  0.96655883,
        -0.23065004],
       [-0.02696767, -0.11831864, -0.69181243, -0.9900495 ,  0.04453642,
        -0.41625141, -0.54919639, -1.09029981, -0.22207644,  0.96655883,
        -0.92747794],
       [-0.02696767, -0.58672182, -0.93080797,  1.0100505 , -0.25303431,
        -0.44429125, -1.36767088, -0.89182893, -0.22207644, -1.03459817,
         1.16300577]])

In [7]:
# Label vector: the last column of the table, as a NumPy array.

y = df.iloc[:, -1].values
y[:5]

array([1, 4, 3, 1, 3], dtype=int64)

In [8]:
# Separate the data into train and test sets: 20% of the rows are held out,
# with a fixed random_state so the split is the same on every run.

train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)
train_x.shape, test_x.shape

((800, 11), (200, 11))

## KNN Model

In [9]:
# Fit a k-nearest-neighbours classifier with k = 4 on the training split and
# keep its predictions on the training rows for the accuracy check below.

k = 4
model = KNeighborsClassifier(n_neighbors=k)
model.fit(train_x, train_y)
train_y_ = model.predict(train_x)

In [10]:
# Predict the held-out rows, then show (train accuracy, test accuracy).

test_y_ = model.predict(test_x)
tuple(
    metrics.accuracy_score(actual, predicted)
    for actual, predicted in ((train_y, train_y_), (test_y, test_y_))
)

(0.5475, 0.32)

In [11]:
# Find the best k: compare test-set accuracy for k = 1..19.
#
# The sweep runs inside a helper so that it does not overwrite the notebook
# variables `k`, `model`, `train_y_` and `test_y_` set by the cells above;
# re-running those cells afterwards still refers to the k = 4 model.
# NOTE: the candidates are ranked on the same test split that is used to
# report accuracy, so the best score printed here is an optimistic estimate.

def _report_k_sweep(k_values):
    """Fit one KNN model per k on the training split and print its test accuracy.

    For each k the line shows the accuracy on `test_x`/`test_y` and, under the
    "std accuracy" label, the standard error of that accuracy
    (std of the per-row hit indicator divided by sqrt(number of test rows)).
    """
    sqrt_n_test = np.sqrt(test_y.shape[0])  # loop-invariant, computed once
    for candidate_k in k_values:
        candidate = KNeighborsClassifier(n_neighbors=candidate_k).fit(train_x, train_y)
        predicted = candidate.predict(test_x)
        accuracy = metrics.accuracy_score(test_y, predicted)
        std_error = np.std(test_y == predicted) / sqrt_n_test
        print(f'k = {candidate_k:2}: mean accuracy: {accuracy:.3f}, std accuracy: {std_error:.5f}')


_report_k_sweep(range(1, 20))

k =  1: mean accuracy: 0.300, std accuracy: 0.03240
k =  2: mean accuracy: 0.290, std accuracy: 0.03209
k =  3: mean accuracy: 0.315, std accuracy: 0.03285
k =  4: mean accuracy: 0.320, std accuracy: 0.03298
k =  5: mean accuracy: 0.315, std accuracy: 0.03285
k =  6: mean accuracy: 0.310, std accuracy: 0.03270
k =  7: mean accuracy: 0.335, std accuracy: 0.03337
k =  8: mean accuracy: 0.325, std accuracy: 0.03312
k =  9: mean accuracy: 0.340, std accuracy: 0.03350
k = 10: mean accuracy: 0.330, std accuracy: 0.03325
k = 11: mean accuracy: 0.315, std accuracy: 0.03285
k = 12: mean accuracy: 0.340, std accuracy: 0.03350
k = 13: mean accuracy: 0.330, std accuracy: 0.03325
k = 14: mean accuracy: 0.315, std accuracy: 0.03285
k = 15: mean accuracy: 0.340, std accuracy: 0.03350
k = 16: mean accuracy: 0.360, std accuracy: 0.03394
k = 17: mean accuracy: 0.355, std accuracy: 0.03384
k = 18: mean accuracy: 0.350, std accuracy: 0.03373
k = 19: mean accuracy: 0.345, std accuracy: 0.03361


In [12]:
# Predict the class of one new customer with k = 16, the k with the highest
# test accuracy (0.360) in the sweep above.
#
# The raw sample is scaled with the same fitted `scaler` as the training data.
# NOTE(review): the first value (region = 4) and the seventh (ed = 6) are above
# the maxima shown by df.describe() (3 and 5) -- confirm the sample is intended.

nx = scaler.transform([[4, 18, 20, 0, 8, 136, 6, 6, 0, 1, 8]])

model = KNeighborsClassifier(n_neighbors=16)
model.fit(train_x, train_y)
model.predict(nx)

array([4], dtype=int64)