In [1]:
from io import StringIO
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydotplus
from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Read and understand the data (teleCust1000t.csv)

In [2]:
# load teleCust1000t.csv and preview its first five rows

df = pd.read_csv('teleCust1000t.csv')
df.head()

Unnamed: 0,region,tenure,age,marital,address,income,ed,employ,retire,gender,reside,custcat
0,2,13,44,1,9,64.0,4,5,0.0,0,2,1
1,3,11,33,1,7,136.0,5,5,0.0,0,6,4
2,3,68,52,1,24,116.0,1,29,0.0,1,2,3
3,2,33,33,0,12,33.0,2,0,0.0,1,1,1
4,2,23,30,1,9,30.0,1,2,0.0,0,4,3


In [3]:
# class balance: how many rows fall under each custcat value

df.custcat.value_counts()

3    281
1    266
4    236
2    217
Name: custcat, dtype: int64

In [4]:
# feature matrix: every column except the last one

x = df.iloc[:, :-1].values
x[:5]

array([[  2.,  13.,  44.,   1.,   9.,  64.,   4.,   5.,   0.,   0.,   2.],
       [  3.,  11.,  33.,   1.,   7., 136.,   5.,   5.,   0.,   0.,   6.],
       [  3.,  68.,  52.,   1.,  24., 116.,   1.,  29.,   0.,   1.,   2.],
       [  2.,  33.,  33.,   0.,  12.,  33.,   2.,   0.,   0.,   1.,   1.],
       [  2.,  23.,  30.,   1.,   9.,  30.,   1.,   2.,   0.,   0.,   4.]])

In [5]:
# normalize x

# The raw feature matrix is rebuilt from df on every run instead of
# standardising x in place. With `x = scaler.transform(x)`, re-running this
# cell would refit the scaler on already-scaled values, and the refitted
# scaler would then mis-scale any raw sample passed to it afterwards.
# NOTE(review): the scaler is fitted on all rows, including the ones that are
# held out for testing further down; fitting it on the training split only
# would keep test-set statistics out of the preprocessing.
raw_x = df[df.columns[:-1]].values
scaler = preprocessing.StandardScaler().fit(raw_x)
x = scaler.transform(raw_x)
x[:5]

array([[-0.02696767, -1.055125  ,  0.18450456,  1.0100505 , -0.25303431,
        -0.12650641,  1.0877526 , -0.5941226 , -0.22207644, -1.03459817,
        -0.23065004],
       [ 1.19883553, -1.14880563, -0.69181243,  1.0100505 , -0.4514148 ,
         0.54644972,  1.9062271 , -0.5941226 , -0.22207644, -1.03459817,
         2.55666158],
       [ 1.19883553,  1.52109247,  0.82182601,  1.0100505 ,  1.23481934,
         0.35951747, -1.36767088,  1.78752803, -0.22207644,  0.96655883,
        -0.23065004],
       [-0.02696767, -0.11831864, -0.69181243, -0.9900495 ,  0.04453642,
        -0.41625141, -0.54919639, -1.09029981, -0.22207644,  0.96655883,
        -0.92747794],
       [-0.02696767, -0.58672182, -0.93080797,  1.0100505 , -0.25303431,
        -0.44429125, -1.36767088, -0.89182893, -0.22207644, -1.03459817,
         1.16300577]])

In [6]:
# target vector: the last column

y = df.iloc[:, -1].values
y[:5]

array([1, 4, 3, 1, 3], dtype=int64)

In [7]:
# separate into train and test sets (20% held out, fixed seed)

train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)
(train_x.shape, test_x.shape)

((800, 11), (200, 11))

## KNN Model

In [8]:
# fit a KNN classifier and compare train vs. test accuracy

k = 4
model = KNeighborsClassifier(n_neighbors=k)
model.fit(train_x, train_y)

train_y_, test_y_ = model.predict(train_x), model.predict(test_x)

(
    metrics.accuracy_score(train_y, train_y_),
    metrics.accuracy_score(test_y, test_y_),
)

(0.5475, 0.32)

In [9]:
# find the best k:
#
# For each candidate k, fit on the training split and report the accuracy on
# the test split. The value printed under "std accuracy" is the standard
# error of that accuracy estimate: the std of the per-sample hit indicator
# divided by sqrt(number of test samples).
# NOTE(review): k is compared on the test split here, so the score of the
# winning k is an optimistic estimate; a validation split or cross-validation
# on the training data would leave the test split untouched.

for k in range(1, 20):
    model = KNeighborsClassifier(k).fit(train_x, train_y)
    # Only test predictions are reported, so the training set is no longer
    # re-predicted (and discarded) on every iteration.
    test_y_ = model.predict(test_x)
    accuracy = metrics.accuracy_score(test_y, test_y_)
    std_error = np.std(test_y == test_y_) / np.sqrt(test_y.shape[0])
    print(f'k = {k:2}: mean accuracy: {accuracy:.3f}, std accuracy: {std_error:.5f}')

k =  1: mean accuracy: 0.300, std accuracy: 0.03240
k =  2: mean accuracy: 0.290, std accuracy: 0.03209
k =  3: mean accuracy: 0.315, std accuracy: 0.03285
k =  4: mean accuracy: 0.320, std accuracy: 0.03298
k =  5: mean accuracy: 0.315, std accuracy: 0.03285
k =  6: mean accuracy: 0.310, std accuracy: 0.03270
k =  7: mean accuracy: 0.335, std accuracy: 0.03337
k =  8: mean accuracy: 0.325, std accuracy: 0.03312
k =  9: mean accuracy: 0.340, std accuracy: 0.03350
k = 10: mean accuracy: 0.330, std accuracy: 0.03325
k = 11: mean accuracy: 0.315, std accuracy: 0.03285
k = 12: mean accuracy: 0.340, std accuracy: 0.03350
k = 13: mean accuracy: 0.330, std accuracy: 0.03325
k = 14: mean accuracy: 0.315, std accuracy: 0.03285
k = 15: mean accuracy: 0.340, std accuracy: 0.03350
k = 16: mean accuracy: 0.360, std accuracy: 0.03394
k = 17: mean accuracy: 0.355, std accuracy: 0.03384
k = 18: mean accuracy: 0.350, std accuracy: 0.03373
k = 19: mean accuracy: 0.345, std accuracy: 0.03361


In [10]:
# classify one new sample with the k = 16 model

# the raw feature values go through the already-fitted scaler first
nx = scaler.transform([[4, 18, 20, 0, 8, 136, 6, 6, 0, 1, 8]])

model = KNeighborsClassifier(n_neighbors=16)
model.fit(train_x, train_y)
model.predict(nx)

array([4], dtype=int64)

## Read and understand the data (drug200.csv)

In [36]:
# load drug200.csv and preview its first five rows

df = pd.read_csv('drug200.csv')
df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [37]:
# class balance: how many rows fall under each Drug value

df.Drug.value_counts()

drugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: Drug, dtype: int64

In [38]:
# feature matrix: every column except the last one

x = df.iloc[:, :-1].values
x[:5]

array([[23, 'F', 'HIGH', 'HIGH', 25.355],
       [47, 'M', 'LOW', 'HIGH', 13.093],
       [47, 'M', 'LOW', 'HIGH', 10.114],
       [28, 'F', 'NORMAL', 'HIGH', 7.798],
       [61, 'F', 'LOW', 'HIGH', 18.043]], dtype=object)

In [39]:
# encode qualitative columns

# One LabelEncoder per categorical column; each maps the column's distinct
# string values to integer codes.
sexEncode = preprocessing.LabelEncoder().fit(df['Sex'].unique())
bpEncode = preprocessing.LabelEncoder().fit(df['BP'].unique())
cholestrolEncode = preprocessing.LabelEncoder().fit(df['Cholesterol'].unique())

# Encode from the untouched df columns rather than from x itself: x is
# overwritten in place, so transforming x[:, i] a second time would feed the
# encoders integers they have never seen and raise an error on re-run.
x[:, 1] = sexEncode.transform(df['Sex'])
x[:, 2] = bpEncode.transform(df['BP'])
x[:, 3] = cholestrolEncode.transform(df['Cholesterol'])
x[:5]

array([[23, 0, 0, 0, 25.355],
       [47, 1, 1, 0, 13.093],
       [47, 1, 1, 0, 10.114],
       [28, 0, 2, 0, 7.798],
       [61, 0, 1, 0, 18.043]], dtype=object)

In [40]:
# target vector: the last column

y = df.iloc[:, -1].values
y[:5]

array(['drugY', 'drugC', 'drugC', 'drugX', 'drugY'], dtype=object)

In [41]:
# separate into train and test sets (20% held out, fixed seed)

train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=4,
)
(train_x.shape, test_x.shape)

((160, 5), (40, 5))

## Decision Tree Model

In [42]:
# create and test model

# random_state is pinned (same seed as the train/test split) so that the
# fitted tree, and therefore the accuracies below, are reproducible: without
# it scikit-learn may break ties between equally good splits differently
# from one run to the next.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=4,
    random_state=4,
).fit(train_x, train_y)

train_y_ = model.predict(train_x)
test_y_ = model.predict(test_x)

# (train accuracy, test accuracy)
metrics.accuracy_score(train_y, train_y_), metrics.accuracy_score(test_y, test_y_)

(1.0, 0.95)

In [51]:
# classify one new sample with the fitted tree

nx = [[28, 'M', 'HIGH', 'HIGH', 20.355]]

# swap the three categorical fields (positions 1-3) for their integer codes,
# using the same encoders that were applied to the training features
nx[0][1:4] = [
    sexEncode.transform(nx[0][1:2])[0],
    bpEncode.transform(nx[0][2:3])[0],
    cholestrolEncode.transform(nx[0][3:4])[0],
]

model.predict(nx)

array(['drugY'], dtype=object)