# Classification - K-Nearest Neighbors
Find the k nearest neighbors of a sample x and assign x the most frequent label (y) among those neighbors.

In [1]:
import seaborn as sns
import pandas as pd

In [6]:
# Load the 'tips' example dataset through seaborn and preview its first rows.
tip = sns.load_dataset('tips')
tip.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Preprocessing

Let's predict the party size (`size`), treated as a categorical label, from the other variables!

In [7]:
# Check for NaN: every column reports 244 non-null values out of 244 entries,
# so there are no missing values to handle.
tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


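The two numeric feature columns, `total_bill` and `tip`, will be standardized, because KNN is distance-based and therefore sensitive to the scale of each feature.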

In [28]:
# Standardization
from sklearn.preprocessing import StandardScaler

In [32]:
# Standardize the two numeric features so that neither dominates the distance
# computation used by KNN.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-row statistics leak into preprocessing - consider fitting on
# the training rows only.
scaler = StandardScaler()
numeric_cols = ['total_bill', 'tip']
# Fit with the named `scaler` (previously a second, throwaway instance was
# fitted and `scaler` stayed unused), so the fitted statistics remain available
# for transforming new data. Reusing tip's index guarantees that a later
# column-wise concat lines up row by row.
scaled = pd.DataFrame(
    scaler.fit_transform(tip[numeric_cols]),   # np.array
    columns=numeric_cols,
    index=tip.index,
)
scaled

Unnamed: 0,total_bill,tip
0,-0.314711,-1.439947
1,-1.063235,-0.969205
2,0.137780,0.363356
3,0.438315,0.225754
4,0.540745,0.443020
...,...,...
239,1.040511,2.115963
240,0.832275,-0.722971
241,0.324630,-0.722971
242,-0.221287,-0.904026


The remaining categorical columns will be one-hot encoded.

In [9]:
# One-hot encoding of the categorical columns
# Inspect the distinct values of each column before encoding
for column in ['sex', 'smoker', 'day', 'time']:
    print(tip[column].unique())

['Female', 'Male']
Categories (2, object): ['Female', 'Male']
['No', 'Yes']
Categories (2, object): ['No', 'Yes']
['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Sun', 'Sat', 'Thur', 'Fri']
['Dinner', 'Lunch']
Categories (2, object): ['Dinner', 'Lunch']


In [11]:
# Expand each categorical column into one indicator column per category
categorical_cols = ['sex', 'smoker', 'day', 'time']
dummy = pd.get_dummies(tip[categorical_cols])
dummy.head()

Unnamed: 0,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,0,1,0,1,0,0,0,1,0,1
1,1,0,0,1,0,0,0,1,0,1
2,1,0,0,1,0,0,0,1,0,1
3,1,0,0,1,0,0,0,1,0,1
4,0,1,0,1,0,0,0,1,0,1


Concatenate the two preprocessed parts back into a single dataframe.

In [33]:
# Place the standardized numeric columns and the dummy columns side by side
concated = pd.concat(objs=[scaled, dummy], axis='columns')
concated.head()

Unnamed: 0,total_bill,tip,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,-0.314711,-1.439947,0,1,0,1,0,0,0,1,0,1
1,-1.063235,-0.969205,1,0,0,1,0,0,0,1,0,1
2,0.13778,0.363356,1,0,0,1,0,0,0,1,0,1
3,0.438315,0.225754,1,0,0,1,0,0,0,1,0,1
4,0.540745,0.44302,0,1,0,1,0,0,0,1,0,1


## Fit Model

In [27]:
# Set x and y
# Features come from `concated` (scaled numerics + one-hot columns). The target
# `size` was never copied into that frame, so it is read from the original
# `tip` frame. The previously used `df` is not defined in any visible cell and
# raised a NameError on a fresh kernel.
feature_cols = ['total_bill', 'tip', 'sex_Male', 'sex_Female',
                'smoker_Yes', 'smoker_No', 'day_Thur', 'day_Fri', 'day_Sat', 'day_Sun',
                'time_Lunch', 'time_Dinner']
x = concated[feature_cols]
y = tip['size']
# Features and target must describe the same rows in the same order.
assert x.index.equals(y.index), "feature rows and target rows are misaligned"

In [34]:
# Split train/test data
from sklearn.model_selection import train_test_split

In [35]:
# Keep 85% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    x,
    y,
    train_size=0.85,
    random_state=5,
)
x_train, x_test, y_train, y_test = split_parts
x_train.shape, y_test.shape

((207, 12), (37,))

In [36]:
# Create model object
from sklearn.neighbors import KNeighborsClassifier

In [37]:
# Build a 5-nearest-neighbour classifier and train it in one chained call;
# fit() returns the estimator itself, so `knn` is the fitted model.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn

KNeighborsClassifier()

## Predict Unseen Data

In [40]:
# Predict labels for the held-out rows and preview the first ten
test_predictions = knn.predict(x_test)
test_predictions[:10]

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [41]:
# True labels of the same first ten test rows, for a side-by-side comparison
y_test.iloc[:10]

55     2
191    2
210    3
96     2
163    2
150    2
240    2
51     2
140    2
89     2
Name: size, dtype: int64

## Evaluation

In [42]:
# Get score - mean accuracy, i.e. the fraction of test rows whose predicted
# label equals the true label.
# NOTE: 30 of the 37 test rows belong to class 2 (see the support column of the
# classification report), so always predicting 2 would already score
# 30/37 ~= 0.81 - keep that baseline in mind when reading this number.
knn.score(x_test, y_test)

0.7297297297297297

Get more detail from the classification report and the confusion matrix.

In [43]:
from sklearn.metrics import classification_report, confusion_matrix

In [45]:
# Per-class precision / recall / F1 on the held-out rows.
# Some classes in y_test are never predicted, which leaves their precision
# undefined and made the previous call emit a warning per average.
# zero_division=0 reports those scores as 0 explicitly and silences the
# warnings; the printed numbers are unchanged.
y_pred = knn.predict(x_test)
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           2       0.84      0.90      0.87        30
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         4

    accuracy                           0.73        37
   macro avg       0.28      0.30      0.29        37
weighted avg       0.68      0.73      0.71        37



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


It seems the model predicts only parties of 2 people well; it does not correctly identify any party of 3 or 4.

In [46]:
# Rows are the true labels and columns the predicted labels
confusion_matrix(y_true=y_test, y_pred=knn.predict(x_test))

array([[27,  3,  0],
       [ 3,  0,  0],
       [ 2,  2,  0]], dtype=int64)