# KNN Model

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Load the obesity dataset from the CSV file that sits next to the notebook.
dataset = pd.read_csv(filepath_or_buffer='ObesityDataSet_raw_and_data_sinthetic.csv')


In [3]:
# Count the missing values in every column (isna is the canonical name of isnull).
dataset.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [4]:
# Every column except the last goes to X; the last column is the target y.
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

# Encoding independent categorical features

In [5]:
# Remove the target column so the encoding steps below only see features.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once the column is already gone is
# a no-op rather than a KeyError.
dataset = dataset.drop(columns='NObeyesdad', errors='ignore')

In [6]:
# Keep only the object-typed (categorical text) columns and preview them.
my_object_df = dataset.select_dtypes(include=['object'])
my_object_df.head(5)

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS
0,Female,yes,no,Sometimes,no,no,no,Public_Transportation
1,Female,yes,no,Sometimes,yes,yes,Sometimes,Public_Transportation
2,Male,yes,no,Sometimes,no,no,Frequently,Public_Transportation
3,Male,no,no,Sometimes,no,no,Frequently,Walking
4,Male,no,no,Sometimes,no,no,Sometimes,Public_Transportation


In [7]:
# Keep every column that is not object-typed (the numeric ones) and preview them.
my_numeric_df = dataset.select_dtypes(exclude=['object'])
my_numeric_df.head(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0


In [8]:
# One-hot encode the categorical columns; drop_first removes the first level of
# each feature so the remaining indicator columns are not redundant.
dummy_df = pd.get_dummies(data=my_object_df, drop_first=True)

In [9]:
# Put the numeric columns first and the dummy columns after them, side by side.
final_df = pd.concat(objs=[my_numeric_df, dummy_df], axis='columns')
final_df.head(5)

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE,Gender_Male,family_history_with_overweight_yes,...,CAEC_no,SMOKE_yes,SCC_yes,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,21.0,1.62,64.0,2.0,3.0,2.0,0.0,1.0,0,1,...,0,0,0,0,0,1,0,0,1,0
1,21.0,1.52,56.0,3.0,3.0,3.0,3.0,0.0,0,1,...,0,1,1,0,1,0,0,0,1,0
2,23.0,1.8,77.0,2.0,3.0,2.0,2.0,1.0,1,1,...,0,0,0,1,0,0,0,0,1,0
3,27.0,1.8,87.0,3.0,3.0,2.0,2.0,0.0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,22.0,1.78,89.8,2.0,1.0,2.0,0.0,0.0,1,0,...,0,0,0,0,1,0,0,0,1,0


In [10]:
# Convert the assembled feature table into a single float matrix.
# An explicit dtype keeps the array numeric even when the dummy columns are
# boolean (the get_dummies default in pandas >= 2.0), where `.values` would
# fall back to a generic object array.
X = final_df.to_numpy(dtype=float)

In [11]:
# Show the (truncated) feature matrix that was built from final_df.
print(X)

[[ 21.         1.62      64.       ...   0.         1.         0.      ]
 [ 21.         1.52      56.       ...   0.         1.         0.      ]
 [ 23.         1.8       77.       ...   0.         1.         0.      ]
 ...
 [ 22.524036   1.752206 133.689352 ...   0.         1.         0.      ]
 [ 24.361936   1.73945  133.346641 ...   0.         1.         0.      ]
 [ 23.664709   1.738836 133.472641 ...   0.         1.         0.      ]]


# Encoding dependent variable vector

In [12]:
from sklearn.preprocessing import LabelEncoder

# Learn the class -> integer mapping from the target, then apply it to the target.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Why LabelEncoder: the output variable has many classes

In [13]:
# Show the integer-encoded target labels produced by the LabelEncoder.
print(y)

[1 1 1 ... 4 4 4]


## Splitting the dataset into the Training set and Test set

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Feature Scaling

In [15]:
from sklearn.preprocessing import StandardScaler

# final_df was assembled as [numeric columns | dummy columns], so the numeric
# block is the leading my_numeric_df.shape[1] columns. Deriving the count
# (instead of hard-coding 8) keeps the slice right if the numeric feature set
# changes.
n_numeric = my_numeric_df.shape[1]

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows; the dummy columns are left untouched.
sc = StandardScaler()
X_train[:, :n_numeric] = sc.fit_transform(X_train[:, :n_numeric])
X_test[:, :n_numeric] = sc.transform(X_test[:, :n_numeric])

In [16]:
# Inspect the first training row: the first 8 columns were standardized in the
# previous cell, the remaining columns were not modified.
print(X_train[0])

[-0.47461602  0.46610942  2.32935052  1.10125089  0.41183552  0.53454487
  0.47172076  0.08574274  0.          1.          1.          0.
  1.          0.          0.          0.          0.          1.
  0.          0.          0.          1.          0.        ]


# Training the KNN model on the training set

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours, Minkowski metric with p = 2.
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
classifier.fit(X_train, y_train)

KNeighborsClassifier()

## Making the Confusion Matrix

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the held-out rows, then compare the predictions with the true labels.
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test rows classified correctly (shown as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[73  4  0  0  0  1  0]
 [15 26  3  0  0 18  7]
 [ 1  1 78  1  1  0  4]
 [ 0  0  0 73  0  0  0]
 [ 0  0  0  0 88  0  0]
 [ 2  1  8  1  0 54  2]
 [ 2  1  4  3  0  2 54]]


0.8446969696969697