KNN Regression (Weight Prediction Case)

Dataset Sample

In [773]:
import pandas as pd

# Raw sample data: the three lists are aligned by position (one entry per person).
heights_cm = [165, 172, 175, 177, 180, 183, 188, 166, 155, 145, 162, 170, 153, 158]
weights_kg = [64, 72, 74, 75, 82, 82, 88, 55, 51, 42, 53, 64, 51, 52]
# The first seven records are male, the remaining seven female.
genders = ["male"] * 7 + ["female"] * 7

# Dict insertion order determines the DataFrame column order.
census = {
    "height(cm)": heights_cm,
    "weight(kg)": weights_kg,
    "gender": genders,
}

censusDf = pd.DataFrame(census)

Dataset Splitting

In [774]:
import numpy as np
# Features: height plus the still-textual gender column. Mixing numbers and
# strings yields an object-dtype array, which is encoded in a later step.
feature_columns = ["height(cm)", "gender"]
X_train = np.array(censusDf[feature_columns])

# Target: the weight column as a 1-D array.
y_train = np.array(censusDf["weight(kg)"])

Dataset Preprocessing

In [775]:
# Transposed view of the feature matrix: row 0 holds every height,
# row 1 every gender label, so a whole feature can be addressed as one row.
X_train_Tpose = X_train.T
print(f"Normal X Train:\n{X_train}\n\nTranspose X Train: \n{X_train_Tpose}")

Normal X Train:
[[165 'male']
 [172 'male']
 [175 'male']
 [177 'male']
 [180 'male']
 [183 'male']
 [188 'male']
 [166 'female']
 [155 'female']
 [145 'female']
 [162 'female']
 [170 'female']
 [153 'female']
 [158 'female']]

Transpose X Train: 
[[165 172 175 177 180 183 188 166 155 145 162 170 153 158]
 ['male' 'male' 'male' 'male' 'male' 'male' 'male' 'female' 'female'
  'female' 'female' 'female' 'female' 'female']]


In [776]:
from sklearn.preprocessing import LabelBinarizer
# Binarize the gender row (the output below shows male -> 1, female -> 0).
# fit_transform returns a column vector, so flatten it back to 1-D.
lb = LabelBinarizer()
gender = lb.fit_transform(X_train_Tpose[1]).flatten()
gender

array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0])

In [777]:
# Replace the textual gender row with its 0/1 encoding, then transpose back
# so that each row is one sample again: [height, gender_flag].
X_train_Tpose[1] = gender
X_train = X_train_Tpose.T
X_train

array([[165, 1],
       [172, 1],
       [175, 1],
       [177, 1],
       [180, 1],
       [183, 1],
       [188, 1],
       [166, 0],
       [155, 0],
       [145, 0],
       [162, 0],
       [170, 0],
       [153, 0],
       [158, 0]], dtype=object)

Train Test Split

KNN Regression Model Training

In [778]:
from sklearn.neighbors import KNeighborsRegressor
# Number of neighbours whose targets are averaged for each prediction.
K = 2

# Build the regressor, then fit it on the encoded training features.
# The fit call stays as the final expression so the cell still displays the estimator.
knModel = KNeighborsRegressor(n_neighbors=K)
knModel.fit(X=X_train, y=y_train)

Model Prediction

In [779]:
# Single query sample in the same [height, gender_flag] layout used for training.
xNew = [[175, 1]]
yNew = knModel.predict(X=xNew)
print(f"New Data: {xNew}\n\nWeight Prediction Result: {yNew}")

New Data: [[175, 1]]

Weight Prediction Result: [74.5]


Model Evaluation & Metric Scores

In [780]:
# Hold-out samples, assembled column-wise: first column is the height value,
# second column the 0/1 flag (presumably the same gender encoding as the training data).
heights_test = [172, 188, 177, 181, 168, 166, 155]
flags_test = [0, 1, 1, 0, 1, 0, 0]
X_test = np.column_stack((heights_test, flags_test))

# Reference weights, one per row of X_test.
y_test = np.array([65, 84.87, 72, 85, 70.67, 74.4, 62])

print(f"X Test:\n{X_test}")
print(f"Y Test:\n{y_test}")


X Test:
[[172   0]
 [188   1]
 [177   1]
 [181   0]
 [168   1]
 [166   0]
 [155   0]]
Y Test:
[65.   84.87 72.   85.   70.67 74.4  62.  ]


In [781]:
# Predict a weight for every hold-out sample.
yPred = knModel.predict(X_test)
# Plain string: the label has no placeholders, so no f-string is needed.
print("Y Pred Data")
# Bare final expression so the notebook shows the array's rich repr.
yPred

Y Pred Data


array([68. , 85. , 74.5, 82. , 59.5, 59.5, 51. ])

In [782]:
from sklearn.metrics import r2_score
# Recompute the predictions here so this cell does not rely on the previous one,
# then compare them with the reference weights using R^2.
yPred = knModel.predict(X=X_test)
score = r2_score(y_true=y_test, y_pred=yPred)
print(f"R2 Prediction Accuracy: {score}")

R2 Prediction Accuracy: -0.03143227872910903


MAE (Mean Absolute Error)

In [783]:
from sklearn.metrics import mean_absolute_error as mae
# Mean absolute difference between reference and predicted weights.
maeScore = mae(y_true=y_test, y_pred=yPred)
print(f"MAE Prediction Accuracy: {maeScore}")

MAE Prediction Accuracy: 6.528571428571429


MSE (Mean Squared Error)

In [784]:
from sklearn.metrics import mean_squared_error as mse
# Mean squared difference between reference and predicted weights.
mseScore = mse(y_true=y_test, y_pred=yPred)
print(f"MSE Prediction Accuracy: {mseScore}")

MSE Prediction Accuracy: 70.29225714285717


Scaling Problem on Features

In [785]:
from scipy.spatial.distance import euclidean

# Two query points and one reference point; the first feature is on a much
# larger numeric scale than the 0/1 second feature.
xNew = np.array([[1700, 0], [1600, 1]])
xTrain = np.array([[1640, 0]])

# With integer inputs euclidean() returns a NumPy scalar, which prints as
# np.float64(...) inside a list; converting to float keeps the output plain.
distance1 = [float(euclidean(xNew[0], d)) for d in xTrain]
distance2 = [float(euclidean(xNew[1], d)) for d in xTrain]

print(distance1)
print(distance2)

[np.float64(60.0)]
[np.float64(40.01249804748511)]


In [786]:
# Same comparison with the first feature on a much smaller numeric scale,
# so the 0/1 second feature now dominates the distance.
xNew = np.array([[1.70, 0], [1.6, 1]])
xTrain = np.array([[1.64, 0]])

# One printed list per query point: its distance to every reference row.
for point in xNew:
    print([euclidean(point, reference) for reference in xTrain])

[0.06000000000000005]
[1.0007996802557444]


Standard Scaling to Handle the Problem

Formula:

$$z = \frac{x - \operatorname{mean}(x)}{s}$$

where $s$ is the standard deviation of $x$.


In [787]:
from sklearn.preprocessing import StandardScaler
# One shared scaler instance; each later fit_transform call in this notebook re-fits it.
ss = StandardScaler()

In [788]:
# Large-scale variant of the example (first feature in the thousands).
xNew = np.array([[1700, 0], [1600, 1]])
xTrain = np.array([[1640, 0]])

# The scaler statistics come from xNew and are then reused to transform xTrain.
xNewScaled = ss.fit_transform(xNew)
xTrainScaled = ss.transform(xTrain)

# One distance list per scaled xNew row, measured against each scaled xTrain row.
distance1, distance2 = (
    [euclidean(row, reference) for reference in xTrainScaled]
    for row in xNewScaled
)

print(f"X New Scaled:\n{xNewScaled}")
print(f"X Train Scaled:\n{xTrainScaled}\n")
print(f"Distance:\n{distance1, distance2}")

X New Scaled:
[[ 1. -1.]
 [-1.  1.]]
X Train Scaled:
[[-0.2 -1. ]]

Distance:
([1.2], [2.154065922853802])


In [789]:
# Small-scale variant of the example (first feature around 1.6-1.7).
xNew = np.array([[1.7, 0], [1.6, 1]])
xTrain = np.array([[1.64, 0]])

# Re-fit the shared scaler on xNew, then apply the same statistics to xTrain.
xNewScaled = ss.fit_transform(xNew)
xTrainScaled = ss.transform(xTrain)

# Distances from each scaled xNew row to every scaled xTrain row.
distance1, distance2 = (
    [euclidean(row, reference) for reference in xTrainScaled]
    for row in xNewScaled
)

print(f"X New Scaled:\n{xNewScaled}")
print(f"X Train Scaled:\n{xTrainScaled}\n")
print(f"Distance:\n{distance1, distance2}")

X New Scaled:
[[ 1. -1.]
 [-1.  1.]]
X Train Scaled:
[[-0.2 -1. ]]

Distance:
([1.2000000000000026], [2.1540659228538006])


Training the Model with Standard Scaling (SS)

In [790]:
# Second data set, assembled column-wise: first column is the height value,
# second column a 0/1 flag (presumably the gender encoding used earlier).
heights_train2 = [158, 170, 183, 191, 155, 163, 180, 158, 170]
flags_train2 = [1, 1, 1, 1, 0, 0, 0, 0, 0]
X_train2 = np.column_stack((heights_train2, flags_train2))

# Training targets, one per row of X_train2.
y_train2 = np.array([64, 86, 84, 80, 49, 59, 67, 54, 67])

heights_test2 = [168, 180, 160, 169]
flags_test2 = [1, 1, 0, 0]
X_test2 = np.column_stack((heights_test2, flags_test2))

# Reference targets, one per row of X_test2.
y_test2 = np.array([65, 96, 52, 67])

In [791]:
# Learn the scaling statistics from the training features only, then apply
# that same transformation to both the training and the test features.
ss.fit(X_train2)
X_train_scaled = ss.transform(X_train2)
X_test_scaled = ss.transform(X_test2)

Training & Prediction

In [792]:
# Re-fit the existing regressor on the scaled training data (this replaces its
# earlier fit) and predict the scaled test samples; fit() returns the estimator.
yPred2 = knModel.fit(X_train_scaled, y_train2).predict(X_test_scaled)

In [794]:
# Score the scaled-feature predictions with the same three metrics as before.
maeScore2 = mae(y_true=y_test2, y_pred=yPred2)
mseScore2 = mse(y_true=y_test2, y_pred=yPred2)
r2Score2 = r2_score(y_true=y_test2, y_pred=yPred2)

print(f"MAE Scores: {maeScore2}")
print(f"MSE Scores: {mseScore2}")
print(f"R2 Scores: {r2Score2}")

MAE Scores: 7.375
MSE Scores: 64.3125
R2 Scores: 0.7512088974854932
