In [1]:
# importing necessary source
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score

# Data Collection and Preprocessing

In [2]:
wh_data = pd.read_csv('weight-height.csv')
wh_data.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [3]:
wh_data.shape

(8555, 3)

In [4]:
wh_data.isnull().sum()

Gender    0
Height    0
Weight    0
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder # for enconding Gender

In [6]:
enc = LabelEncoder()
wh_data['Gender'] = enc.fit_transform(wh_data['Gender'])

In [7]:
wh_data.head()

Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801


In [8]:
wh_data.tail()

Unnamed: 0,Gender,Height,Weight
8550,0,60.483946,110.565497
8551,0,63.423372,129.921671
8552,0,65.584057,155.942671
8553,0,67.429971,151.678405
8554,0,60.921791,131.253738


In [9]:
x = wh_data.drop('Weight', axis=1)
y = wh_data['Weight']

In [10]:
print(x.shape)
print(y.shape)

(8555, 2)
(8555,)


# Train and Test data

In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [12]:
print(x.shape, xtrain.shape, xtest.shape)

(8555, 2) (5988, 2) (2567, 2)


# Model Training --> LinearRegression

In [13]:
linearModel = LinearRegression()

In [14]:
linearModel.fit(xtrain, ytrain)

LinearRegression()

# Model Evaluation --> LinearRegression

In [15]:
linearAccuracy = linearModel.score(xtest, ytest)

In [16]:
 pred_y = linearModel.predict(xtest)

In [17]:
# Mean Square Error
from sklearn.metrics import mean_squared_error
mse_linear = mean_squared_error(ytest, pred_y)

In [18]:
wh_data['Linear_pred_Weight'] = linearModel.predict(x)

In [19]:
print('Accuracy of LinearReagression ', linearAccuracy)
print('Mean Square Error of LinearRegression ', mse_linear)
print(wh_data.head())
print(wh_data.tail())

Accuracy of LinearReagression  0.9059959607091161
Mean Square Error of LinearRegression  99.67250998070027
   Gender     Height      Weight  Linear_pred_Weight
0       1  73.847017  241.893563          215.574352
1       1  68.781904  162.310473          185.341329
2       1  74.110105  212.740856          217.144694
3       1  71.730978  220.042470          202.943984
4       1  69.881796  206.349801          191.906445
      Gender     Height      Weight  Linear_pred_Weight
8550       0  60.483946  110.565497          116.468265
8551       0  63.423372  129.921671          134.013329
8552       0  65.584057  155.942671          146.910187
8553       0  67.429971  151.678405          157.928220
8554       0  60.921791  131.253738          119.081708


# Model Training --> KNeighborsRegressor

In [20]:
from sklearn.neighbors import KNeighborsRegressor

In [21]:
knn = KNeighborsRegressor()

In [22]:
knn.fit(xtrain, ytrain)

KNeighborsRegressor()

# Model Evaluation --> KNeighborsRegressor

In [23]:
knnAccuracy = knn.score(xtest, ytest)

In [24]:
y_pred_knn = knn.predict(xtest)

In [25]:
# Mean Square Error
from sklearn.metrics import mean_squared_error
mse_knn = mean_squared_error(ytest, y_pred_knn)

In [26]:
wh_data['knn_pred_Weight'] = knn.predict(x)

In [27]:
print('Accuracy of KNeighborsRegressor ', knnAccuracy)
print('Mean Square Error of KNeighborsRegressor ', mse_knn)
print(wh_data.head())
print(wh_data.tail())

Accuracy of KNeighborsRegressor  0.8838502073708883
Mean Square Error of KNeighborsRegressor  123.15365863436983
   Gender     Height      Weight  Linear_pred_Weight  knn_pred_Weight
0       1  73.847017  241.893563          215.574352       222.249723
1       1  68.781904  162.310473          185.341329       181.243871
2       1  74.110105  212.740856          217.144694       220.883791
3       1  71.730978  220.042470          202.943984       199.347460
4       1  69.881796  206.349801          191.906445       193.719655
      Gender     Height      Weight  Linear_pred_Weight  knn_pred_Weight
8550       0  60.483946  110.565497          116.468265       114.899508
8551       0  63.423372  129.921671          134.013329       135.770862
8552       0  65.584057  155.942671          146.910187       146.748282
8553       0  67.429971  151.678405          157.928220       161.587442
8554       0  60.921791  131.253738          119.081708       118.242226


# Optimizing Algorithm with Hyper parameter Tuning

In [28]:
# Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV

In [29]:
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,1000))
p=[1,2]

grids = {
    'leaf_size': leaf_size,
    'n_neighbors': n_neighbors,
    'p': p
}

In [30]:
grids

{'leaf_size': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49],
 'n_neighbors': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  1

In [31]:
knn_2 = KNeighborsRegressor()

In [32]:
# Randomized Search CV
rscv = RandomizedSearchCV(knn_2,grids, n_iter=500, cv=3)

In [33]:
rscv.fit(xtrain,ytrain)

RandomizedSearchCV(cv=3, estimator=KNeighborsRegressor(), n_iter=500,
                   param_distributions={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30, ...],
                                        'p': [1, 2]})

In [34]:
results = pd.DataFrame(rscv.cv_results_)

In [35]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_p,param_n_neighbors,param_leaf_size,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004001,1.946680e-07,0.229388,0.001246,2,973,15,"{'p': 2, 'n_neighbors': 973, 'leaf_size': 15}",0.827503,0.826898,0.832061,0.828820,0.002305,493
1,0.004003,3.907932e-06,0.031005,0.000004,2,131,21,"{'p': 2, 'n_neighbors': 131, 'leaf_size': 21}",0.888355,0.886207,0.895244,0.889936,0.003855,52
2,0.004000,1.266589e-06,0.226719,0.000471,1,919,38,"{'p': 1, 'n_neighbors': 919, 'leaf_size': 38}",0.847523,0.847356,0.853622,0.849501,0.002915,381
3,0.003667,4.709775e-04,0.187709,0.002358,1,762,44,"{'p': 1, 'n_neighbors': 762, 'leaf_size': 44}",0.855982,0.856320,0.863062,0.858455,0.003261,324
4,0.004001,1.123916e-07,0.053012,0.000817,2,230,39,"{'p': 2, 'n_neighbors': 230, 'leaf_size': 39}",0.881664,0.880427,0.890111,0.884068,0.004303,105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.005001,8.991328e-07,0.174639,0.001762,1,687,6,"{'p': 1, 'n_neighbors': 687, 'leaf_size': 6}",0.859878,0.860319,0.867152,0.862450,0.003330,296
496,0.004001,4.899036e-07,0.022332,0.000479,1,91,22,"{'p': 1, 'n_neighbors': 91, 'leaf_size': 22}",0.891262,0.888701,0.897223,0.892395,0.003570,25
497,0.004001,5.947204e-07,0.166766,0.001795,1,678,24,"{'p': 1, 'n_neighbors': 678, 'leaf_size': 24}",0.860285,0.860800,0.867788,0.862958,0.003422,289
498,0.003667,4.715955e-04,0.042010,0.000817,1,176,47,"{'p': 1, 'n_neighbors': 176, 'leaf_size': 47}",0.886978,0.884971,0.894141,0.888696,0.003936,63


In [36]:
rscv.best_score_

0.8938511139610527

In [37]:
rscv.best_params_

{'p': 1, 'n_neighbors': 54, 'leaf_size': 21}

In [38]:
knn_3 = rscv.best_estimator_

In [39]:
knn_3

KNeighborsRegressor(leaf_size=21, n_neighbors=54, p=1)

# Model Evaluation After Tuning --> KNeighborsRegressor

In [40]:
knn_3Accuracy = knn_3.score(xtest, ytest)

In [41]:
y_pred_knn_3 = knn_3.predict(xtest)

In [42]:
# Mean Square Error
mse_knn_3 = mean_squared_error(ytest, y_pred_knn_3)

In [43]:
wh_data['Tuning_Linear_pred_Weight'] = knn_3.predict(x)

In [44]:
print('Accuracy of KNeighborsRegressor after tuning ', knn_3Accuracy)
print('Mean Square Error of KNeighborsRegressor after tuning ', mse_knn_3)
print(wh_data.head())
print(wh_data.tail())

Accuracy of KNeighborsRegressor after tuning  0.9036724361032478
Mean Square Error of KNeighborsRegressor after tuning  102.13614379064946
   Gender     Height      Weight  Linear_pred_Weight  knn_pred_Weight  \
0       1  73.847017  241.893563          215.574352       222.249723   
1       1  68.781904  162.310473          185.341329       181.243871   
2       1  74.110105  212.740856          217.144694       220.883791   
3       1  71.730978  220.042470          202.943984       199.347460   
4       1  69.881796  206.349801          191.906445       193.719655   

   Tuning_Linear_pred_Weight  
0                 215.765988  
1                 184.242783  
2                 216.107978  
3                 201.781271  
4                 193.116929  
      Gender     Height      Weight  Linear_pred_Weight  knn_pred_Weight  \
8550       0  60.483946  110.565497          116.468265       114.899508   
8551       0  63.423372  129.921671          134.013329       135.770862   
8552    

# Comparison among the above

In [72]:
print('Accuracy and Mean Square Error of LinearRegression ',linearAccuracy, mse_linear)
print('Accuracy and Mean Square Error of KNeighborsRegressor ',knnAccuracy, mse_knn)
print('Accuracy and Mean Square Error of KNeighborsRegressor after tuning ',knn_3Accuracy, mse_knn_3)

Accuracy and Mean Square Error of LinearRegression  0.9059959607091161 99.67250998070027
Accuracy and Mean Square Error of KNeighborsRegressor  0.8838502073708883 123.15365863436983
Accuracy and Mean Square Error of KNeighborsRegressor after tuning  0.9036724361032478 102.13614379064946


# KNN Classifier

In [46]:
wh = pd.read_csv('weight-height.csv')
wh.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [47]:
wh['Gender'] = wh.Gender.replace({'Male':0,'Female':1})

In [48]:
x = wh.drop(['Gender'], axis=1)
y = wh.Gender

In [49]:
x.head()

Unnamed: 0,Height,Weight
0,73.847017,241.893563
1,68.781904,162.310473
2,74.110105,212.740856
3,71.730978,220.04247
4,69.881796,206.349801


In [50]:
print(x.shape)
print(y.shape)

(8555, 2)
(8555,)


In [51]:
#split train and test data
from sklearn.model_selection import train_test_split

In [52]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [53]:
from sklearn.neighbors import KNeighborsClassifier
knn_cls = KNeighborsClassifier()

In [54]:
knn_cls.fit(xtrain, ytrain)

KNeighborsClassifier()

In [55]:
knnClsAccuracy = knn_cls.score(xtest, ytest)
pred_y_knn_cls = knn_cls.predict(xtest)

In [56]:
# Mean Square Error
from sklearn.metrics import mean_squared_error
mse_knn_cls = mean_squared_error(ytest, pred_y_knn_cls)

In [57]:
wh_data['knn_pred_Weight'] = knn_cls.predict(x)

In [58]:
print('Accuracy of KNeighborsClassifier ', knnClsAccuracy)
print('Mean Square Error of KNeighborsClassifier ', mse_knn_cls)
print(wh.head())
print(wh.tail())

Accuracy of KNeighborsClassifier  0.9146864043630697
Mean Square Error of KNeighborsClassifier  0.08531359563693026
   Gender     Height      Weight
0       0  73.847017  241.893563
1       0  68.781904  162.310473
2       0  74.110105  212.740856
3       0  71.730978  220.042470
4       0  69.881796  206.349801
      Gender     Height      Weight
8550       1  60.483946  110.565497
8551       1  63.423372  129.921671
8552       1  65.584057  155.942671
8553       1  67.429971  151.678405
8554       1  60.921791  131.253738


# Optimizing Algorithm with Hyper parameter Tuning

In [59]:
# Randomized Search CV
from sklearn.model_selection import RandomizedSearchCV

In [62]:
rscv = RandomizedSearchCV(knn_cls,grids, n_iter=300, cv=3)

In [64]:
rscv.fit(xtrain, ytrain)

RandomizedSearchCV(cv=3, estimator=KNeighborsClassifier(), n_iter=300,
                   param_distributions={'leaf_size': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20, 21,
                                                      22, 23, 24, 25, 26, 27,
                                                      28, 29, 30, ...],
                                        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8,
                                                        9, 10, 11, 12, 13, 14,
                                                        15, 16, 17, 18, 19, 20,
                                                        21, 22, 23, 24, 25, 26,
                                                        27, 28, 29, 30, ...],
                                        'p': [1, 2]})

In [65]:
results = pd.DataFrame(rscv.cv_results_)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_p,param_n_neighbors,param_leaf_size,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004003,3.489574e-06,0.166035,8.202353e-04,1,503,14,"{'p': 1, 'n_neighbors': 503, 'leaf_size': 14}",0.896293,0.900301,0.918337,0.904977,0.009588,129
1,0.004000,1.215701e-06,0.101691,1.248722e-03,1,254,7,"{'p': 1, 'n_neighbors': 254, 'leaf_size': 7}",0.896794,0.901303,0.918838,0.905645,0.009509,78
2,0.004000,7.867412e-07,0.191711,3.858751e-03,1,600,9,"{'p': 1, 'n_neighbors': 600, 'leaf_size': 9}",0.895792,0.901804,0.917836,0.905144,0.009304,110
3,0.003334,4.719323e-04,0.105024,2.829571e-03,2,273,35,"{'p': 2, 'n_neighbors': 273, 'leaf_size': 35}",0.896794,0.900301,0.920341,0.905812,0.010373,67
4,0.003000,6.257699e-07,0.128029,8.163401e-04,1,360,35,"{'p': 1, 'n_neighbors': 360, 'leaf_size': 35}",0.895792,0.897796,0.919339,0.904309,0.010659,198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.004001,1.072147e-06,0.158038,8.166327e-04,2,467,14,"{'p': 2, 'n_neighbors': 467, 'leaf_size': 14}",0.895291,0.898297,0.919339,0.904309,0.010698,198
296,0.003332,4.723274e-04,0.229603,1.192675e-03,1,745,45,"{'p': 1, 'n_neighbors': 745, 'leaf_size': 45}",0.896794,0.899800,0.916834,0.904476,0.008824,192
297,0.003999,3.204639e-06,0.075013,8.298079e-04,1,147,48,"{'p': 1, 'n_neighbors': 147, 'leaf_size': 48}",0.898798,0.903307,0.920341,0.907482,0.009277,48
298,0.004333,4.706398e-04,0.121657,1.735062e-03,2,325,10,"{'p': 2, 'n_neighbors': 325, 'leaf_size': 10}",0.896794,0.898297,0.918838,0.904643,0.010056,171


In [66]:
knn_cls_2 = rscv.best_estimator_

In [67]:
knn_cls_Accuracy = knn_cls_2.score(xtest, ytest)

In [68]:
y_pred_knn_cls_2 = knn_cls_2.predict(xtest)

In [69]:
# Mean Square Error
mse_knn_cls_2 = mean_squared_error(ytest, y_pred_knn_cls_2)

In [70]:
print('Accuracy of KNeighborsClassifier after tuning ', knn_cls_Accuracy)
print('Mean Square Error of KNeighborsClassifier after tuning ', mse_knn_cls_2)

Accuracy of KNeighborsClassifier after tuning  0.9244253992987924
Mean Square Error of KNeighborsClassifier after tuning  0.07557460070120764


# Comparison of KNeighborsClassifer before and after Tuning

In [73]:
print('Accuracy and Mean Square Error of KNeighborsClassifier ',knnClsAccuracy, mse_knn_cls)
print('Accuracy and Mean Square Error of KNeighborsClassifier after tuning ',knn_cls_Accuracy, mse_knn_cls_2)

Accuracy and Mean Square Error of KNeighborsClassifier  0.9146864043630697 0.08531359563693026
Accuracy and Mean Square Error of KNeighborsClassifier after tuning  0.9244253992987924 0.07557460070120764
