In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import warnings as wr
# NOTE(review): this silences every warning for the whole session, including
# library deprecation and data-conversion warnings; consider narrowing it with
# a `category=` filter once the notebook runs cleanly.
wr.filterwarnings('ignore')

# Import Dataset

In [2]:
# Load the height/weight records from a CSV located next to the notebook and
# preview the first rows to confirm the Gender, Height and Weight columns.
df = pd.read_csv('weight-height.csv')
df.head()

Unnamed: 0,Gender,Height,Weight
0,Male,73.847017,241.893563
1,Male,68.781904,162.310473
2,Male,74.110105,212.740856
3,Male,71.730978,220.04247
4,Male,69.881796,206.349801


In [3]:
from sklearn.preprocessing import LabelEncoder
# Encoder used in the next cell to turn the string Gender labels into integer codes.
le = LabelEncoder()

In [4]:
# Encode Gender as integer codes (the 'Male' rows shown by head() become 1).
# LabelEncoder operates on a 1-D array of labels, so pass the column as a
# Series (`df['Gender']`) rather than a one-column DataFrame (`df[['Gender']]`),
# which only works through an implicit reshape that emits a DataConversionWarning.
df['Gender'] = le.fit_transform(df['Gender'])

In [5]:
df.head()

Unnamed: 0,Gender,Height,Weight
0,1,73.847017,241.893563
1,1,68.781904,162.310473
2,1,74.110105,212.740856
3,1,71.730978,220.04247
4,1,69.881796,206.349801


In [6]:
df.tail()

Unnamed: 0,Gender,Height,Weight
8550,0,60.483946,110.565497
8551,0,63.423372,129.921671
8552,0,65.584057,155.942671
8553,0,67.429971,151.678405
8554,0,60.921791,131.253738


# Separate features (X) and target (Y)

In [7]:
# Feature matrix: the encoded Gender column plus Height.
x = df[['Gender','Height']]

In [8]:
x.head()

Unnamed: 0,Gender,Height
0,1,73.847017
1,1,68.781904
2,1,74.110105
3,1,71.730978
4,1,69.881796


In [9]:
# Target: Weight. The double brackets keep it as a one-column DataFrame,
# so the target has shape (n, 1) rather than being a 1-D Series.
y = df[['Weight']]
y.head()

Unnamed: 0,Weight
0,241.893563
1,162.310473
2,212.740856
3,220.04247
4,206.349801


In [10]:
df.corr()

Unnamed: 0,Gender,Height,Weight
Gender,1.0,0.682524,0.790208
Height,0.682524,1.0,0.922975
Weight,0.790208,0.922975,1.0


# Splitting Data: Train 70% & Test 30%

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; random_state=0 fixes the shuffle so the
# split (and every score computed from it) is reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.30, random_state=0)

In [12]:
df.shape

(8555, 3)

In [13]:
xtrain.shape

(5988, 2)

In [14]:
xtest.shape

(2567, 2)

In [15]:
ytrain.shape

(5988, 1)

In [16]:
ytest.shape

(2567, 1)

In [17]:
xtrain.head()

Unnamed: 0,Gender,Height
3197,1,69.689848
4394,1,66.299441
7711,0,63.261527
7019,0,62.035319
898,1,67.272171


In [18]:
xtest.head()

Unnamed: 0,Gender,Height
541,1,63.867921
3209,1,68.688931
7598,0,65.865244
5170,0,66.259558
2423,1,67.290776


In [19]:
ytrain.head()

Unnamed: 0,Weight
3197,186.377448
4394,159.235988
7711,126.426793
7019,119.418135
898,187.363366


In [20]:
ytest.head()

Unnamed: 0,Weight
541,174.473363
3209,178.5075
7598,155.2489
5170,138.83994
2423,154.718344


# Apply Linear Regression

In [21]:
from sklearn.linear_model import LinearRegression
# Linear regression model created with its default settings.
reg = LinearRegression()

In [22]:
# Fit the linear model on the training split only.
reg.fit(xtrain, ytrain)

# Evaluate the model (testing & training R² score, MSE on the test set)

In [23]:
from sklearn.metrics import mean_squared_error

In [24]:
# Predict weights for the held-out test features.
y_pred = reg.predict(xtest)

In [25]:
y_pred

array([[156.20235428],
       [184.93442898],
       [148.42945323],
       ...,
       [159.2442037 ],
       [166.96403181],
       [136.95529212]])

In [26]:
# Mean squared error of the linear model's predictions on the test split.
mse = mean_squared_error(y_true=ytest, y_pred=y_pred)
print('MSE:', mse)

MSE: 102.30154155088874


In [27]:
reg.score(xtest, ytest) # R^2 (coefficient of determination) on the test split

0.9025307579174771

In [28]:
reg.score(xtrain, ytrain) # R^2 (coefficient of determination) on the training split

0.8988118427483529

# KNN Regressor

In [29]:
from sklearn.neighbors import KNeighborsRegressor

In [30]:
# k-nearest-neighbours regressor using the 3 closest training points.
# NOTE(review): k=3 is hard-coded and the features are not scaled before the
# distance computation — confirm both choices (e.g. via cross-validation).
neigh = KNeighborsRegressor(n_neighbors=3)

In [31]:
# Fit the KNN regressor on the same training split as the linear model.
neigh.fit(xtrain,ytrain)

In [32]:
# KNN predictions for the held-out test features.
y_pred2 = neigh.predict(xtest)

In [34]:
y_pred2

array([[156.45870553],
       [189.52082843],
       [160.00456057],
       ...,
       [158.63676763],
       [179.18163467],
       [139.3463748 ]])

In [35]:
# Mean squared error of the KNN predictions on the test split.
mse2 = mean_squared_error(y_true=ytest, y_pred=y_pred2)
print('MSE:', mse2)

MSE: 139.06397442506378


In [36]:
neigh.score(xtest, ytest) # R^2 (coefficient of determination) on the test split

0.8675048295195845

In [37]:
neigh.score(xtrain, ytrain) # R^2 (coefficient of determination) on the training split

0.9319496298003415

# Compare Linear Regression (OLS) and KNN predictions on the test set

In [38]:
# Start the comparison table from the test features; the rows keep the
# row labels they had in the full dataset.
data= xtest
df1 = pd.DataFrame(data, columns=['Gender','Height'])
df1.head()

Unnamed: 0,Gender,Height
541,1,63.867921
3209,1,68.688931
7598,0,65.865244
5170,0,66.259558
2423,1,67.290776


In [39]:
# Actual test-set weights, still carrying the row labels from the full dataset.
data= ytest
df2 = pd.DataFrame(data, columns=['Weight'])
df2.head()

Unnamed: 0,Weight
541,174.473363
3209,178.5075
7598,155.2489
5170,138.83994
2423,154.718344


In [40]:
# Join the test features and the actual weights on their shared row index.
df_merged = df1.merge(df2,  left_index=True, right_index=True)
df_merged.head()

Unnamed: 0,Gender,Height,Weight
541,1,63.867921,174.473363
3209,1,68.688931,178.5075
7598,0,65.865244,155.2489
5170,0,66.259558,138.83994
2423,1,67.290776,154.718344


In [41]:
# Wrap the OLS predictions (a NumPy array) in a DataFrame. It gets a fresh
# 0..n-1 index, so it lines up with the test rows by position only.
data= y_pred
df3 = pd.DataFrame(data, columns=['OLS Predicted Weight'])
df3.head()

Unnamed: 0,OLS Predicted Weight
0,156.202354
1,184.934429
2,148.429453
3,150.77947
4,176.601756


In [42]:
# Attach the OLS predictions to the comparison table by position.
# The test rows still carry their original row labels while the predictions
# are indexed 0..n-1, so the labels are dropped first. `assign` overwrites the
# column when it already exists, which keeps this cell safe to re-run (an
# in-place reset followed by a self-merge adds suffixed duplicate columns on
# every re-run), and it raises if the two lengths ever disagree instead of
# silently truncating the table.
df_merged = df_merged.reset_index(drop=True).assign(
    **{'OLS Predicted Weight': df3['OLS Predicted Weight'].to_numpy()}
)
df_merged.head()

Unnamed: 0,Gender,Height,Weight,OLS Predicted Weight
0,1,63.867921,174.473363,156.202354
1,1,68.688931,178.5075,184.934429
2,0,65.865244,155.2489,148.429453
3,0,66.259558,138.83994,150.77947
4,1,67.290776,154.718344,176.601756


In [43]:
# Wrap the KNN predictions (a NumPy array) in a DataFrame. It gets a fresh
# 0..n-1 index, so it lines up with the test rows by position only.
data= y_pred2
df4 = pd.DataFrame(data, columns=['KNN Predicted Weight'])
df4.head()

Unnamed: 0,KNN Predicted Weight
0,156.458706
1,189.520828
2,160.004561
3,152.335456
4,187.556171


In [44]:
# Attach the KNN predictions to the comparison table by position.
# `assign` overwrites the column when it already exists, so re-running this
# cell does not append a second 'KNN Predicted Weight' column (which a plain
# concat of the table with the prediction frame would do), and it raises if
# the two lengths ever disagree instead of silently truncating the table.
df_merged = df_merged.reset_index(drop=True).assign(
    **{'KNN Predicted Weight': df4['KNN Predicted Weight'].to_numpy()}
)
df_merged.head()

Unnamed: 0,Gender,Height,Weight,OLS Predicted Weight,KNN Predicted Weight
0,1,63.867921,174.473363,156.202354,156.458706
1,1,68.688931,178.5075,184.934429,189.520828
2,0,65.865244,155.2489,148.429453,160.004561
3,0,66.259558,138.83994,150.77947,152.335456
4,1,67.290776,154.718344,176.601756,187.556171


Test R² scores: OLS = 0.9025307579174771, KNN = 0.8675048295195845.
On this dataset, linear regression (OLS) achieves a higher test R² (and a lower test MSE: 102.30 vs. 139.06) than the KNN regressor.