# KNN for Regression problem
## We have to predict the energy consumption of different countries in the coming year

In [20]:
# Base Libraries
import pandas as pd
import matplotlib.pyplot as plt  
import numpy as np
# Transformation
from sklearn.preprocessing import MinMaxScaler
# Models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import neighbors
# Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

In [21]:
# Location of the hydropower consumption CSV.
# NOTE: this is an absolute, machine-specific Windows path; adjust it when running elsewhere.
DATA_PATH = r"C:\Users\Ankit kumar singh\Knowledge\Technical knowledge\Machine Learning\DATASETS\Hydropower_Consumption.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Country,2000,2001,2002,2003,2004,2005,2006,2007,2008,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,312,498,555,63,565,59,637,748,542,...,751,595,71,804,895,989,1025,105,105,107
1,Africa,75246,80864,85181,82873,87405,89066,92241,95341,97157,...,107427,110445,110952,117673,123727,115801,123816,130388,132735,0
2,Albania,4548,3519,3477,5117,5411,5319,4951,276,3759,...,7673,4036,4725,6959,4726,5866,7136,448,448,4018
3,Algeria,54,69,57,265,251,555,218,226,283,...,173,378,389,99,193,145,72,56,117,152
4,Angola,903,1007,1132,1229,1733,2197,2638,2472,3103,...,3666,3967,3734,4719,4991,5037,5757,7576,7576,8422


In [77]:
# (rows, columns) of the raw frame.
df.shape

(153, 21)

In [30]:
# Drop the categorical "Country" column so only the yearly columns remain.
data = df.drop(columns=["Country"])
data.head(n=5)

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,312,498,555,63,565,59,637,748,542,776,751,595,71,804,895,989,1025,105,105,107
1,75246,80864,85181,82873,87405,89066,92241,95341,97157,99761,107427,110445,110952,117673,123727,115801,123816,130388,132735,0
2,4548,3519,3477,5117,5411,5319,4951,276,3759,5201,7673,4036,4725,6959,4726,5866,7136,448,448,4018
3,54,69,57,265,251,555,218,226,283,342,173,378,389,99,193,145,72,56,117,152
4,903,1007,1132,1229,1733,2197,2638,2472,3103,3063,3666,3967,3734,4719,4991,5037,5757,7576,7576,8422


# Scale the data to values between 0 and 1

In [31]:
# Rescale every column to the [0, 1] range (MinMaxScaler's default feature_range).
# Column labels and the row index are taken from `data` itself rather than a
# hand-typed list of years, so the cell keeps working (and cannot mislabel
# columns) if the CSV gains or loses a year.
# NOTE(review): the scaler is fit on all rows, before the train/test split, and
# on the target column "2019" as well, so test-set statistics leak into
# training. Consider splitting first and fitting the scaler on training rows only.
scaler = MinMaxScaler()
data = pd.DataFrame(
    scaler.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

In [32]:
# Preview the first five rows of the scaled frame.
data.head(n=5)

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,0.000471,0.000814,0.000878,0.000101,0.000902,9e-05,0.000952,0.001171,0.00081,0.001164,0.001056,0.000813,8.2e-05,0.000884,0.000842,0.000885,0.000889,8.8e-05,0.000148,8.4e-05
1,0.113562,0.132122,0.134715,0.132393,0.139596,0.135752,0.137904,0.149435,0.145494,0.14958,0.151011,0.151688,0.128596,0.129366,0.116755,0.1039,0.107361,0.111912,0.187511,0.0
2,0.006864,0.00575,0.005499,0.008175,0.008642,0.008107,0.007402,0.000431,0.005628,0.007798,0.010786,0.005539,0.005476,0.007651,0.004457,0.005261,0.006188,0.000382,0.000633,0.003165
3,8.1e-05,0.000113,9e-05,0.000423,0.000401,0.000846,0.000326,0.000353,0.000422,0.000513,0.000243,0.000515,0.000451,0.000109,0.000179,0.000127,6.2e-05,4.5e-05,0.000165,0.00012
4,0.001363,0.001645,0.00179,0.001963,0.002768,0.003349,0.003944,0.003873,0.004645,0.004593,0.005153,0.005444,0.004328,0.005188,0.004707,0.004517,0.004992,0.0065,0.010702,0.006633


# Train-Test-Split

In [38]:
# Features: every yearly column except 2019; target: the 2019 column.
TARGET_COLUMN = "2019"
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Model Building and Training

In [40]:
# Build a k-nearest-neighbours regressor with scikit-learn's default settings
# and fit it on the training split.
# The class is reached through the imported `neighbors` module: the bare name
# `KNeighborsRegressor` is never imported in this notebook and would raise
# NameError on a fresh kernel.
knnr = neighbors.KNeighborsRegressor()
knnr.fit(X_train, y_train)

# Model Testing

In [42]:
# Baseline model's predictions for the held-out rows (values are on the scaled 0-1 range).
y_pred = knnr.predict(X=X_test)

In [44]:
# Root mean squared error of the baseline predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
rmse

0.023568650571157385

In [47]:
# Root mean squared logarithmic error of the baseline predictions.
msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)
rmsle = np.sqrt(msle)
rmsle

0.021961244849291153

In [49]:
# Coefficient of determination (R²) of the baseline predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

-3.9382850317181486

In [52]:
# Explained variance score of the baseline predictions.
evs = explained_variance_score(y_true=y_test, y_pred=y_pred)
evs

-3.8932158945027657

In [54]:
# Mean absolute error of the baseline predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

0.005745630973674791

# Hyperparameter Tuning and Model Evaluation

In [63]:
# Grid-search the number of neighbours (k = 1..20) using 5-fold cross-validation
# on the training split, then show the best k found.
params = {"n_neighbors": list(range(1, 21))}
model = GridSearchCV(estimator=knnr, param_grid=params, cv=5)
model.fit(X_train, y_train)
model.best_params_



{'n_neighbors': 2}

In [71]:
# Test-split predictions from the estimator selected by the grid search.
y_predi = model.predict(X=X_test)
y_predi

array([4.83982503e-04, 4.17429986e-05, 1.73272824e-04, 1.09024049e-02,
       9.78991458e-04, 2.49670388e-04, 2.49670388e-04, 3.42213828e-04,
       1.40981071e-04, 3.76080789e-03, 1.40981071e-04, 1.25622798e-04,
       8.08869048e-04, 9.17558365e-05, 3.74190540e-03, 9.84504684e-05,
       1.04672538e-02, 1.05152976e-02, 2.52190720e-03, 4.17429986e-05,
       5.86764792e-03, 2.91413386e-05, 5.47384604e-05, 8.08869048e-04,
       7.75789691e-05, 5.92986861e-03, 2.97068381e-02, 8.08869048e-04,
       3.13348151e-03, 9.78991458e-04, 9.84504684e-05, 1.04751298e-03,
       3.06373920e-02, 2.64026435e-01, 1.33892637e-04, 1.58150832e-03,
       6.85215260e-05, 1.38578879e-03, 4.79595550e-02, 1.35806514e-02,
       4.83982503e-04, 7.33731651e-03, 9.05744309e-06, 2.99919507e-03,
       4.14909654e-03, 1.96231474e-03])

In [65]:
# Root mean squared error of the grid-searched model's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_predi)
rmse = np.sqrt(mse)
rmse

0.03932122647779812

In [66]:
# Root mean squared logarithmic error of the grid-searched model's predictions.
msle = mean_squared_log_error(y_true=y_test, y_pred=y_predi)
rmsle = np.sqrt(msle)
rmsle

0.03497237175642198

In [67]:
# Coefficient of determination (R²) of the grid-searched model's predictions.
r2 = r2_score(y_true=y_test, y_pred=y_predi)
r2

-12.745509083842778

In [69]:
# Explained variance score of the grid-searched model's predictions.
evs = explained_variance_score(y_true=y_test, y_pred=y_predi)
evs

-12.518294383958729

In [70]:
# Mean absolute error of the grid-searched model's predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_predi)
mae

0.008028712745229842

In [73]:
# Test-split score of the grid-searched model (no custom scoring was passed to GridSearchCV).
model.score(X=X_test, y=y_test)

-12.745509083842778

In [79]:
# Summary of the test-split metrics.
# NOTE(review): these use `y_pred` (the baseline model's predictions), not
# `y_predi` from the grid-searched model — confirm that this is intended.
rmse_valid = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmsle_valid = np.sqrt(mean_squared_log_error(y_true=y_test, y_pred=y_pred))
mae_valid = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2_valid = r2_score(y_true=y_test, y_pred=y_pred)
evs_valid = explained_variance_score(
    y_true=y_test, y_pred=y_pred, multioutput='uniform_average'
)

# Print one "<label> <value>" line per metric, in the original order.
for label, value in (
    ('R2 Valid:', r2_valid),
    ('EVS Valid:', evs_valid),
    ('MAE Valid:', mae_valid),
    ('RMSE Valid:', rmse_valid),
    ('RMSLE Valid:', rmsle_valid),
):
    print(label, value)

R2 Valid: -3.9382850317181486
EVS Valid: -3.8932158945027657
MAE Valid: 0.005745630973674791
RMSE Valid: 0.023568650571157385
RMSLE Valid: 0.021961244849291153


In [80]:
# Side-by-side table of actual vs. predicted (scaled) 2019 values for the test
# split, using the baseline model's predictions; show the first ten rows.
data_prediction = pd.DataFrame(
    {
        'Test': y_test.to_numpy(),
        'Prediction': y_pred,
    }
)
data_prediction.head(n=10)

Unnamed: 0,Test,Prediction
0,0.000912,0.00031
1,6.1e-05,5e-05
2,0.000289,0.000155
3,0.012155,0.010069
4,0.001239,0.001395
5,0.000396,0.000216
6,3.9e-05,0.000224
7,0.000722,0.000224
8,1.1e-05,0.000105
9,0.005371,0.003693
