In [1]:
import os
import subprocess

import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [2]:
PATH = "../datasets"
input_file = "world_happiness2.csv"

dataset_path = os.path.join(PATH, input_file)
data = pd.read_csv(dataset_path, header=0)

In [3]:
data.shape

(156, 9)

In [4]:
data.columns.values

array(['Rank', 'Country', 'Score', 'GDP ', 'Social support',
       'Healthy life ', 'Freedom ', 'Generosity', 'Politics'],
      dtype=object)

In [5]:
data.head()

Unnamed: 0,Rank,Country,Score,GDP,Social support,Healthy life,Freedom,Generosity,Politics
0,1,Finland,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.38,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298


In [6]:
data2 = data[data.columns.drop(['Country'])]
data2
X = data2[['Rank', 'GDP ', 'Social support', 'Healthy life ','Generosity', 'Freedom ']]
y = data2[['Score','Politics']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [7]:
regr = linear_model.Ridge(alpha=.25)
regr.fit(X_train, y_train)

Ridge(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [8]:
prediction = regr.predict(X_test)
pr = prediction

In [9]:
mean_squared_error(y_test, prediction)

0.01479379113016626

In [11]:
mean_absolute_error(y_test, prediction)

0.09142261088018219

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [13]:
regr = linear_model.Ridge(alpha=.5)
regr.fit(X_train_scaled, y_train)

prediction = regr.predict(X_test_scaled)
pr = prediction

In [14]:
mean_squared_error(y_test, prediction)

0.018116250464163087

In [15]:
mean_absolute_error(y_test, prediction)

0.10025008611951451

In [16]:
X_train_copy = X_train
y_train_copy = y_train

neigh = KNeighborsRegressor(n_neighbors=5)

neigh.fit(X_train_copy, y_train_copy)


KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [17]:
prediction = neigh.predict(X_test)
pr = prediction

In [18]:
mean_squared_error(y_test, prediction)

0.003440178846153847

In [21]:
from math import sqrt
mse_val = [] 
mae_val = []
sqrt_k = int(sqrt(np.size(data,0)))
for K in range(1,sqrt_k,2):
    model = KNeighborsRegressor(n_neighbors = K)
    model.fit(X_train, y_train) 
    pred=model.predict(X_test) 
    error = mean_squared_error(y_test,pred) 
    mse_val.append(error) 
    print('MSE value for k = ' , K , 'is:', error)
    error = mean_absolute_error(y_test, prediction)
    mae_val.append(error) 
    print('MAE value for k = ' , K , 'is:', error)

MSE value for k =  1 is: 0.005702961538461539
MAE value for k =  1 is: 0.03699038461538468
MSE value for k =  3 is: 0.0035803995726495716
MAE value for k =  3 is: 0.03699038461538468
MSE value for k =  5 is: 0.003440178846153847
MAE value for k =  5 is: 0.03699038461538468
MSE value for k =  7 is: 0.0037209446624803747
MAE value for k =  7 is: 0.03699038461538468
MSE value for k =  9 is: 0.004943627611585944
MAE value for k =  9 is: 0.03699038461538468
MSE value for k =  11 is: 0.006023307851239676
MAE value for k =  11 is: 0.03699038461538468


In [179]:
mse_array = np.asarray(mse_val)
min_val = np.ndarray.min(mse_array)
chosenK = np.where(mse_array==min_val)[0][0]*2+1
chosenK

3

In [22]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer()

x_train_scaled = scaler.fit_transform(X_train)
x_test_scaled = scaler.fit_transform(X_test)


In [23]:
from math import sqrt
mse_val = [] 
mae_val=[]
sqrt_k = int(sqrt(np.size(data,0)))
for K in range(1,sqrt_k,2):
    model = KNeighborsRegressor(n_neighbors = K)
    model.fit(x_train_scaled, y_train) 
    pred=model.predict(x_test_scaled) 
    error = mean_squared_error(y_test,pred) 
    mse_val.append(error) 
    print('MSE value for k = ' , K , 'is:', error)
    error = mean_absolute_error(y_test, prediction)
    mae_val.append(error) 
    print('MAE value for k = ' , K , 'is:', error)

MSE value for k =  1 is: 0.0819273653846154
MAE value for k =  1 is: 0.03699038461538468
MSE value for k =  3 is: 0.0530720096153846
MAE value for k =  3 is: 0.03699038461538468
MSE value for k =  5 is: 0.042980226153846106
MAE value for k =  5 is: 0.03699038461538468
MSE value for k =  7 is: 0.044935920525902624
MAE value for k =  7 is: 0.03699038461538468
MSE value for k =  9 is: 0.0425509946581196
MAE value for k =  9 is: 0.03699038461538468
MSE value for k =  11 is: 0.04312298585505396
MAE value for k =  11 is: 0.03699038461538468


In [24]:
mse_array = np.asarray(mse_val)
min_val = np.ndarray.min(mse_array)
chosenK = np.where(mse_array==min_val)[0][0]*2+1
chosenK

9