Import libraries

In [1]:
import numpy as np
import pandas as pd

Load Data

In [2]:
data = pd.read_csv("numbers.csv")
data.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


Shape the dataset

In [3]:
X = data.iloc[:, :4].values
Y = data.iloc[:, 4].values.reshape(-1,1)

In [4]:
print(X)
print(Y)

print(np.shape(X))
print(np.shape(Y))

[[  14.96   41.76 1024.07   73.17]
 [  25.18   62.96 1020.04   59.08]
 [   5.11   39.4  1012.16   92.14]
 ...
 [  31.32   74.33 1012.92   36.48]
 [  24.48   69.45 1013.86   62.39]
 [  21.6    62.52 1017.23   67.87]]
[[463.26]
 [444.37]
 [488.56]
 ...
 [429.57]
 [435.74]
 [453.28]]
(9568, 4)
(9568, 1)


Split data into testing, training and valuation sets

In [5]:
from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, random_state=2021)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)
Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain, Ytrain, test_size=0.2, random_state=0)

Run Linear Regression model

In [6]:
from sklearn.linear_model import LinearRegression
# Create and fit the linear regression model
regr = LinearRegression()
regr.fit(Xtrain, Ytrain)

# Make predictions on the validation set
pred = regr.predict(Xval)

Config and Run GridSearchCV

In [7]:
# Make a dictionary of hyperparameter values to search
search_space = {
    "fit_intercept" : [True, False],
    "copy_X" : [True, False],
    "n_jobs" : [-1, 1],
    "positive" : [True, False]
}

In [8]:
from sklearn.model_selection import GridSearchCV

GS = GridSearchCV(estimator = LinearRegression(),
                  param_grid = search_space,
                  scoring = ["r2", "neg_mean_squared_error"], 
                  refit = "r2",
                  cv = 5,
                  verbose = 4)

In [9]:
GS.fit(Xtest, Ytest)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=True; neg_mean_squared_error: (test=-195.609) r2: (test=0.376) total time=   0.6s
[CV 2/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=True; neg_mean_squared_error: (test=-194.659) r2: (test=0.372) total time=   0.1s
[CV 3/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=True; neg_mean_squared_error: (test=-178.244) r2: (test=0.367) total time=   0.1s
[CV 4/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=True; neg_mean_squared_error: (test=-174.796) r2: (test=0.408) total time=   0.1s
[CV 5/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=True; neg_mean_squared_error: (test=-154.977) r2: (test=0.405) total time=   0.2s
[CV 1/5] END copy_X=True, fit_intercept=True, n_jobs=-1, positive=False; neg_mean_squared_error: (test=-18.862) r2: (test=0.940) total time=   0.0s
[CV 2/5] END copy_X=True, fit_intercept=True, n_job

In [10]:
print("best model: ", GS.best_estimator_) # to get the complete details of the best model
print("best parameter values: ", GS.best_params_) # to get only the best hyperparameter values that we searched for
print("best r^2 value: ", GS.best_score_) # score according to the metric we passed in refit

best model:  LinearRegression(n_jobs=-1)
best parameter values:  {'copy_X': True, 'fit_intercept': True, 'n_jobs': -1, 'positive': False}
best r^2 value:  0.9317961881853025


In [11]:
df = pd.DataFrame(GS.cv_results_)
df = df.sort_values("rank_test_r2")
df.to_csv("cv_results.csv", index = False)