In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/mydrive so files stored there can be read
# from this Colab session.
from google.colab import drive
drive.mount('/content/mydrive')

Mounted at /content/mydrive


In [None]:
# Change the working directory to the project folder on the mounted drive so
# that relative paths (such as the CSV read with a bare file name) resolve.
%cd /content/mydrive/MyDrive/Sem_5/FODS_Project

/content/mydrive/MyDrive/Sem_5/FODS_Project


### Data

In [53]:
# Read the dataset from the current working directory, report its
# (rows, columns) dimensions, and preview the first five records.
data_df = pd.read_csv(filepath_or_buffer='FODS-A2.csv')
print(tuple(data_df.shape))
data_df.head(n=5)

(7894, 27)


Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,Appliances
0,21.2,33.29,19.823333,31.79,23.463333,38.23,20.5,31.73,19.2,39.363333,...,29.23,9.85,756.183333,41.833333,4.833333,40.0,-2.67,42.01718,42.01718,290
1,21.79,38.5,19.5,40.633333,22.5,37.9,21.0,37.9,20.033333,47.29,...,40.326667,6.9,754.0,75.0,4.0,40.0,2.8,24.62438,24.62438,50
2,22.39,41.39,20.2,43.79,24.5,39.333333,20.1,38.26,19.39,48.09,...,42.06,10.1,756.433333,68.0,5.833333,40.0,4.45,3.73126,3.73126,260
3,24.0,30.26,24.39,26.963333,23.39,33.4,22.79,31.2,21.033333,40.626667,...,35.5,19.1,760.0,31.0,4.0,40.0,1.5,1.058826,1.058826,50
4,20.05,38.245,17.6,41.0,21.1,37.2,19.89,36.4,18.2,43.56,...,38.863333,0.1,754.6,99.0,1.0,32.0,-0.1,39.248108,39.248108,30


In [54]:
# Split the frame positionally: every column except the last is a predictor,
# the last column is the regression target.
# NOTE(review): the preview shows a leading 'Unnamed: 0' column that looks like
# a saved row index; it is kept as a predictor here - confirm that is intended.
x, y = data_df.iloc[:, :-1], data_df.iloc[:, -1]
print(x.shape, y.shape, sep='\n')

(7894, 26)
(7894,)


In [57]:
# Standardise every predictor column; the fitted scaler is kept in `scaler_x`
# so the same transformation can be re-applied or inverted later.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down, so held-out rows contribute to the scaling
# statistics - consider fitting on the training rows only.
scaler_x = StandardScaler()
x_scaled = scaler_x.fit_transform(x)
x_scaled.shape

(7894, 26)

In [59]:
# StandardScaler works on 2-D input, so the target is first turned into a
# single-column array; it is then standardised with its own scaler (`scaler_y`).
y = np.array(y).reshape((-1, 1))
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)
y_scaled.shape

(7894, 1)

In [60]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
  x_scaled,
  y_scaled,
  test_size=0.2,
  random_state=30,
)

In [61]:
# Reference model: scikit-learn's LinearRegression fitted on the training
# split. `score` is evaluated on both splits and the two values are printed
# (training first, then test).
model = LinearRegression()
model.fit(x_train, y_train)
train_score, test_score = (
  model.score(features, target)
  for features, target in ((x_train, y_train), (x_test, y_test))
)

print(train_score, test_score, sep='\n')

0.1455438599174067
0.1253382224250098


In [196]:
# `math` and `mean_squared_error` are brought into scope here; later cells
# that compute RMSE rely on these two names as well.
import math
from sklearn.metrics import mean_squared_error

# Test-set RMSE of the scikit-learn model, measured on the standardised target.
pred = model.predict(x_test)
rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))
print(rmse)

0.9595423601536057


In [86]:
# Show the dimensions of the training feature matrix.
print(x_train.shape)

(6315, 26)


### Helper functions and classes

In [238]:
class CustomLinearRegression:
  """Linear regression trained with full-batch gradient descent.

  Each step moves the weights and the intercept against the gradient of the
  mean squared error, i.e. ``(1/m) * X.T @ (y_hat - y)`` for the weights and
  ``(1/m) * sum(y_hat - y)`` for the intercept.

  Parameters
  ----------
  lr : float, default 1e-1
      Step size applied to every gradient update.
  epochs : int, default 1000
      Number of gradient steps; each step uses all training rows.

  Attributes
  ----------
  W : ndarray of shape (n_features, 1)
      Learned weights; created by ``fit``.
  b : ndarray of shape (1,)
      Learned intercept; created by ``fit``.
  """

  def __init__(self, lr=1e-1, epochs=1000):
    self.lr = lr
    self.epochs = epochs

  def fit(self, X, y):
    """Estimate ``W`` and ``b`` from training data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,) or (n_samples, 1)

    Returns
    -------
    self
        The fitted estimator, so calls can be chained.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: with a 1-D target, `y_hat - y` would broadcast
    # (m, 1) against (m,) into an (m, m) matrix and corrupt the gradients.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    m = X.shape[0]  # sample count is constant, so compute it once

    self.W = np.zeros((X.shape[1], 1))
    self.b = np.zeros(1)
    for _ in range(self.epochs):
      y_hat = np.dot(X, self.W) + self.b
      dy = (y_hat - y)
      dW = (1/m) * np.dot(X.T, dy)
      db = (1/m) * np.sum(dy)
      self.W = self.W - self.lr*dW
      self.b = self.b - self.lr*db
    return self

  def predict(self, X):
    """Return predictions of shape (n_samples, 1) for the rows of ``X``."""
    return np.dot(X, self.W) + self.b

In [241]:
# Train the gradient-descent model on the same split and print its test-set
# RMSE, which can be compared with the scikit-learn value printed earlier.
model_custom = CustomLinearRegression()
model_custom.fit(x_train, y_train)
pred = model_custom.predict(x_test)
rmse = math.sqrt(mean_squared_error(y_true=y_test, y_pred=pred))
print(rmse)

0.959542138819973


In [153]:
# Baseline for feature selection: test RMSE when every column index is used.
# NOTE(review): `rmse_from_vars` is defined in a later cell of this notebook,
# so this cell fails on a fresh top-to-bottom run unless that definition has
# already been executed.
a = list(range(x.shape[1]))
vars = a  # same list object as `a`, not a copy
rmse_all = rmse_from_vars(vars, x_train, x_test, y_train, y_test)
print(rmse_all)

0.9595423601536057


In [246]:
def rmse_from_vars(vars, x_train , x_test , y_train , y_test):
  """Return the test RMSE of a CustomLinearRegression restricted to some columns.

  Parameters
  ----------
  vars : iterable of int
      Column indices to keep, in the order they should appear in the model.
  x_train, x_test : 2-D arrays indexed as ``array[:, i]``
      Feature matrices for fitting and for evaluation.
  y_train, y_test : arrays
      Targets matching ``x_train`` and ``x_test``.

  Returns
  -------
  float
      Square root of the mean squared error of the predictions on ``x_test``.
  """
  columns = list(vars)

  def pick_columns(matrix):
    # Stack the chosen columns as rows, then transpose back to
    # (samples, selected features).
    return np.array([matrix[:, col] for col in columns]).T

  regressor = CustomLinearRegression()
  regressor.fit(pick_columns(x_train), y_train)
  squared_error = mean_squared_error(y_test, regressor.predict(pick_columns(x_test)))
  return math.sqrt(squared_error)

### Forward Greedy

In [247]:
# Forward greedy selection: start from an empty set and, in each round, add the
# single remaining column that gives the lowest RMSE; stop as soon as the best
# candidate makes the RMSE worse than the current best.
# NOTE(review): candidates are scored on x_test / y_test, so the final RMSE is
# measured on the same rows that drove the selection - consider scoring on a
# separate validation split.
a = list(range(x.shape[1]))  # column indices not selected yet
rmse_best = 1e4              # large starting value so the first round is accepted
vars = []                    # selected column indices, in the order they were added
while a:
  # Score each candidate on a fresh list (`vars + [i]`) so the selection itself
  # is never mutated while scoring. Ties are broken by the smaller column index.
  rmse_iter, var = min(
    (rmse_from_vars(vars + [i], x_train, x_test, y_train, y_test), i)
    for i in a
  )
  if rmse_iter > rmse_best:
    break
  rmse_best = rmse_iter
  vars.append(var)
  a.remove(var)
  print(rmse_best)
  print(vars)

1.0138305252232327
[20]
1.0062867775634954
[20, 1]
0.9932267103417312
[20, 1, 15]
0.9863130256106569
[20, 1, 15, 3]
0.9832285273450012
[20, 1, 15, 3, 21]
0.9810870923786664
[20, 1, 15, 3, 21, 13]
0.9799570214478841
[20, 1, 15, 3, 21, 13, 4]
0.9680031114866456
[20, 1, 15, 3, 21, 13, 4, 16]
0.9642336106279341
[20, 1, 15, 3, 21, 13, 4, 16, 2]
0.9626596113220096
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10]
0.9601397639609455
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14]
0.958854796305508
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18]
0.9585554651705097
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7]
0.9579713281959752
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6]
0.9576799592076534
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6, 22]
0.9576134042549151
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6, 22, 23]
0.9575857972496086
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6, 22, 23, 8]
0.9575540467392756
[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6, 22, 23, 8, 17]


In [248]:
# Number of column indices currently held in the selection list.
print(len(vars))

18


In [249]:
# Show the selection in the order it was built, then sort the same list object
# in place and show it again (the sorted form is easier to compare).
print(vars)
vars[:] = sorted(vars)
print(vars)
# Presumably the selection from an earlier run; it differs from the list printed by this run:
# [20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 6, 7, 22, 8, 17, 12]

[20, 1, 15, 3, 21, 13, 4, 16, 2, 10, 14, 18, 7, 6, 22, 23, 8, 17]
[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23]


In [250]:
# All-features RMSE on the first line, best RMSE found by the search on the second.
print(rmse_all, rmse_best, sep='\n')

0.9595423601536057
0.9575540467392756


### Backward Greedy

In [251]:
# Backward greedy elimination: start from all columns and, in each round, drop
# the single column whose removal gives the lowest RMSE; stop as soon as the
# best removal makes the RMSE worse than the current best.
# Relies on `rmse_all` (the all-features RMSE) computed in an earlier cell.
# NOTE(review): candidates are scored on x_test / y_test, so the final RMSE is
# measured on the same rows that drove the selection - consider scoring on a
# separate validation split.
a = list(range(x.shape[1]))  # columns still in the model
rmse_best = rmse_all
vars = a.copy()              # reported selection, kept in step with `a`

# Stop at one remaining column: evaluating an empty subset would fail inside
# the model's fit, because there is no feature dimension to read.
while len(a) > 1:
  # Each trial gets a fresh subset with column `i` left out, so the remaining
  # columns keep their order from trial to trial. Ties are broken by the
  # smaller column index.
  rmse_iter, var = min(
    (rmse_from_vars([j for j in a if j != i], x_train, x_test, y_train, y_test), i)
    for i in a
  )
  if rmse_iter > rmse_best:
    break
  rmse_best = rmse_iter
  vars.remove(var)
  a.remove(var)
  print(rmse_best)
  print(vars)

0.9586871813646709
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9582510856110045
[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9579395134107956
[0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9577113124310452
[0, 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9576408757686011
[1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9576249436195521
[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 25]
0.9576249336624996
[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23, 25]
0.9575540467392756
[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23]


In [252]:
# Number of column indices currently held in the selection list.
print(len(vars))

18


In [253]:
# Show the surviving column indices, sort the same list object in place, and
# show it again.
print(vars)
vars[:] = sorted(vars)
print(vars)
# Presumably the selection from an earlier run; it differs from the list printed by this run:
# [1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 18, 21, 22]

[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23]
[1, 2, 3, 4, 6, 7, 8, 10, 13, 14, 15, 16, 17, 18, 20, 21, 22, 23]


In [254]:
# All-features RMSE on the first line, best RMSE found by the search on the second.
print(rmse_all, rmse_best, sep='\n')

0.9595423601536057
0.9575540467392756
