# Linear Regression

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
class Linear_Regression:
    
    
    def __init__(self,learning_rate=0.01,epochs=1000):
        # Learning rate(alpha) is a hyper-parameter to determine the value of update of weights and bias on gradient descent
        # Too small Learning rate could make convergence too slow
        # Too big learning rate cause the descent to jumps the minima
        self.learning_rate=learning_rate
        # Epochs is the number to time the descent will take place
        self.epochs=epochs
        # Initialising weights and bias to None
        # Weights and bias is tweaked by gradient to find the line of best fit.
        # Weight is defined as how much of what features combined to give the Dependent Variable
        self.weights=None
        self.bias=None
        
    def predicted_val(self,Xi,W,b):
        # Xi=(num_of_samples,num_of_features);W=(num_of_features,1);b=(num_of_samples,1)
        # Dimension of ypred --> (num_of_samples,1)
        ypred=np.dot(Xi,W) + b
        return ypred
        
    def fit(self,X,y):
        num_of_samples,num_of_features=X.shape
        # weight vector is of dimension (num_of_features,1)
        self.weights= np.random.randn(num_of_features,1)
        # Bias is (1,1)
        self.bias=np.random.randn(1,1)
        # Looping over number of epochs
        for i in range(self.epochs):
            # dW is the derivative of the cost function 
            # Cost function is the mean-squared error between predicted value of y and real y
            dW=(X.T@(self.predicted_val(X,self.weights,self.bias)-y))/(len(y))
            db=np.sum((self.predicted_val(X,self.weights,self.bias))-y)/(len(y))
            
            # Updating weights and bias
            self.weights=self.weights-self.learning_rate*dW
            self.bias=self.bias-self.learning_rate*db
            
    def predict(self,X):
        y_predicted=np.dot(X,self.weights) +self.bias
        return y_predicted
    
    # Checking accuracy by r_squared
    def r2(self,y_pred_val,y):
        sst = np.sum((y-y.mean())**2)
        ssr = np.sum((y_pred_val-y)**2)
        r2 = 1-(ssr/sst)
        return r2           
            

In [3]:
# Reading csv file
ds=pd.read_excel('pp.xlsx')

In [4]:
# Seeing content of first 5 rows
ds.head()

Unnamed: 0,AT,V,AP,RH,PE
0,14.96,41.76,1024.07,73.17,463.26
1,25.18,62.96,1020.04,59.08,444.37
2,5.11,39.4,1012.16,92.14,488.56
3,20.86,57.32,1010.24,76.64,446.48
4,10.82,37.5,1009.23,96.62,473.9


In [5]:
# Checking for null
ds.isnull().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [6]:
# Shuffle the rows
ds=ds.sample(frac=1)

In [7]:
# List column name for further reference
list_of_columns=list(ds)

In [8]:
ds.shape

(9568, 5)

In [9]:
# Splitting into train and validation set and target variable and independent variable
ds_train_X=ds.iloc[:9000,:-1].values
ds_train_y=ds.iloc[:9000,4].values
ds_val_X=ds.iloc[9000:,:-1].values
ds_val_y=ds.iloc[9000:,4].values

In [10]:
# Scaling
# It could be done manually, I used sci-kit learn library
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
ds_train_X = sc.fit_transform(ds_train_X)
ds_val_X=sc.fit_transform(ds_val_X)

In [11]:
# To rank matrix to proper matrix
ds_train_y=ds_train_y.reshape(9000,1)
ds_val_y=ds_val_y.reshape(568,1)

In [12]:
linreg=Linear_Regression()

In [13]:
linreg.fit(ds_train_X,ds_train_y)

In [14]:
pred=linreg.predict(ds_val_X)

In [15]:
linreg.r2(pred,ds_val_y)

0.9252275854226435