In [1]:
import random
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
caldata = pd.read_csv("weather.csv")
print(caldata)

            Date  TempHighF  TempAvgF  TempLowF DewPointHighF DewPointAvgF  \
0     2013-12-21         74        60        45            67           49   
1     2013-12-22         56        48        39            43           36   
2     2013-12-23         58        45        32            31           27   
3     2013-12-24         61        46        31            36           28   
4     2013-12-25         58        50        41            44           40   
...          ...        ...       ...       ...           ...          ...   
1314  2017-07-27        103        89        75            71           67   
1315  2017-07-28        105        91        76            71           64   
1316  2017-07-29        107        92        77            72           64   
1317  2017-07-30        106        93        79            70           68   
1318  2017-07-31         99        88        77            66           61   

     DewPointLowF HumidityHighPercent HumidityAvgPercent Humidi

In [3]:
caldata.isnull().sum()

Date                          0
TempHighF                     0
TempAvgF                      0
TempLowF                      0
DewPointHighF                 0
DewPointAvgF                  0
DewPointLowF                  0
HumidityHighPercent           0
HumidityAvgPercent            0
HumidityLowPercent            0
SeaLevelPressureHighInches    0
SeaLevelPressureAvgInches     0
SeaLevelPressureLowInches     0
VisibilityHighMiles           0
VisibilityAvgMiles            0
VisibilityLowMiles            0
WindHighMPH                   0
WindAvgMPH                    0
WindGustMPH                   0
PrecipitationSumInches        0
Events                        0
dtype: int64

In [32]:
class RainMLR():

    def __init__(self, alpha, iterations):
        self.iterations = iterations
        self.alpha = alpha
        self.features = []
        self.results = []
        self.predictions = []
        self.X = []
        self.Y = []

    def generateDataSet(self):
        df = pd.read_csv("weather.csv")
        results = 'PrecipitationSumInches'
        features = ['TempAvgF', 'DewPointAvgF', 'HumidityAvgPercent', 'SeaLevelPressureAvgInches', 'VisibilityAvgMiles', 'WindAvgMPH']
        for col in df.columns:
            if col not in results and col not in features:
                df.drop(col, axis=1, inplace=True)

        df = df[df[results]!="T"]
        for col in features:
            df = df[df[col]!="-"]

        for col in df.columns:
            if col != results:
                df[col] = (df[col].astype('float') - df[col].astype('float').min()) / (df[col].astype('float').max() - df[col].astype('float').min())

        bias = np.array([1 for _ in range(0, df.shape[0])])
        print(bias)
        tempX = np.asarray(df[features])
        print(tempX)

        X = np.matrix(np.c_[bias.T, tempX], dtype=float)
        Y = np.matrix(df[results], dtype=float).T
        
        self.results = results
        self.features = features
        return X, Y

    def findH(self, x, beta):
        h = np.sum([x[i]*beta[i] for i in range(len(x))])
        return h
    
    def findCost(self, X, Y, estBeta):
        cost = np.sum([(self.findH(X[i], estBeta)-Y[i])**2 for i in range(len(Y))])/2*len(Y)
        return cost

    def batchGradientDescent(self):
        X, Y = self.generateDataSet()
        self.X = np.array(X)
        X = np.array(X)
        self.Y = np.array(Y)
        Y = np.array(Y)
        estBeta = np.array([random.random() for _ in range(X.shape[1])])

        costs = []
        cost = self.findCost(X, Y, estBeta)
        costs.append(cost)
        
        for i in range(self.iterations):
            for j in range(X.shape[1]):
                estBeta[j] = estBeta[j] - (self.alpha/len(Y))*(np.sum([(self.findH(X[i],estBeta) - Y[i])*X[i][j] for i in range(len(Y))]))
            
            cost = self.findCost(X, Y, estBeta)
            costs.append(cost)

        for i in range(len(self.features)):
            print(f"Estimated Beta for {self.features[i]}: {estBeta[i]}")
            
        return estBeta
        
    def predict(self):
        predictions = []
        estimatedBeta = self.batchGradientDescent()
        for x in self.X: 
            predictions.append(self.findH(x, estimatedBeta))
            
        self.predictions = predictions
    
    def evaluate(self):
        sst = 0
        ssr = 0
        sse = 0
        
        y_bar = np.mean(self.Y)
        
        predictions = self.predictions
        
        for y in predictions:
            ssr += (y - y_bar)**2
        
        i = 0
        for y in self.Y:
            sse += (y - predictions[i])**2
            i += 1
            
        for y in self.Y:
            sst += (y - y_bar)**2
            
        print("\nSSE: ", sse)
        print("SSR: ", ssr)
        print("SSR + SSE: ", ssr+sse)
        print("SSTO: ", sst)
        
        r2 = 1 - ssr/(ssr + sse)
        print("R2 Metric: ", r2)
        t = (ssr*(1319-6-1))/(sse*6)
        print("T stats: ", t)

        adjusted_r2 = 1 - ((1 - r2) * (1319 - 1) / (1319 - 6 - 1))
        print("Adjusted R2 score: ", adjusted_r2)
        

In [34]:
model = RainMLR(0.1,500)
model.predict()
model.evaluate()

[1 1 1 ... 1 1 1]
[[0.484375   0.60294118 0.68571429 0.1092437  0.625      0.27272727]
 [0.296875   0.41176471 0.58571429 0.48739496 1.         0.45454545]
 [0.25       0.27941176 0.35714286 0.78991597 1.         0.18181818]
 ...
 [0.984375   0.82352941 0.34285714 0.2605042  1.         0.27272727]
 [1.         0.88235294 0.3        0.30252101 1.         0.27272727]
 [0.921875   0.77941176 0.22857143 0.35294118 1.         0.27272727]]
Estimated Beta for TempAvgF: -0.34589743739749307
Estimated Beta for DewPointAvgF: -0.0017956466736647185
Estimated Beta for HumidityAvgPercent: 0.24006753111691215
Estimated Beta for SeaLevelPressureAvgInches: 0.7228460561017894
Estimated Beta for VisibilityAvgMiles: 0.30411212488246925
Estimated Beta for WindAvgMPH: -0.3569662469075914

SSE:  [200.10788726]
SSR:  47.06105347542753
SSR + SSE:  [247.16894074]
SSTO:  [239.69267291]
R2 Metric:  [0.80959965]
T stats:  [51.42567759]
Adjusted R2 score:  [0.80872891]
