# Ordinary Least Squares (OLS)

- Extension of [Regression with Seasonal ARIMA](https://github.com/Brinkley97/book-forecasting_and_control/blob/main/part_1/4-linear_nonstationary_models/regressionWithSeasonalARIMA.ipynb);
- Same code up to OLS calculation
- Mathematical Representation of OLS
    
    \begin{align}
    \hat{\beta} = (X^TX)^{-1}X^Ty,
    \space where 
    \end{align}

    \begin{align}
    \hat{\beta} \space (1)
    \end{align}

    \begin{align}
    X^T \space (2) 
    \end{align}

    \begin{align}
    y \space (3)
    \end{align}

    - (1) OLS Estimator (also called coefficient vector of outcomes)
    - (2) Maxtrix Regressor variable X with a transpose, respectively
    - (3) vector of the value of the response variable

# Load data from extension

In [1]:
extension = "regressionWithSeasonalARIMA.ipynb"

In [2]:
# %load extension
regressionWithSeasonalARIMA.ipynb

In [3]:
%run regressionWithSeasonalARIMA.ipynb

In [4]:
air_quality_df

Unnamed: 0_level_0,DateTime,CO_GT,PT08_S1_CO,NMHC_GT,C6H6_GT,PT08_S2_NMHC,NOx_GT,PT08_S3_NOx,NO2_GT,PT08_S4_NO2,PT08_S5_O3,T,RH,AH
DateTimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,03-10-04 18:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,03-10-04 19:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,03-10-04 20:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,03-10-04 21:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,03-10-04 22:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-04-04 10:00:00,04-04-05 10:00,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
2005-04-04 11:00:00,04-04-05 11:00,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
2005-04-04 12:00:00,04-04-05 12:00,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
2005-04-04 13:00:00,04-04-05 13:00,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [5]:
print("dataset_len: ", dataset_len)
print("split_data: ", split_data)
print("train_set_end_date: ", train_set_end_date)

dataset_len:  9357
split_data:  8421
train_set_end_date:  2005-02-24 15:00:00


In [6]:
df_train

Unnamed: 0_level_0,DateTime,CO_GT,PT08_S1_CO,NMHC_GT,C6H6_GT,PT08_S2_NMHC,NOx_GT,PT08_S3_NOx,NO2_GT,PT08_S4_NO2,PT08_S5_O3,T,RH,AH
DateTimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-03-10 18:00:00,03-10-04 18:00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
2004-03-10 19:00:00,03-10-04 19:00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2004-03-10 20:00:00,03-10-04 20:00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
2004-03-10 21:00:00,03-10-04 21:00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
2004-03-10 22:00:00,03-10-04 22:00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-02-24 11:00:00,02-24-2005 11:00:00,1.8,1093.0,-200.0,6.5,836.0,414.0,728.0,232.0,1142.0,1025.0,4.6,82.0,0.6997
2005-02-24 12:00:00,02-24-2005 12:00:00,1.4,1060.0,-200.0,5.4,784.0,314.0,770.0,196.0,1117.0,941.0,5.5,79.5,0.7238
2005-02-24 13:00:00,02-24-2005 13:00:00,1.7,1121.0,-200.0,7.4,873.0,317.0,697.0,195.0,1206.0,1000.0,6.7,76.2,0.7519
2005-02-24 14:00:00,02-24-2005 14:00:00,1.9,1118.0,-200.0,7.7,888.0,290.0,685.0,189.0,1223.0,962.0,7.9,69.0,0.7394


In [7]:
df_test

Unnamed: 0_level_0,DateTime,CO_GT,PT08_S1_CO,NMHC_GT,C6H6_GT,PT08_S2_NMHC,NOx_GT,PT08_S3_NOx,NO2_GT,PT08_S4_NO2,PT08_S5_O3,T,RH,AH
DateTimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2005-02-24 16:00:00,02-24-2005 16:00:00,3.0,1274.0,-200.0,13.3,1094.0,507.0,537.0,267.0,1364.0,1428.0,7.6,70.8,0.7399
2005-02-24 17:00:00,02-24-2005 17:00:00,3.0,1301.0,-200.0,13.2,1092.0,534.0,525.0,262.0,1388.0,1484.0,7.2,72.4,0.7372
2005-02-24 18:00:00,02-24-2005 18:00:00,3.9,1405.0,-200.0,17.5,1224.0,601.0,468.0,270.0,1503.0,1658.0,6.7,74.2,0.7326
2005-02-24 19:00:00,02-24-2005 19:00:00,4.0,1375.0,-200.0,17.7,1230.0,583.0,473.0,260.0,1522.0,1632.0,6.1,75.8,0.7184
2005-02-24 20:00:00,02-24-2005 20:00:00,3.9,1367.0,-200.0,17.5,1225.0,551.0,485.0,265.0,1492.0,1628.0,6.3,74.9,0.7187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2005-04-04 10:00:00,04-04-05 10:00,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
2005-04-04 11:00:00,04-04-05 11:00,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
2005-04-04 12:00:00,04-04-05 12:00,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
2005-04-04 13:00:00,04-04-05 13:00,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [8]:
# of df_train 
    # X = T and RH
    # y = PT08_S4_NO2
    # beta = (X^TX)^{-1}X^Ty

In [9]:
t = df_train["T"]
rh = df_train["RH"]
X = pd.concat([t, rh])
y = df_train["PT08_S4_NO2"]
df_train_len = len(df_train)

In [10]:
beta = (X * X.T)**(-1) * X.T * y
beta

DateTimeIndex
2004-03-10 18:00:00    124.411765
2004-03-10 18:00:00     34.601227
2004-03-10 19:00:00    117.218045
2004-03-10 19:00:00     32.683438
2004-03-10 20:00:00    130.672269
                          ...    
2005-02-24 13:00:00     15.826772
2005-02-24 14:00:00    154.810127
2005-02-24 14:00:00     17.724638
2005-02-24 15:00:00    152.823529
2005-02-24 15:00:00     19.592760
Length: 16844, dtype: float64

In [11]:
beta.sum() 

1120640.7929805794

In [12]:
# NOT getting the coefficients
# Intercept    902.977354
# T              9.791778
# AH           384.485262

# t = df_train["T"]
# rh = df_train["RH"]
# X = pd.concat([t, rh])
# y = df_train["PT08_S4_NO2"]

In [13]:
# https://medium.com/analytics-vidhya/ordinary-least-square-ols-method-for-linear-regression-ef8ca10aadfc
# -- error in her calculations with y-yi and X-xi
sales_data = {'Sales': [651, 762, 853, 1062, 1190, 1293], 'Advertising': [25, 28, 35, 40, 46, 53]}
my_sales_data = pd.DataFrame(data=sales_data)
my_sales_data.index += 1
my_sales_data

Unnamed: 0,Sales,Advertising
1,651,25
2,762,28
3,853,35
4,1062,40
5,1190,46
6,1293,53


In [54]:
# https://towardsdatascience.com/linear-regression-from-scratch-cd0dee067f72


BASE = '/Users/brinkley97/Documents/development/'
path_to_dataset = 'book-forecasting_and_control_by_4_Gs/datasets/'
name_of_dataset = 'headSizeVsBrainWeight.csv'
hb_dataset = BASE + path_to_dataset + name_of_dataset
my_head_brain_data = pd.read_csv(hb_dataset)
copy_hd_df = my_head_brain_data.rename(columns={"Head Size(cm^3)":"Head", "Brain Weight(grams)":"Brain"})
copy_hd_df

Unnamed: 0,Gender,Age Range,Head,Brain
0,1,1,4512,1530
1,1,1,3738,1297
2,1,1,4261,1335
3,1,1,3777,1282
4,1,1,4177,1590
...,...,...,...,...
232,2,2,3214,1110
233,2,2,3394,1215
234,2,2,3233,1104
235,2,2,3352,1170


In [85]:
def intercept(X, y, df_train_len):
    """
    
    Parameters:
    X -- pd Df
    y - pd Series
    
    Return
    intercept of b1_hat and b2_hat
    """
    
    y_mean = np.mean(y)
    y_errors = y - y_mean

    X_mean = np.mean(X)
    X_errors = X - X_mean
    
    b1_hat = np.sum(X_errors * y_errors) / np.sum(X_errors**2)
    b0_hat = y_mean - (b1_hat * X_mean)
    
    y_predictions = b0_hat + b1_hat * X
    
    mse = ((y_predictions - y)**2).sum()
    # root_mse = mse**1/2
    root_mse = np.sqrt(mse/df_train_len)
    
    ssr = np.sum(y_errors**2)
    sst = np.sum((y - y_predictions)**2)
    
    r_squared = sst / ssr
    score = 1 - r_squared
    return b1_hat, b0_hat, mse, root_mse, r_squared, score

In [86]:
intercept(X, y, df_train_len)

(1.41506084600539,
 1425.5770296475148,
 1506291506.9515905,
 422.90896845656545,
 1.9829958937404903,
 -0.9829958937404903)

In [87]:
my_X = my_sales_data.loc[0:, "Advertising"]
my_y = my_sales_data.loc[0:, "Sales"]
my_df_len = len(my_sales_data)
intercept(my_X, my_y, my_df_len)

(23.414014598540145,
 82.66978102189773,
 6953.487883211676,
 34.0428354753529,
 0.02173688394172334,
 0.9782631160582766)

In [88]:
my_head_X = copy_hd_df["Head"].values
my_brain_y = copy_hd_df["Brain"].values
my_hb_df_len = len(copy_hd_df)
intercept(my_head_X, my_brain_y, my_hb_df_len)

(0.2634293394893993,
 325.5734210494428,
 1232728.0146365524,
 72.1206213783709,
 0.360688280043,
 0.639311719957)