In [25]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
from scipy import stats
# Seed Python's built-in `random` module. The train/test split further down
# passes its own random_state, so this only affects direct uses of `random`.
random.seed(42)

In [26]:
# Read the ice-cream sales dataset and preview its first rows.
data = pd.read_csv(filepath_or_buffer="../../data/ice_cream_sales.csv")
data.head(n=5)

Unnamed: 0,ID,Temperature,Is_Weekend,Hours_Open,Electricity_Usage,Ice_Cream_Sales
0,1,24.273285,False,10,92.118314,983
1,2,25.503474,False,9,83.917817,1018
2,3,24.370024,False,8,88.290617,951
3,4,24.377495,False,10,85.561865,1012
4,5,26.632614,False,8,94.404976,1010


In [27]:
# Pearson correlation between temperature and ice-cream sales.
temperature, sales = data["Temperature"], data["Ice_Cream_Sales"]
pearsonr(temperature, sales)

(0.6627341330556741, 6.8575705768942765e-47)

In [28]:
# Features: every column except the target, the weekend flag and the row ID.
non_feature_cols = ["Ice_Cream_Sales", "Is_Weekend", "ID"]
X = data.drop(columns=non_feature_cols)
y = data.loc[:, "Ice_Cream_Sales"]

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Linear Regression Approach

# Least-squares linear model with an intercept, fitted on the training split.
model = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Score the held-out split and report the error in the target's own units.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Root Mean Squared Error: {sqrt(mse)}')

Root Mean Squared Error: 39.89084800825926


In [30]:
# Report the fitted intercept, then one coefficient per feature column.
print(f'Intercept: {model.intercept_}')
for feature, weight in zip(X.columns, model.coef_):
    print(f'{feature}: {weight}')

Intercept: -219.52018514362612
Temperature: 29.57052887008028
Hours_Open: 55.57459674364777
Electricity_Usage: -0.08938301919533856


In [31]:
# Intervention

# Simulate an intervention that raises electricity usage by a fixed amount
# for every row while leaving the observed sales untouched.
# The shifted values go into a new frame instead of overwriting `data`, so
# re-running this cell does not add the shift a second time.
ELECTRICITY_SHIFT = 20
data_intervened = data.assign(
    Electricity_Usage=data["Electricity_Usage"] + ELECTRICITY_SHIFT
)
X = data_intervened.drop(["Ice_Cream_Sales", "Is_Weekend", "ID"], axis=1)
y = data_intervened["Ice_Cream_Sales"]

In [32]:
# Error of the already-fitted model on the current X / y (the model is not refit).
y_hat = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=y_hat)

print(f'Root Mean Squared Error: {sqrt(mse)}')

Root Mean Squared Error: 37.894286967083715


In [33]:
# Two-sided confidence intervals for the fitted intercept and coefficients.
alpha = 0.05

# Parameter vector, intercept first so it lines up with the columns of X_aux
coefs = np.concatenate(([model.intercept_], model.coef_))

# Training design matrix with a leading column of ones for the intercept
X_aux = X_train.copy()
X_aux.insert(loc=0, column='const', value=1)

# degrees of freedom: number of rows minus number of columns (const included)
n_rows, n_cols = X_aux.shape
dof = n_rows - n_cols

# Student's t critical value for the two-sided (1 - alpha) interval
t_val = stats.t.isf(alpha / 2, dof)

# residual sum of squares on the training split, divided by dof
residuals = y_train - model.predict(X_train)
mse = np.sum(residuals ** 2) / dof

# diagonal of (X'X)^-1; scaled by mse below it gives each parameter's variance
var_params = np.diag(np.linalg.inv(X_aux.T.dot(X_aux)))

# half-width of each interval
gap = t_val * np.sqrt(mse * var_params)

conf_int = pd.DataFrame(
    {'lower': coefs - gap, 'upper': coefs + gap},
    index=X_aux.columns,
)
conf_int

Unnamed: 0,lower,upper
const,-290.230979,-148.809391
Temperature,26.985475,32.155583
Hours_Open,50.812114,60.33708
Electricity_Usage,-0.703143,0.524377


In [34]:
# Counterfactual

# Start again from the file on disk so earlier edits to `data` do not carry over.
data = pd.read_csv("../../data/ice_cream_sales.csv")

# Shift sales by 30 per degree of distance to a temperature of 30, using the
# ORIGINAL temperatures; this has to happen before Temperature is overwritten.
temperature_gap = 30 - data["Temperature"]
data["Ice_Cream_Sales"] = data["Ice_Cream_Sales"] + temperature_gap * 30
data["Temperature"] = 30

X = data.drop(columns=["Ice_Cream_Sales", "Is_Weekend", "ID"])
y = data["Ice_Cream_Sales"]

In [35]:
# Error of the already-fitted model on the counterfactual X / y (no refit).
y_hat = model.predict(X)
mse = mean_squared_error(y_true=y, y_pred=y_hat)

print(f'Root Mean Squared Error: {sqrt(mse)}')

Root Mean Squared Error: 37.92723693374946


In [36]:
# Two-sided confidence intervals for the fitted intercept and coefficients.
# NOTE: this uses only `model`, X_train and y_train. The model is not refit in
# the cells above, so the intervals depend on the original fit, not on the
# counterfactual X / y.
alpha = 0.05

# Parameter vector, intercept first so it lines up with the columns of X_aux
coefs = np.concatenate(([model.intercept_], model.coef_))

# Training design matrix with a leading column of ones for the intercept
X_aux = X_train.copy()
X_aux.insert(loc=0, column='const', value=1)

# degrees of freedom: number of rows minus number of columns (const included)
n_rows, n_cols = X_aux.shape
dof = n_rows - n_cols

# Student's t critical value for the two-sided (1 - alpha) interval
t_val = stats.t.isf(alpha / 2, dof)

# residual sum of squares on the training split, divided by dof
residuals = y_train - model.predict(X_train)
mse = np.sum(residuals ** 2) / dof

# diagonal of (X'X)^-1; scaled by mse below it gives each parameter's variance
var_params = np.diag(np.linalg.inv(X_aux.T.dot(X_aux)))

# half-width of each interval
gap = t_val * np.sqrt(mse * var_params)

conf_int = pd.DataFrame(
    {'lower': coefs - gap, 'upper': coefs + gap},
    index=X_aux.columns,
)
conf_int

Unnamed: 0,lower,upper
const,-290.230979,-148.809391
Temperature,26.985475,32.155583
Hours_Open,50.812114,60.33708
Electricity_Usage,-0.703143,0.524377
