In [1]:
# multiple linear regression
import pandas as pd
# Sample data set: 30 observations of three predictors (discount, temperature,
# advertisement) and the response variable (sales). Column order matters for
# display only; the regression below refers to columns by name.
df = pd.DataFrame(dict(
    discount=[
        28, 24, 13, 0, 27, 30, 10, 16, 6, 5,
        7, 11, 11, 30, 25, 4, 7, 24, 19, 21,
        6, 10, 26, 13, 15, 6, 12, 6, 20, 2,
    ],
    temperature=[
        15, 34, 15, 22, 29, 30, 14, 17, 28, 29,
        19, 19, 34, 10, 29, 28, 12, 25, 32, 28,
        22, 16, 30, 11, 16, 18, 16, 33, 12, 22,
    ],
    advertisement=[
        342, 666, 224, 764, 148, 499, 711, 596, 797, 484,
        986, 347, 146, 362, 642, 591, 846, 260, 560, 941,
        469, 309, 730, 305, 892, 147, 887, 526, 525, 884,
    ],
    sales=[
        635, 958, 525, 25, 607, 872, 858, 732, 1082, 863,
        904, 686, 699, 615, 893, 830, 856, 679, 918, 951,
        789, 583, 988, 631, 866, 549, 910, 946, 647, 943,
    ],
))
# Preview the first five rows.
df.head()

Unnamed: 0,discount,temperature,advertisement,sales
0,28,15,342,635
1,24,34,666,958
2,13,15,224,525
3,0,22,764,25
4,27,29,148,607


In [3]:
from statsmodels.formula.api import ols
# Fit an ordinary-least-squares regression of sales on the three predictors
# using the formula interface, then display the full fit summary.
model = ols(
    formula='sales ~ advertisement + discount + temperature',
    data=df,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared:,0.4
Model:,OLS,Adj. R-squared:,0.33
Method:,Least Squares,F-statistic:,5.77
Date:,"Thu, 28 Nov 2024",Prob (F-statistic):,0.00366
Time:,22:58:27,Log-Likelihood:,-194.11
No. Observations:,30,AIC:,396.2
Df Residuals:,26,BIC:,401.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,267.6609,129.985,2.059,0.050,0.472,534.849
advertisement,0.4148,0.125,3.310,0.003,0.157,0.672
discount,4.2068,3.549,1.185,0.247,-3.089,11.503
temperature,9.4798,4.098,2.313,0.029,1.057,17.903

0,1,2,3
Omnibus:,56.788,Durbin-Watson:,1.647
Prob(Omnibus):,0.0,Jarque-Bera (JB):,419.005
Skew:,-3.845,Prob(JB):,1.0299999999999999e-91
Kurtosis:,19.616,Cond. No.,2580.0


In [4]:
# Q1: Pearson correlation coefficient between the discount and temperature columns.
df['discount'].corr(other=df['temperature'], method='pearson')

0.09392795276277474

In [9]:
# Q2: R-squared of the fitted model, rounded to two decimal places.
import numpy as np
np.round(model.rsquared, decimals=2)

0.4

In [12]:
# Q3: estimated coefficients of the fitted model (the intercept plus one
# slope for each predictor named in the regression formula).
model.params

Intercept        267.660902
advertisement      0.414827
discount           4.206829
temperature        9.479843
dtype: float64

In [13]:
# Q4: estimated intercept term, looked up by label in the coefficient Series.
model.params.loc['Intercept']

267.6609019248274

In [16]:
# Q5: p-value of the temperature coefficient, rounded to four decimal places.
np.round(model.pvalues.loc['temperature'], decimals=4)

0.0289

In [18]:
# Q6: point prediction of sales for a single new observation.
# NOTE(review): advertisement=500000 lies far outside the values present in
# the data frame above (146 to 986) -- confirm the intended units/value.
new_data = pd.DataFrame([
    {'discount': 10, 'temperature': 20, 'advertisement': 500000},
])
# The prediction comes back as a one-element Series; take its only value.
model.predict(new_data).iloc[0]

207912.6107404532

In [20]:
# SSE: sum of the squared residuals of the fitted model.
model.resid.pow(2).sum()

732197.8988142071

In [21]:
# MSE: arithmetic mean of the squared residuals, i.e. SSE divided by the
# number of observations.
# NOTE(review): this is not the same quantity as an MSE that divides by the
# residual degrees of freedom -- confirm which definition is wanted here.
model.resid.pow(2).mean()

24406.596627140236

In [22]:
# Confidence intervals for every coefficient at alpha = 0.05
# (column 0 is the lower bound, column 1 the upper bound).
model.conf_int(0.05)

Unnamed: 0,0,1
Intercept,0.472382,534.849422
advertisement,0.15721,0.672443
discount,-3.089233,11.502892
temperature,1.056864,17.902822


In [26]:
# Confidence interval for the mean response and prediction interval for a
# single observation, both at alpha = 0.05, for one new set of predictor values.
new_data = pd.DataFrame([
    {'advertisement': 45, 'discount': 10, 'temperature': 20},
])
pred = model.get_prediction(new_data)
# mean_ci_* columns bound the mean response; obs_ci_* columns bound a new observation.
pred.summary_frame(alpha=0.05)

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,517.993249,75.613396,362.567688,673.418811,139.648259,896.33824


In [28]:
# p-value of the advertisement coefficient. The original note reads "reject":
# presumably the null hypothesis of a zero coefficient is rejected because the
# p-value is small -- the significance level is not stated here, confirm it.
model.pvalues.loc['advertisement']

0.0027398053085787755