In [11]:
import pandas as pd
import statsmodels.api as sm
from bokeh.io import output_notebook
output_notebook()
from bokeh.plotting import figure
from bokeh.io import show
import numpy as np
%matplotlib inline 

# Load the spreadsheet 'Weddings.xlsx' and preview its first rows.
# print() is required here: a bare expression in the middle of a notebook
# cell is evaluated and then silently discarded.
df = pd.read_excel('Weddings.xlsx')
print(df.head())

# Fit an ordinary least squares regression of WeddingCost on CoupleIncome,
# BrideAge, Payor and Attendance. The summary is not requested here because a
# bare `model.summary()` would be discarded; it is printed further down.
model = sm.OLS.from_formula(
    'WeddingCost ~ CoupleIncome + BrideAge + Payor + Attendance',
    data=df,
).fit()

# Plot each residual against its observation position to eyeball whether the
# errors scatter evenly around zero.
residuals = model.resid
fig = figure(height=400, width=400)
# scatter() draws size-based circle markers by default; figure.circle() without
# a radius relies on the legacy size-based behaviour deprecated in newer Bokeh.
fig.scatter(np.arange(len(residuals)), residuals)
show(fig)

import numpy as np

# Bucket the residuals into 10 bins and draw one rectangle per bin:
# edges has one more entry than hist, so consecutive pairs give left/right.
hist, edges = np.histogram(model.resid, bins=10)
fig = figure(width=400, height=400)
fig.quad(
    left=edges[:-1],
    right=edges[1:],
    bottom=0,
    top=hist,
    line_color="white",
)
show(fig)

# Emit the full regression report as plain text.
print(model.summary())


### ASSUMPTIONS ###

# Coefficients on the variables
# Looking at the coefficients we can see that the wedding's cost is positively related to Income, Payor and Attendance
# That means that the greater these variables, the higher the cost
# At the same time the cost is negatively related to the Bride's age: the greater the age, the lower the cost

# Significance of the variables
# The p-values on all coefficients, except for the intercept, show that all variables have a significant effect on the cost
# Looking at p-values and coefficients it looks like the most important variables are the Couple's Income and the Attendance

# Quality of the model
# If we look at the simple linear regression that we used on Exercise 15.1,
# for the correlation between the cost of the wedding and the couple's income,
# we can see that the R-squared and the adjusted R-squared had the values of 0.689 and 0.675 respectively 
# Now, in the multiple linear regression the R-squared and adjusted R-squared have increased to 0.754 and 0.705
# Since R-squared never decreases when regressors are added, the rise in the adjusted R-squared (0.675 -> 0.705) is the better evidence that the additional independent variables improved the model
# The fact that they are close to 1 indicates that the model explains most of the variance in the dependent variable (about 75%)
# The remaining unexplained variance suggests there are still factors affecting the cost that are not included in the model

# Errors and residuals
# Looking at the bottom of the summary we can get information about the residuals
# From there we can get assumptions about the data distribution and behavior
# Looking at the distribution of error on the scatterplot and histogram we can get the following assumptions:
# Errors seem to be evenly spread around 0 but, visually, do not look normally distributed
# However the p-value of the Jarque-Bera test is 0.1, which is greater than the significance level of 0.05
# So, we cannot reject the null hypothesis that the residuals are normally distributed

# Model
# Following the model from Session15 Tutorial our model can now be described as:
# Wedding's Cost = -3048.3092 +0.3238*CoupleIncome -233.3111*BrideAge +3712.8716*Payor +54.1315*Attendance

# Example
# Attendance = 175
# CoupleIncome = 100,000
# BrideAge = 33
# Payor = 0

# Coefficients copied from the printed regression summary.
intercept = -3048.3092
coef_income = 0.3238
coef_bride_age = -233.3111
coef_payor = 3712.8716
coef_attendance = 54.1315

# Example couple: income 100,000, bride aged 33, payor 0, 175 guests.
couple_income = 100000
bride_age = 33
payor = 0
attendance = 175

# Predicted wedding cost for the example couple (terms summed left to right).
Budget = (
    intercept
    + coef_income * couple_income
    + coef_bride_age * bride_age
    + coef_payor * payor
    + coef_attendance * attendance
)
print(Budget)

                            OLS Regression Results                            
Dep. Variable:            WeddingCost   R-squared:                       0.754
Model:                            OLS   Adj. R-squared:                  0.705
Method:                 Least Squares   F-statistic:                     15.31
Date:                Tue, 01 Oct 2019   Prob (F-statistic):           6.97e-06
Time:                        23:08:48   Log-Likelihood:                -254.71
No. Observations:                  25   AIC:                             519.4
Df Residuals:                      20   BIC:                             525.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept    -3048.3092   1.16e+04     -0.263   