In [1]:
# (Re)load the nb_black notebook extension.
# NOTE(review): presumably this auto-formats each executed cell with black -- confirm nb_black is installed in the kernel environment.
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [3]:
import warnings
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor


def print_vif(x):
    """Print the variance inflation factor (VIF) of every column of ``x``.

    ``sm.add_constant`` is applied first, so any intercept column it adds is
    part of the design matrix and gets its own entry in the printed series.

    :param x: input features to check; assumed to be a pandas.DataFrame
    :return: None -- the VIFs are printed as a pandas Series, not returned
    """
    # add_constant can emit a numpy FutureWarning (about .ptp); keep it quiet
    # without touching the global warning filters.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    # One VIF per column position of the design matrix, labelled by column name.
    vif_by_column = pd.Series(
        [
            variance_inflation_factor(design.values, position)
            for position in range(design.shape[1])
        ],
        index=design.columns,
    )

    print("VIF results\n-------------------------------")
    print(vif_by_column)
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Connection settings for the weather database. Each credential can be
# overridden through an environment variable so that secrets do not have to
# live in the notebook; the literals are only fallbacks that keep
# "Restart & Run All" working unchanged.
# NOTE(review): the fallback password is still committed here -- rotate it and
# drop the default once the environment variables are provisioned.
postgres_user = os.environ.get("DSBC_POSTGRES_USER", "dsbc_student")
postgres_pw = os.environ.get("DSBC_POSTGRES_PW", "7*.8G9QH21")
postgres_host = os.environ.get("DSBC_POSTGRES_HOST", "142.93.121.174")
postgres_port = os.environ.get("DSBC_POSTGRES_PORT", "5432")
postgres_db = "weatherinszeged"
table_name = "weatherinszeged"

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

try:
    # table_name is a constant defined in this cell (not user input), so
    # formatting it into the statement is acceptable here.
    df = pd.read_sql_query("select * from {}".format(table_name), con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # dispose of the engine even when the query raises.
    engine.dispose()

<IPython.core.display.Javascript object>

In [5]:
# Work on an independent (deep) copy so the frame loaded from the database stays untouched
weather = df.copy(deep=True)

<IPython.core.display.Javascript object>

In [6]:
# Target variable: apparent temperature minus measured temperature
weather["temp_diff"] = weather["apparenttemperature"].sub(weather["temperature"])

<IPython.core.display.Javascript object>

In [7]:
# Predictors (humidity, windspeed) and target (temp_diff) for the first model
X = weather.loc[:, ["humidity", "windspeed"]]
y = weather.loc[:, "temp_diff"]

<IPython.core.display.Javascript object>

In [8]:
# Design matrix = features plus an intercept column (prepended)
X_const = sm.add_constant(X, prepend=True)

model = sm.OLS(endog=y, exog=X_const).fit()

model.summary()
# Interpretation: no, these R-squared values are not satisfactory. With an
# R-squared of about 0.29 the model explains less than a third of the variance
# in temp_diff.

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.288
Model:,OLS,Adj. R-squared:,0.288
Method:,Least Squares,F-statistic:,19490.0
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,13:11:52,Log-Likelihood:,-170460.0
No. Observations:,96453,AIC:,340900.0
Df Residuals:,96450,BIC:,340900.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4381,0.021,115.948,0.000,2.397,2.479
humidity,-3.0292,0.024,-126.479,0.000,-3.076,-2.982
windspeed,-0.1193,0.001,-176.164,0.000,-0.121,-0.118

0,1,2,3
Omnibus:,3935.747,Durbin-Watson:,0.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4613.311
Skew:,-0.478,Prob(JB):,0.0
Kurtosis:,3.484,Cond. No.,88.1


<IPython.core.display.Javascript object>

In [9]:
# Put predictors and target side by side and show their pairwise correlations
test_df = pd.concat([X, y], axis="columns")
test_df.corr()

Unnamed: 0,humidity,windspeed,temp_diff
humidity,1.0,-0.224951,-0.242212
windspeed,-0.224951,1.0,-0.411943
temp_diff,-0.242212,-0.411943,1.0


<IPython.core.display.Javascript object>

In [10]:
# Interaction feature: windspeed multiplied by humidity
test_df["ws*h"] = test_df["windspeed"].mul(test_df["humidity"])

<IPython.core.display.Javascript object>

In [11]:
# Every column except the target is a predictor
X = test_df.drop(columns=["temp_diff"])
y = test_df.loc[:, "temp_diff"]

<IPython.core.display.Javascript object>

In [12]:
# Refit with the windspeed*humidity interaction included
X_const = sm.add_constant(X, prepend=True)

model = sm.OLS(endog=y, exog=X_const).fit()

model.summary()
# Interpretation: this model shows a slight improvement over the previous one
# (R-squared 0.341 vs. 0.288).

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.341
Model:,OLS,Adj. R-squared:,0.341
Method:,Least Squares,F-statistic:,16660.0
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,13:14:02,Log-Likelihood:,-166690.0
No. Observations:,96453,AIC:,333400.0
Df Residuals:,96449,BIC:,333400.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0839,0.033,2.511,0.012,0.018,0.149
humidity,0.1775,0.043,4.133,0.000,0.093,0.262
windspeed,0.0905,0.002,36.797,0.000,0.086,0.095
ws*h,-0.2971,0.003,-88.470,0.000,-0.304,-0.291

0,1,2,3
Omnibus:,4849.937,Durbin-Watson:,0.265
Prob(Omnibus):,0.0,Jarque-Bera (JB):,9295.404
Skew:,-0.378,Prob(JB):,0.0
Kurtosis:,4.32,Cond. No.,193.0


<IPython.core.display.Javascript object>

In [13]:
# Bring visibility in from the weather frame as an additional predictor
test_df = test_df.assign(visibility=weather["visibility"])

<IPython.core.display.Javascript object>

In [14]:
# Predictors now include visibility; the target is unchanged
X = test_df.drop(columns=["temp_diff"])
y = test_df.loc[:, "temp_diff"]

<IPython.core.display.Javascript object>

In [15]:
# Refit with visibility added to the feature set
X_const = sm.add_constant(X, prepend=True)

model = sm.OLS(endog=y, exog=X_const).fit()

model.summary()
# Interpretation: another slight improvement (R-squared 0.364 vs. 0.341).
# Every p-value is reported as 0.000, so visibility is a statistically
# significant addition, although its coefficient (0.0646) is quite small.

0,1,2,3
Dep. Variable:,temp_diff,R-squared:,0.364
Model:,OLS,Adj. R-squared:,0.363
Method:,Least Squares,F-statistic:,13770.0
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,13:15:35,Log-Likelihood:,-165040.0
No. Observations:,96453,AIC:,330100.0
Df Residuals:,96448,BIC:,330100.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.1006,0.039,-28.459,0.000,-1.176,-1.025
humidity,0.8909,0.044,20.263,0.000,0.805,0.977
windspeed,0.1033,0.002,42.579,0.000,0.099,0.108
ws*h,-0.3164,0.003,-95.355,0.000,-0.323,-0.310
visibility,0.0646,0.001,58.051,0.000,0.062,0.067

0,1,2,3
Omnibus:,5328.364,Durbin-Watson:,0.288
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11525.074
Skew:,-0.373,Prob(JB):,0.0
Kurtosis:,4.52,Cond. No.,246.0


<IPython.core.display.Javascript object>

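According to AIC and BIC, our last and most accurate model is also the best of the three: it has the lowest AIC/BIC (330,100, versus 333,400 and 340,900 for the earlier models). However, the scores are still quite high. Note that AIC and BIC are only meaningful relative to other models fit on the same data, so it is the low R-squared (0.364) that indicates a pretty poorly performing model.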

In [16]:
# Connection settings for the house prices database. Each credential can be
# overridden through an environment variable so that secrets do not have to
# live in the notebook; the literals are only fallbacks that keep
# "Restart & Run All" working unchanged.
# NOTE(review): the fallback password is still committed here -- rotate it and
# drop the default once the environment variables are provisioned.
postgres_user = os.environ.get("DSBC_POSTGRES_USER", "dsbc_student")
postgres_pw = os.environ.get("DSBC_POSTGRES_PW", "7*.8G9QH21")
postgres_host = os.environ.get("DSBC_POSTGRES_HOST", "142.93.121.174")
postgres_port = os.environ.get("DSBC_POSTGRES_PORT", "5432")
postgres_db = "houseprices"
table_name = "houseprices"

engine = create_engine(
    "postgresql://{}:{}@{}:{}/{}".format(
        postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db
    )
)

try:
    # table_name is a constant defined in this cell (not user input), so
    # formatting it into the statement is acceptable here.
    df = pd.read_sql_query("select * from {}".format(table_name), con=engine)
finally:
    # no need for an open connection, as we're only doing a single query;
    # dispose of the engine even when the query raises.
    engine.dispose()

<IPython.core.display.Javascript object>

In [17]:
# Work on an independent (deep) copy so the frame loaded from the database stays untouched
house_prices_df = df.copy(deep=True)

<IPython.core.display.Javascript object>

In [18]:
# One-hot encode the two categorical columns used by the models. drop_first=True
# leaves out the first level of each variable so the dummies are not perfectly
# collinear with the intercept.
mszoning_dummies = pd.get_dummies(
    house_prices_df["mszoning"], prefix="mszoning", drop_first=True
)
street_dummies = pd.get_dummies(
    house_prices_df["street"], prefix="street", drop_first=True
)

# Names of all generated dummy columns (mszoning first, then street); later
# cells use this list to select the dummies as features.
dummy_column_names = list(mszoning_dummies.columns) + list(street_dummies.columns)

# Append the dummies to the frame. Dummy columns left over from a previous run
# of this cell are dropped first, so re-running the cell does not create
# duplicate columns (errors="ignore" makes the drop a no-op on the first run).
house_prices_df = pd.concat(
    [
        house_prices_df.drop(columns=dummy_column_names, errors="ignore"),
        mszoning_dummies,
        street_dummies,
    ],
    axis=1,
)

<IPython.core.display.Javascript object>

In [21]:
# Display the column index of the house prices frame
house_prices_df.columns

Index(['id', 'mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'alley', 'lotshape', 'landcontour', 'utilities', 'lotconfig',
       'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype',
       'housestyle', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd',
       'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype',
       'masvnrarea', 'exterqual', 'extercond', 'foundation', 'bsmtqual',
       'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1',
       'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating',
       'heatingqc', 'centralair', 'electrical', 'firstflrsf', 'secondflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paved

<IPython.core.display.Javascript object>

In [22]:
# Target: the sale price
y = house_prices_df.loc[:, "saleprice"]
# Features: six numeric predictors plus every generated dummy column
X = house_prices_df.loc[
    :,
    ["lotarea", "overallqual", "grlivarea", "garagecars", "garagearea", "totalbsmtsf"]
    + dummy_column_names,
]

# Prepend the intercept column expected by OLS
X = sm.add_constant(X, prepend=True)

results = sm.OLS(endog=y, exog=X).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.773
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,447.4
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,14:12:04,Log-Likelihood:,-17463.0
No. Observations:,1460,AIC:,34950.0
Df Residuals:,1448,BIC:,35010.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.344e+05,1.82e+04,-7.369,0.000,-1.7e+05,-9.86e+04
lotarea,0.5414,0.110,4.933,0.000,0.326,0.757
overallqual,2.401e+04,1088.703,22.054,0.000,2.19e+04,2.61e+04
grlivarea,43.0027,2.505,17.164,0.000,38.088,47.917
garagecars,1.348e+04,2966.671,4.545,0.000,7665.044,1.93e+04
garagearea,15.4682,10.321,1.499,0.134,-4.778,35.714
totalbsmtsf,26.1108,2.944,8.870,0.000,20.337,31.885
mszoning_FV,2.24e+04,1.36e+04,1.648,0.100,-4260.817,4.91e+04
mszoning_RH,1.091e+04,1.57e+04,0.694,0.488,-1.99e+04,4.18e+04

0,1,2,3
Omnibus:,436.524,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49049.354
Skew:,-0.213,Prob(JB):,0.0
Kurtosis:,31.392,Cond. No.,403000.0


<IPython.core.display.Javascript object>

`Run your house prices model again and assess the goodness of fit of your model using F-test, R-squared, adjusted R-squared, AIC and BIC.`

Our F-statistic is high, but maybe not that high considering the scale of house prices. The important part, the p-value, is close to 0, so we can at least be confident that the model is better than the reduced (intercept-only) model.

R-squared and adjusted R-squared are not the greatest. With many more features the values go up, but considering the complexity of the data there is a very high chance of overfitting. The model could just be capturing the high variance associated with this data.

Both AIC and BIC are quite high, possibly due to the somewhat high number of features.

`Do you think your model is satisfactory? If so, why?`
Overall, I think there are many improvements that could be made to the model, but I think they would require much more exploration of the data. There are probably a lot of interactions that could be built from the large number of categorical variables involved.

In [25]:
# Target: the sale price
y = house_prices_df.loc[:, "saleprice"]
# Features: same candidates as the previous model, minus three of the dummy
# columns (mszoning_RH, mszoning_RM, street_Pave)
X = house_prices_df.loc[
    :,
    ["lotarea", "overallqual", "grlivarea", "garagecars", "garagearea", "totalbsmtsf"]
    + dummy_column_names,
].drop(columns=["mszoning_RH", "mszoning_RM", "street_Pave"])
# Prepend the intercept column expected by OLS
X = sm.add_constant(X, prepend=True)

results = sm.OLS(endog=y, exog=X).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,615.4
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,14:23:48,Log-Likelihood:,-17464.0
No. Observations:,1460,AIC:,34950.0
Df Residuals:,1451,BIC:,34990.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.103e+05,4769.003,-23.123,0.000,-1.2e+05,-1.01e+05
lotarea,0.5135,0.107,4.799,0.000,0.304,0.723
overallqual,2.411e+04,1081.080,22.302,0.000,2.2e+04,2.62e+04
grlivarea,43.2715,2.491,17.371,0.000,38.385,48.158
garagecars,1.353e+04,2961.112,4.569,0.000,7722.036,1.93e+04
garagearea,14.3709,10.273,1.399,0.162,-5.781,34.522
totalbsmtsf,26.1474,2.941,8.890,0.000,20.378,31.917
mszoning_FV,1.608e+04,5536.049,2.905,0.004,5225.069,2.69e+04
mszoning_RL,1.788e+04,2834.227,6.307,0.000,1.23e+04,2.34e+04

0,1,2,3
Omnibus:,434.914,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,48213.282
Skew:,-0.21,Prob(JB):,0.0
Kurtosis:,31.149,Cond. No.,83900.0


<IPython.core.display.Javascript object>

In [33]:
# Modelling frame: selected numeric columns, the target and all dummies.
# Rows with a missing value in any of the selected columns are discarded.
model_df = house_prices_df.loc[
    :,
    [
        "lotarea",
        "overallqual",
        "grlivarea",
        "garagecars",
        "totalbsmtsf",
        "masvnrarea",
        "saleprice",
        "bedroomabvgr",
    ]
    + dummy_column_names,
].dropna(how="any")

<IPython.core.display.Javascript object>

In [34]:
# Target: the sale price
y = model_df.loc[:, "saleprice"]

# Features: every remaining column except the target and the three dummy
# columns left out of this model
X = model_df.drop(columns=["saleprice", "mszoning_RH", "mszoning_RM", "street_Pave"])
# Prepend the intercept column expected by OLS
X = sm.add_constant(X, prepend=True)

results = sm.OLS(endog=y, exog=X).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.783
Model:,OLS,Adj. R-squared:,0.782
Method:,Least Squares,F-statistic:,578.6
Date:,"Tue, 01 Sep 2020",Prob (F-statistic):,0.0
Time:,14:30:09,Log-Likelihood:,-17330.0
No. Observations:,1452,AIC:,34680.0
Df Residuals:,1442,BIC:,34730.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.123e+04,5836.482,-13.918,0.000,-9.27e+04,-6.98e+04
lotarea,0.5150,0.104,4.934,0.000,0.310,0.720
overallqual,2.188e+04,1080.406,20.256,0.000,1.98e+04,2.4e+04
grlivarea,52.7323,3.028,17.415,0.000,46.793,58.672
garagecars,1.492e+04,1692.170,8.820,0.000,1.16e+04,1.82e+04
totalbsmtsf,22.0419,2.872,7.674,0.000,16.408,27.676
masvnrarea,34.8774,6.128,5.692,0.000,22.857,46.898
bedroomabvgr,-9436.0875,1502.861,-6.279,0.000,-1.24e+04,-6488.060
mszoning_FV,1.805e+04,5474.574,3.297,0.001,7312.043,2.88e+04

0,1,2,3
Omnibus:,595.62,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,72188.752
Skew:,-0.882,Prob(JB):,0.0
Kurtosis:,37.498,Cond. No.,89300.0


<IPython.core.display.Javascript object>