In [1]:
# Import libraries
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sqlalchemy import create_engine

%matplotlib inline

# Filter warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Connection information
# Each setting is read from an environment variable so credentials do not
# have to be stored in the notebook. The fallback values are the ones that
# were previously hardcoded here, so the notebook behaves exactly as before
# when none of the variables are set.
user = os.environ.get('WEATHER_DB_USER', 'dsbc_student')
pw = os.environ.get('WEATHER_DB_PASSWORD', '7*.8G9QH21')
host = os.environ.get('WEATHER_DB_HOST', '142.93.121.174')
port = os.environ.get('WEATHER_DB_PORT', '5432')  # kept as a string; it is only formatted into the URL
db = os.environ.get('WEATHER_DB_NAME', 'weatherinszeged')

In [3]:
# Create the engine, read the whole table into a DataFrame, then release
# the engine's connections. dispose() sits in a `finally` block so the
# connections are released even if the query raises.
engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(user, pw, host, port, db))

try:
    weather = pd.read_sql_query('select * from weatherinszeged', con=engine)
finally:
    engine.dispose()

In [4]:
# Inspect column dtypes and non-null counts of the loaded table
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
date                   96453 non-null datetime64[ns, UTC]
summary                96453 non-null object
preciptype             96453 non-null object
temperature            96453 non-null float64
apparenttemperature    96453 non-null float64
humidity               96453 non-null float64
windspeed              96453 non-null float64
windbearing            96453 non-null float64
visibility             96453 non-null float64
loudcover              96453 non-null float64
pressure               96453 non-null float64
dailysummary           96453 non-null object
dtypes: datetime64[ns, UTC](1), float64(8), object(3)
memory usage: 8.8+ MB


In [5]:
# Summary statistics for the numeric columns
weather.describe()

Unnamed: 0,temperature,apparenttemperature,humidity,windspeed,windbearing,visibility,loudcover,pressure
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


In [6]:
# Preview the first rows of the table
weather.head()

Unnamed: 0,date,summary,preciptype,temperature,apparenttemperature,humidity,windspeed,windbearing,visibility,loudcover,pressure,dailysummary
0,2006-03-31 22:00:00+00:00,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-03-31 23:00:00+00:00,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 00:00:00+00:00,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 01:00:00+00:00,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 02:00:00+00:00,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


### Model 1 

In [7]:
# Predictors for Model 1: humidity and wind speed
X = weather[['humidity', 'windspeed']]

# Response: apparent temperature minus measured temperature.
# The same series is also stored on the frame as the 'target' column.
Y = weather['apparenttemperature'] - weather['temperature']
weather['target'] = Y

# Ordinary least squares with an intercept column added to the predictors
X = sm.add_constant(X)
results = sm.OLS(endog=Y, exog=X).fit()

# Show the regression report
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.288
Model:                            OLS   Adj. R-squared:                  0.288
Method:                 Least Squares   F-statistic:                 1.949e+04
Date:                Fri, 17 Jan 2020   Prob (F-statistic):               0.00
Time:                        19:11:24   Log-Likelihood:            -1.7046e+05
No. Observations:               96453   AIC:                         3.409e+05
Df Residuals:                   96450   BIC:                         3.409e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.4381      0.021    115.948      0.0

R-squared and adjusted R-squared are both 0.288, which is not satisfactory: the model explains only 28.8% of the variance in the target variable, leaving the remaining 71.2% unexplained.

### Model 2 

In [10]:
# Interaction term: element-wise product of humidity and wind speed,
# stored on the frame as a new column so it can be used as a predictor
weather['humid_wspd_interaction'] = weather['humidity'].mul(weather['windspeed'])

In [13]:
# Model 2 design matrix: the two original predictors plus their interaction,
# with an intercept column added in the same step
X2 = sm.add_constant(
    weather[['humidity', 'windspeed', 'humid_wspd_interaction']]
)

# Fit ordinary least squares against the same response as Model 1
results = sm.OLS(endog=Y, exog=X2).fit()

# Show the regression report
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.341
Model:                            OLS   Adj. R-squared:                  0.341
Method:                 Least Squares   F-statistic:                 1.666e+04
Date:                Fri, 17 Jan 2020   Prob (F-statistic):               0.00
Time:                        19:42:27   Log-Likelihood:            -1.6669e+05
No. Observations:               96453   AIC:                         3.334e+05
Df Residuals:                   96449   BIC:                         3.334e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      0

With the addition of the interaction between ```humidity``` and ```windspeed```, the R-squared and adjusted R-squared values both increased to 0.341, which is an improvement over the previous model (0.288).

### Model 3 

In [19]:
# Build the Model 3 design matrix from an independent copy of Model 2's.
# A plain `X3 = X2` makes both names refer to the same DataFrame, so adding
# the column below would also add it to X2.
X3 = X2.copy()
X3['visibility'] = weather['visibility']

In [20]:
# Preview the Model 3 design matrix to confirm the new column was added
X3.head()

Unnamed: 0,const,humidity,windspeed,humid_wspd_interaction,visibility
0,1.0,0.89,14.1197,12.566533,15.8263
1,1.0,0.86,14.2646,12.267556,15.8263
2,1.0,0.89,3.9284,3.496276,14.9569
3,1.0,0.83,14.1036,11.705988,15.8263
4,1.0,0.83,11.0446,9.167018,15.8263


In [21]:
# Fit Model 3 by ordinary least squares.
# NOTE(review): X3 already shows a 'const' column in its preview, so this
# add_constant call is presumably a no-op under statsmodels' default
# handling of an existing constant; confirm before removing it.
X3 = sm.add_constant(X3)
results = sm.OLS(endog=Y, exog=X3).fit()

# Show the regression report
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.363
Method:                 Least Squares   F-statistic:                 1.377e+04
Date:                Fri, 17 Jan 2020   Prob (F-statistic):               0.00
Time:                        19:47:16   Log-Likelihood:            -1.6504e+05
No. Observations:               96453   AIC:                         3.301e+05
Df Residuals:                   96448   BIC:                         3.301e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     -1

After including ```visibility``` in our model, both R-squared and adjusted R-squared increased. Comparing the adjusted R-squared values for Model 2 (0.341) and Model 3 (0.363), Model 3 is higher by 0.022, so we conclude that Model 3 is the more useful model.

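In addition, when comparing AIC and BIC across the three models (Model 1: 3.409e+05, Model 2: 3.334e+05, Model 3: 3.301e+05), we see that Model 3 has the lowest values for both criteria; lower AIC and BIC indicate a better trade-off between goodness of fit and model complexity.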