### Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_score


### Open weather_df in this notebook

In [2]:
# Restore weather_df from IPython's %store database into this kernel.
# NOTE(review): this only works if weather_df was saved beforehand with
# `%store weather_df` (presumably in an earlier notebook) - confirm that step
# has been run before doing Restart & Run All here.
%store -r weather_df

# Display the restored frame to check that it loaded as expected
weather_df

Unnamed: 0,year,month,tmin,tmax,rain,tmean
0,1948,1,3.3,8.9,85.0,6.10
1,1948,2,2.2,7.9,26.0,5.05
2,1948,3,3.8,14.2,14.0,9.00
3,1948,4,5.1,15.4,35.0,10.25
4,1948,5,6.9,18.1,57.0,12.50
...,...,...,...,...,...,...
865,2020,2,4.3,11.1,99.8,7.70
866,2020,3,3.9,12.0,42.8,7.95
867,2020,4,6.5,18.2,38.2,12.35
868,2020,5,9.1,21.1,2.0,15.10


### Split data into train and test sets 

In [3]:
# Choose the target column (tmean) and the feature columns (tmin, tmax, rain).
# NOTE(review): in the rows displayed above, tmean equals (tmin + tmax) / 2
# (e.g. (3.3 + 8.9) / 2 = 6.10), so tmin and tmax together determine the target
# exactly. That looks like target leakage - confirm whether they should be features.

target = 'tmean'
feature_cols = ['tmin', 'tmax', 'rain']

Y = weather_df[target]
X = weather_df[feature_cols]

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the same rows land in each split on every run,
# which keeps the results reproducible.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

# Show the shape of each split: targets first, then features.
print(Y_train.shape, Y_test.shape, X_train.shape, X_test.shape, sep='\n')

# The shapes should show an 80/20 split by rows and three feature columns.

(696,)
(174,)
(696, 3)
(174, 3)


### Establish Baseline MAE

In [4]:
# Establish a naive baseline MAE for the models to beat.
# The baseline predicts the mean tmean of the training rows for every observation,
# i.e. it assumes the long-run average is a reasonable estimate for any given month.

baseline_value = Y_train.mean()
y_pred = [baseline_value] * len(Y_train)
baseline_mae = mean_absolute_error(Y_train, y_pred)
print('Baseline MAE: ', round(baseline_mae, 5))

# MAE is the average absolute gap between actual and predicted values, so lower is better.
# The value printed above is the error a model has to come in under to be worth using
# (it is in the same units as tmean - presumably degrees Celsius, TODO confirm).

Baseline MAE:  4.51086


### Create Pipeline for Linear Regression Model

In [8]:
# Linear regression pipeline: standardise the features, then fit a linear model.

lm = make_pipeline(StandardScaler(), LinearRegression())

# Train on the training split
lm.fit(X_train, Y_train)

# Quick view of the performance on both splits
lm_train_mae = mean_absolute_error(Y_train, lm.predict(X_train))
lm_test_mae = mean_absolute_error(Y_test, lm.predict(X_test))

print('Linear Regression Training MAE:', round(lm_train_mae, 5))
print('Linear Regression Test MAE:', round(lm_test_mae, 5))

# NOTE(review): an MAE of exactly 0.0 on both splits would mean the target is an
# exact linear combination of the features. tmean looks like (tmin + tmax) / 2 in
# the displayed data, so this is probably target leakage rather than a good model -
# confirm before comparing against the baseline.

Linear Regression Training MAE: 0.0
Linear Regression Test MAE: 0.0


### Create Pipeline for Random Forest Regressor Model

In [10]:
# Create pipeline for Random Forest Regressor model.
# Steps: SelectKBest -> StandardScaler -> RandomForestRegressor.
# SelectKBest uses k='all', so every feature is kept for now. f_regression is passed
# explicitly because SelectKBest's default score function (f_classif) is intended
# for classification targets, whereas tmean is continuous.
# Start with 100 n_estimators (num of trees)
# and max_depth of 50 (maximum depth allowed for each tree).
# random_state is fixed to 42 (the same seed as the train/test split) so the forest,
# and therefore the MAE values printed below, are reproducible between runs.

rf = make_pipeline(
    SelectKBest(score_func = f_regression, k = 'all'),
    StandardScaler(),
    RandomForestRegressor(n_estimators = 100,
                          max_depth = 50,
                          random_state = 42),
    )

# Fit model to data
rf.fit(X_train, Y_train)

# Quick view of the performance on the training and test splits.
# A training MAE well below the test MAE would indicate overfitting.
print('Random Forest Regressor Training MAE:', round(mean_absolute_error(Y_train, rf.predict(X_train)), 5))
print('Random Forest Regressor  Test MAE:', round(mean_absolute_error(Y_test, rf.predict(X_test)), 5))

Random Forest Regressor Training MAE: 0.02895
Random Forest Regressor  Test MAE: 0.0698
