In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

In [None]:
#COLUMNS = ['station','date','feature', 'value', 'measurement','quality', 'source', 'hour']
COLUMNS_test = ['station','date']

In [None]:
# load data
df_train = pd.read_csv('../data/export_features_loc_MAX.csv', index_col=0, low_memory=False)

In [None]:
df_train['date'] = pd.to_datetime(df_train['date'], format='%Y%m%d', errors='ignore')
df_train.head()

In [None]:
# Do you want to use past days as predictor?
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [None]:
#get year and date as features
df = df_train
df = df_train_yd.drop(columns='date')
df['year'] = df_train['date'].map(lambda x: x.year)
df['day'] = df_train['date'].map(lambda x: x.timetuple().tm_yday)
df.head(20)

In [None]:
# export data to reduce preproccesing duration
#df_train_yd.to_csv('../data/tmp/export_LSMT_MAX_yd.csv')

### Getting the data ready for LMST

In [2]:
# importing the preprocessed data for a quicker start
df = pd.read_csv('../data/tmp/export_LSMT_MAX_yd.csv', index_col=0, low_memory=False)

In [3]:
# pick random stations for test and training
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)
stations_shuffled = stations
stations_train = stations_shuffled[:2000]
stations_holdout = stations_shuffled[2000:4000]

df_train_test = df[df['station'].isin(stations_train)]
df_train_test.head()

Unnamed: 0,station,TMIN,lat,long,elev,year,day
4039,AG000060590,30,30.5667,2.8667,397.0,2014,1
4040,AG000060590,31,30.5667,2.8667,397.0,2014,2
4041,AG000060590,36,30.5667,2.8667,397.0,2014,3
4042,AG000060590,60,30.5667,2.8667,397.0,2014,4
4043,AG000060590,50,30.5667,2.8667,397.0,2014,5


In [4]:
#divide test and training to test effective of model to different timeframe (start of 2017)
training_years = [2014,2015,2016]
testing_days = range(80)

df_train = df_train_test[df_train_test['year'].isin(training_years)]
df_test = df_train_test[df_train_test['day'].isin(testing_days)]
print(df_train.shape,df_test.shape)

(1792641, 7) (514252, 7)


In [5]:
#seperate target from features
df_X_train_raw = df_train.drop(columns='TMIN')
df_X_test_raw = df_test.drop(columns='TMIN')
sy_train = df_train['TMIN']
sy_test = df_test['TMIN']
y_train_raw = sy_train.reshape(-1,1)
y_test_raw = sy_test.reshape(-1,1)

In [6]:
df_X_train_raw.shape

(1792641, 6)

In [7]:
# int encode stations
#LB = LabelBinarizer()
#df_X['station'] = LB.fit_transform(df_X[['station']])
df_X_train_red = df_X_train_raw.drop(columns='station')
df_X_test_red = df_X_test_raw.drop(columns='station')

In [8]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [9]:
# normalize features
X_train_raw = df_X_train_red.values
X_test_raw = df_X_test_red.values
X_train_raw = X_train_raw.astype('float32')
X_test_raw = X_test_raw.astype('float32')
y_train_raw = y_train_raw.astype('float32')
y_test_raw = y_test_raw.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))                             
train_X = scaler.fit_transform(X_train_raw)
test_X = scaler.fit_transform(X_test_raw)
train_y = scaler.fit_transform(y_train_raw).ravel()
test_y = scaler.fit_transform(y_test_raw).ravel()

In [10]:
print(train_X.shape,train_y.shape,test_X.shape,test_y.shape)

(1792641, 5) (1792641,) (514252, 5) (514252,)


In [11]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(train_X, train_y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
regr.score(test_X, test_y)

-0.1626275646978168

In [13]:
yhat = regr.predict(test_X)

print(yhat.reshape(514252,1).shape, test_X.shape)

(514252, 1) (514252, 5)


In [14]:
# make a prediction
yhat = regr.predict(test_X)
# invert scaling for forecast
inv_yhat = np.concatenate((yhat.reshape(514252,1), test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)

Test RMSE: 135.783


In [None]:
param_grid = {"n_estimators": [200, 500],
    "max_depth": [3, None],
    "max_features": [3, 5],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 5],
    "bootstrap": [True, False]}

model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(estimator=model, verbose=2, param_grid=param_grid, n_jobs=-1)
grid.fit(train_X, train_y)
 
print(grid.best_score_)
print(grid.best_params_)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200 
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total= 2.9min
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200, total= 3.0min
[CV] bootstrap=True, max_depth=3, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=3, m

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 59.2min


[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total= 6.0min
[CV] bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=1, min_samples_split=5, n_estimators=500, total=14.2min
[CV] bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200 
[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=1, min_samples_split=5, n_estimators=500, total=14.3min
[CV] bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=200, total= 6.2min
[CV] bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=5, min_samples_split=2, n_estimators=500 
[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=

In [None]:
param_grid = {"n_estimators": [200, 500],
    "max_depth": [3, None],
    "max_features": [3, 5],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 5],
    "bootstrap": [True, False]}

model = RandomForestRegressor(random_state=0)
grid = GridSearchCV(estimator=model, verbose=2, param_grid=param_grid, n_jobs=-1)
grid.fit(train_X, train_y)
 
print(grid.best_score_)
print(grid.best_params_)

In [None]:
param_grid = {"n_estimators": [200, 500, 1000],
    "max_depth": [1, 2, 3, 5, None],
    "max_features": [1, 3, 5],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False]}

In [None]:
# make a prediction
yhat = model.predict(test_X)
# invert scaling for forecast
inv_yhat = np.concatenate((yhat.reshape(514252,1), test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = np.sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)