In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.metrics import mean_squared_error

from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

### Getting the data ready for LMST

In [2]:
# importing the preprocessed data for a quicker start
df = pd.read_csv('export_LSMT_MAX_yd.csv', index_col=0, low_memory=False)

In [3]:
# importing the preprocessed data for a quicker start

TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    df_original = pd.read_csv(TRAIN_FILE)
    df_original = df_original.drop(['Unnamed: 0'], axis=1)
    return df_original

def get_station_df():
     return pd.read_csv(STATIONS_FILE, header=None, names=['station','lat', 'long', 'elev'], sep=';')

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    df_out = df_src.copy()
    return df_out.join(df_stations.set_index(foreign_index), on=src_index)


def add_day_of_year_column(df_src, column_name='date'):
    df_out = df_src.copy()
    df_out['day'] = df_out[column_name].apply(lambda d: date_to_nth_day(str(d)))
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    date = datetime.strptime(date, format)
    new_year_day = datetime(year=date.year, month=1, day=1)
    return (date - new_year_day).days + 1

In [4]:
# pick random stations for test and training
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)
stations_shuffled = stations
fraction = 64
stations_train = stations_shuffled[:int(np.round(len(stations)/fraction))]
stations_holdout14 = stations_shuffled[int(np.round(len(stations)/fraction)):int(np.round(len(stations)/fraction*2))]
stations_holdout15 = stations_shuffled[int(np.round(len(stations)/fraction*2)):int(np.round(len(stations)/fraction*3))]
stations_holdout16 = stations_shuffled[int(np.round(len(stations)/fraction*3)):int(np.round(len(stations)/fraction*4))]

df_17 = df[df['station'].isin(stations_train)]
df_14 = df[df['station'].isin(stations_holdout14)]
df_15 = df[df['station'].isin(stations_holdout15)]
df_16 = df[df['station'].isin(stations_holdout16)]

In [5]:
print(len(df_17), len(df_14), len(df_15), len(df_16))

288148 285757 280102 287220


In [6]:
#divide test and training to test effective of model to different timeframe (start of 2017)
training_years = [2014,2015,2016]
testing_days = list(range(90))

df_train17 = df_17[df_17['year'].isin(training_years)]
df_test17 = df_17[~df_17['year'].isin(training_years)]
df_test17 = df_test17[df_test17['day'].isin(testing_days)]
print(df_train17.shape,df_test17.shape)

training_years = [2017,2015,2016]
df_train14 = df_14[df_14['year'].isin(training_years)]
df_test14 = df_14[~df_14['year'].isin(training_years)]
df_test14 = df_test14[df_test14['day'].isin(testing_days)]
print(df_train14.shape,df_test14.shape)

training_years = [2017,2014,2016]
df_train15 = df_15[df_15['year'].isin(training_years)]
df_test15 = df_15[~df_15['year'].isin(training_years)]
df_test15 = df_test15[df_test15['day'].isin(testing_days)]
print(df_train15.shape,df_test15.shape)

training_years = [2017,2015,2014]
df_train16 = df_16[df_16['year'].isin(training_years)]
df_test16 = df_16[~df_16['year'].isin(training_years)]
df_test16 = df_test16[df_test16['day'].isin(testing_days)]
print(df_train16.shape,df_test16.shape)

(217112, 7) (17658, 7)
(211770, 7) (18086, 7)
(207350, 7) (17889, 7)
(215511, 7) (17361, 7)


In [7]:
#define split for CV later on
split = [[df_train17.index.values, df_test17.index.values], [df_train16.index.values, df_test16.index.values],
         [df_train15.index.values, df_test15.index.values],[df_train14.index.values, df_test14.index.values]]

In [8]:
#seperate target from features
df_X_raw = df.drop(columns='TMIN')
sy = df['TMIN']
y_raw = sy.reshape(-1,1)

In [9]:
df_X_raw.shape

(18683824, 6)

In [10]:
# int encode stations
#LB = LabelBinarizer()
#df_X['station'] = LB.fit_transform(df_X[['station']])
df_X_red = df_X_raw.drop(columns='station')

In [11]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [12]:
# normalize features
X_raw = df_X_red.values
y_raw = y_raw.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))                             
X = scaler.fit_transform(X_raw)
y = scaler.fit_transform(y_raw).ravel()

In [13]:
# reshape input to be 3D [samples, timesteps, features]
X = X.reshape((len(X),1,5))

# LSTMs

In [42]:
# grid search for optimal parameters: batch and epochs
def create_model(layer_width=50, dropout_rate=0.2, learning_rate=0.1):
    model = Sequential()
    model.add(LSTM(layer_width, input_shape=(X.shape[1],X.shape[2])))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    adam = Adam(lr=learning_rate, decay=0.1, amsgrad=False)
    # Compile model
    model.compile(loss='mae', optimizer=adam, metrics=['accuracy'])
    return model

np.random.seed(seed)
# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

# define the grid search parameters
batch_size = [100, 200, 500]
epochs = [20, 50, 100]
learning_rate= [0.01,0.03,0.05, 0.1, 0.2]
dropout_rate = [0.0, 0.2, 0.4, 0.6, 0.8]
layer_width = [25, 50, 75, 100]

# learning_rate=learning_rate
param_grid = dict(batch_size=batch_size, epochs=epochs, learning_rate=learning_rate, dropout_rate=dropout_rate)
grid = RandomizedSearchCV(estimator=model, verbose=2, param_distributions=param_grid, cv=split, n_iter=50, n_jobs=-1)
grid.fit(X, y, verbose=2)
# summarize results


print(grid.best_score_)
print(grid.best_params_)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500 .
[CV] learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500 .
[CV] learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500 .
[CV] learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500 .
[CV]  learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500, total= 1.9min
[CV] learning_rate=0.1, epochs=50, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500, total= 1.9min
[CV]  learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500, total= 2.0min
[CV] learning_rate=0.1, epochs=50, dropout_rate=0.8, batch_size=100 ..
[CV] learning_rate=0.1, epochs=50, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.01, epochs=20, dropout_rate=0.4, batch_size=500, total= 2.1min
[CV] learning_rate=0.1, epochs=50, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.1, epochs

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 46.0min


[CV] learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.01, epochs=50, dropout_rate=0.6, batch_size=100, total= 9.7min
[CV] learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.01, epochs=50, dropout_rate=0.6, batch_size=100, total= 9.8min
[CV] learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.01, epochs=50, dropout_rate=0.6, batch_size=100, total= 9.8min
[CV] learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100 ..
[CV]  learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100, total= 4.2min
[CV]  learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100, total= 4.0min
[CV]  learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100, total= 4.1min
[CV]  learning_rate=0.1, epochs=20, dropout_rate=0.8, batch_size=100, total= 3.4min


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed: 51.1min finished


0.004859565491096622
{'learning_rate': 0.1, 'epochs': 50, 'dropout_rate': 0.8, 'batch_size': 100}


In [68]:
grid.estimator.save('../models/max/simple_LSTM_MAX.h5')
#joblib.dump(grid, '../models/max/simple_LSTM_MAX.pkl')

AttributeError: 'KerasClassifier' object has no attribute 'save'

array([[0.00174481],
       [0.00174481],
       [0.00174481],
       ...,
       [0.00174481],
       [0.00174481],
       [0.00174481]], dtype=float32)

## Predict

In [56]:
SUBMISSION_PATH = '../data/2018_test_org.csv'
def load_submission_file():
    df_test = pd.read_csv(SUBMISSION_PATH)
    return df_test

def prepare_submission_file(df_test):
    df_stations = get_station_df()
    df_out = add_coordinates(df_test, df_stations, src_index='ID', foreign_index='station')
    df_out = add_day_of_year_column(df_out, column_name='DATE')
    return df_out
    
def save_submission(df_src, PATH):
    df_submission = pd.DataFrame()
    df_submission['SUB_ID'] = df_src['DATE'].apply(lambda d: str(d)) + df_src['ID']
    df_submission['DATA_VALUE'] = df_src['DATA_VALUE']
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [104]:
def generate_predictions_from_simple_random_forest_model():
    model = grid
    required_features = [
        'day',
        'year',
        'lat',
        'long',
        'elev',
        ]
    PREDICITON_FILE_PATH = '../data/predictions/prediction_simple_LSTM_MAX.csv'

    df_test = load_submission_file()
    df_test = prepare_submission_file(df_test)
    df_test['year'] = 2018
    df_test.head()

    # create predictions
    df_predict = df_test
#    df_predict = df_predict.values 
    X_predict = df_predict.drop(columns=['ID', 'DATE'])
    X_predict = X_predict.values
    X_predict = scaler.fit_transform(X_predict)
    X_predict = X_predict.reshape((len(X_predict),1,5))
    df_predict['DATA_VALUE'] = scaler.inverse_transform(model.predict(X_predict)).ravel()
    


    

    #save predictions
    df_submission = save_submission(df_predict, PREDICITON_FILE_PATH)

In [105]:
generate_predictions_from_simple_random_forest_model()

ValueError: non-broadcastable output operand with shape (397804,1) doesn't match the broadcast shape (397804,5)

In [83]:
prepare_submission_file(load_submission_file()).head()

Unnamed: 0,ID,DATE,lat,long,elev,day
0,ASN00015643,20180101,-22.4518,133.6377,565.6,1
1,ASN00085296,20180101,-37.7481,147.1428,480.0,1
2,ASN00085280,20180101,-38.2094,146.4747,55.7,1
3,CA005030984,20180101,52.8167,-97.6167,223.0,1
4,CA003076680,20180101,55.1,-117.2,698.0,1
