In [17]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals import joblib

from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

### Getting the data ready for Random Forest

In [34]:
# importing the preprocessed data for a quicker start

TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    df_original = pd.read_csv(TRAIN_FILE)
    df_original = df_original.drop(['Unnamed: 0'], axis=1)
    return df_original

def get_station_df():
     return pd.read_csv(STATIONS_FILE, header=None, names=['station','lat', 'long', 'elev'], sep=';')

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    df_out = df_src.copy()
    return df_out.join(df_stations.set_index(foreign_index), on=src_index)


def add_day_of_year_column(df_src, column_name='date'):
    df_out = df_src.copy()
    df_out['day'] = df_out[column_name].apply(lambda d: date_to_nth_day(str(d)))
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    date = datetime.strptime(date, format)
    new_year_day = datetime(year=date.year, month=1, day=1)
    return (date - new_year_day).days + 1

In [27]:
df = pd.read_csv('../data/tmp/export_LSMT_MAX_yd.csv', index_col=0, low_memory=False)

In [3]:
# pick random stations for test and training
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)
stations_shuffled = stations
fraction = 64
stations_train = stations_shuffled[:int(np.round(len(stations)/fraction))]
stations_holdout14 = stations_shuffled[int(np.round(len(stations)/fraction)):int(np.round(len(stations)/fraction*2))]
stations_holdout15 = stations_shuffled[int(np.round(len(stations)/fraction*2)):int(np.round(len(stations)/fraction*3))]
stations_holdout16 = stations_shuffled[int(np.round(len(stations)/fraction*3)):int(np.round(len(stations)/fraction*4))]

df_17 = df[df['station'].isin(stations_train)]
df_14 = df[df['station'].isin(stations_holdout14)]
df_15 = df[df['station'].isin(stations_holdout15)]
df_16 = df[df['station'].isin(stations_holdout16)]

In [4]:
print(len(df_17), len(df_14), len(df_15), len(df_16))

288148 285757 280102 287220


In [5]:
#divide test and training to test effective of model to different timeframe (start of 2017)
training_years = [2014,2015,2016]
testing_days = list(range(90))

df_train17 = df_17[df_17['year'].isin(training_years)]
df_test17 = df_17[~df_17['year'].isin(training_years)]
df_test17 = df_test17[df_test17['day'].isin(testing_days)]
print(df_train17.shape,df_test17.shape)

training_years = [2017,2015,2016]
df_train14 = df_14[df_14['year'].isin(training_years)]
df_test14 = df_14[~df_14['year'].isin(training_years)]
df_test14 = df_test14[df_test14['day'].isin(testing_days)]
print(df_train14.shape,df_test14.shape)

training_years = [2017,2014,2016]
df_train15 = df_15[df_15['year'].isin(training_years)]
df_test15 = df_15[~df_15['year'].isin(training_years)]
df_test15 = df_test15[df_test15['day'].isin(testing_days)]
print(df_train15.shape,df_test15.shape)

training_years = [2017,2015,2014]
df_train16 = df_16[df_16['year'].isin(training_years)]
df_test16 = df_16[~df_16['year'].isin(training_years)]
df_test16 = df_test16[df_test16['day'].isin(testing_days)]
print(df_train16.shape,df_test16.shape)

(217112, 7) (17658, 7)
(211770, 7) (18086, 7)
(207350, 7) (17889, 7)
(215511, 7) (17361, 7)


In [6]:
#define split for CV later on
split = [[df_train17.index.values, df_test17.index.values], [df_train16.index.values, df_test16.index.values],
         [df_train15.index.values, df_test15.index.values],[df_train14.index.values, df_test14.index.values]]

In [7]:
#seperate target from features

df_X_raw = df.drop(columns='TMIN')
sy = df['TMIN']
y_raw = sy.reshape(-1,1)

In [8]:
df_X_raw.shape

(18683824, 6)

In [9]:
# int encode stations
#LB = LabelBinarizer()
#df_X['station'] = LB.fit_transform(df_X[['station']])
df_X_red = df_X_raw.drop(columns='station')

In [10]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [11]:
# normalize features
X_raw = df_X_red.values
y_raw = y_raw.astype('float32')

scaler = MinMaxScaler(feature_range=(0, 1))                             
X = scaler.fit_transform(X_raw)
y = scaler.fit_transform(y_raw).ravel()

In [None]:
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 10, 'bootstrap': True}

In [14]:
param_grid = {"n_estimators": [50, 100, 150],
    "max_depth":[7, 10, 13],
    "max_features": [2, 3, 4],
    "min_samples_split": [10],
    "min_samples_leaf": [10, 15],
    "bootstrap": [True]}


model = RandomForestRegressor(random_state=0)
grid = RandomizedSearchCV(estimator=model, verbose=2, param_distributions=param_grid, n_iter=25, cv=split, n_jobs=-1)
grid.fit(X, y)

print(grid.best_score_)
print(grid.best_params_)

Fitting 4 folds for each of 25 candidates, totalling 100 fits
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True 
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True 
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True 
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True, total=  56.2s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=2, max_depth=13, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True, total=  55.1s
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=4, max_depth=10, bootstrap=True, total=  53.

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.2min


[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True, total=  59.8s
[CV]  n_estimators=100, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True, total=  56.2s
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True 
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True, total=  57.4s
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True 
[CV]  n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=4, max_depth=13, bootstrap=True, total= 1.3min
[CV] n_estimators=100, min_samples_split=15,

[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=15, max_features=2, max_depth=10, bootstrap=True, total=  29.4s
[CV] n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=2, max_depth=7, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=2, max_depth=7, bootstrap=True, total=  24.3s
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=2, max_depth=10, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=2, max_depth=7, bootstrap=True, total=  23.6s
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=2, max_depth=10, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, min_samples_leaf=10, max_features=2, max_depth=7, bootstrap=True, total=  23.8s
[CV] n_estimators=150, min_samples_split=15, min_samples_leaf=10, max_features=2, max_depth=10, bootstrap=True 
[CV]  n_estimators=100, min_samples_split=10, mi

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 17.3min finished


0.7531046783052038
{'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 15, 'max_features': 2, 'max_depth': 13, 'bootstrap': True}


In [19]:
joblib.dump(grid, '../models/max/simple_random_forest.pkl')

['../models/max/simple_random_forest.pkl']

## Predict

In [35]:
SUBMISSION_PATH = '../data/2018_test.csv'
def load_submission_file():
    df_test = pd.read_csv(SUBMISSION_PATH)
    return df_test

def prepare_submission_file(df_test):
    df_stations = get_station_df()
    df_out = add_coordinates(df_test, df_stations, src_index='ID', foreign_index='station')
    df_out = add_day_of_year_column(df_out, column_name='DATE')
    return df_out
    
def save_submission(df_src, PATH):
    df_submission = pd.DataFrame()
    df_submission['SUB_ID'] = df_src['DATE'].apply(lambda d: str(d)) + df_src['ID']
    df_submission['DATA_VALUE'] = df_src['DATA_VALUE']
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [38]:
def generate_predictions_from_simple_random_forest_model():
    model = joblib.load('../models/max/simple_random_forest.pkl')
    required_features = [
        'day',
        'year',
        'lat',
        'long',
        'elev',
        ]
    PREDICITON_FILE_PATH = '../data/predictions/prediction_simple_random_forest_MAX.csv'

    df_test = load_submission_file()
    df_test = prepare_submission_file(df_test)
    df_test['year'] = 2018
    df_test.head()

    # create predictions
    df_predict = df_test
    df_predict['DATA_VALUE'] = model.predict(df_test[required_features])

    #save predictions
    df_submission = save_submission(df_predict, PREDICITON_FILE_PATH)

In [39]:
generate_predictions_from_simple_random_forest_model()