In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

### Getting the data ready for Random Forest

In [25]:
# Relative paths of the input files read by the loader helpers below.

# Station metadata table (read by get_station_df).
STATIONS_FILE = '../data/ghcnd-stations.csv'
# Preprocessed training export (read by get_original_df).
TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'

def get_original_df(path=None):
    """Load the preprocessed training export into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. When omitted, the module-level ``TRAIN_FILE`` is
        used, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents without the ``'Unnamed: 0'`` column.

    Raises
    ------
    KeyError
        If the file has no ``'Unnamed: 0'`` column.
    """
    csv_path = TRAIN_FILE if path is None else path
    df_original = pd.read_csv(csv_path)
    # 'Unnamed: 0' is the name pandas gives to a first column whose header
    # cell is empty (presumably a row index written by an earlier to_csv).
    return df_original.drop(columns=['Unnamed: 0'])

def get_station_df(path=None):
    """Load the station metadata table.

    The file is read as a header-less, semicolon-separated CSV and its four
    columns are named ``station``, ``lat``, ``long`` and ``elev``.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the module-level ``STATIONS_FILE`` is
        used, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file with the four columns named above.
    """
    csv_path = STATIONS_FILE if path is None else path
    return pd.read_csv(
        csv_path,
        header=None,
        names=['station', 'lat', 'long', 'elev'],
        sep=';',
    )

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Attach the station metadata columns to every row of ``df_src``.

    Parameters
    ----------
    df_src : pandas.DataFrame
        Frame holding a station identifier in column ``src_index``.
    df_stations : pandas.DataFrame
        Station table holding the identifier in column ``foreign_index``.
    src_index : str
        Name of the key column in ``df_src``.
    foreign_index : str
        Name of the key column in ``df_stations``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of ``df_src`` followed by the remaining
        columns of ``df_stations``. It is a left join: rows without a
        matching station are kept and get NaN in the added columns.
    """
    # DataFrame.join already returns a new object and leaves both inputs
    # untouched, so no defensive copy of the (potentially huge) source frame
    # is needed.
    stations_by_key = df_stations.set_index(foreign_index)
    return df_src.join(stations_by_key, on=src_index)


def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of ``df_src`` with an added 1-based ``'day'`` column.

    Parameters
    ----------
    df_src : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column holding dates written as ``YYYYMMDD`` (integers or strings).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_src`` whose ``'day'`` column is the day of the year
        (January 1st is 1) as int64.

    Raises
    ------
    ValueError
        If a value cannot be parsed with the ``%Y%m%d`` format.
    """
    df_out = df_src.copy()
    # Parse the whole column at once instead of calling strptime per row;
    # on multi-million-row frames the Python-level loop dominates runtime.
    parsed = pd.to_datetime(df_out[column_name].astype(str), format='%Y%m%d')
    # Cast explicitly so the dtype does not depend on the pandas version.
    df_out['day'] = parsed.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Convert a date string into its 1-based day of the year.

    ``date`` is parsed with ``datetime.strptime`` using ``format``;
    January 1st maps to 1.
    """
    parsed = datetime.strptime(date, format)
    # tm_yday is the ordinal day within the year, starting at 1.
    return parsed.timetuple().tm_yday

In [6]:
# Load the training export through the shared TRAIN_FILE constant instead of
# repeating the path literal; the first CSV column becomes the index.
df = pd.read_csv(TRAIN_FILE, index_col=0, low_memory=False)

In [7]:
# Randomly partition the station IDs into `fraction` groups of roughly equal
# size: one training group and three hold-out groups.
# NOTE(review): the 14/15/16/17 suffixes look like years, but the groups are
# formed by station, not by year — confirm the naming.
seed = 93598357
np.random.seed(seed)  # fixed seed so the shuffle below is repeatable
stations = df.station.unique()
np.random.shuffle(stations)  # shuffles in place
stations_shuffled = stations  # alias of the shuffled array, not a copy
fraction = 4

# Slice boundaries at 0/4, 1/4, 2/4, 3/4 and 4/4 of the station count.
_group_size = len(stations) / fraction
_bounds = [int(np.round(_group_size * k)) for k in range(fraction + 1)]
(stations_train,
 stations_holdout14,
 stations_holdout15,
 stations_holdout16) = [
    stations_shuffled[lo:hi] for lo, hi in zip(_bounds[:-1], _bounds[1:])
]

# Rows of `df` belonging to each station group.
df_17, df_14, df_15, df_16 = [
    df[df['station'].isin(group)]
    for group in (stations_train, stations_holdout14,
                  stations_holdout15, stations_holdout16)
]

In [8]:
# Row counts of the four station groups, space-separated on one line.
print(*(len(part) for part in (df_17, df_14, df_15, df_16)))

4639908 4680661 4705872 4657383


In [34]:
# Restrict the data to a day-of-year window to test how well the model
# carries over to a different time frame (start of 2017).
# NOTE(review): `training_years` is defined but never applied to the filter
# below — confirm whether a year restriction was intended.
training_years = [2014, 2015, 2016]
# NOTE(review): range(90) yields 0..89; if 'day' is 1-based (as produced by
# date_to_nth_day) this selects days 1-89 only — confirm.
testing_days = list(range(90))

_in_day_window = df['day'].isin(testing_days)
df_train = df[_in_day_window]
print(df_train.shape)

(4591256, 7)


In [10]:
#define split for CV later on
#split = [[df_train17.index.values, df_test17.index.values], [df_train16.index.values, df_test16.index.values],
#         [df_train15.index.values, df_test15.index.values],[df_train14.index.values, df_test14.index.values]]

In [11]:
# Separate the target (TMIN) from the features.

df_X_raw = df_train.drop(columns='TMIN')
sy = df_train['TMIN']
# Series.reshape has been removed from pandas (AttributeError on current
# versions); reshape the underlying ndarray into a single-column 2-D array.
y_raw = sy.values.reshape(-1, 1)

In [12]:
# Sanity check: (rows, columns) of the feature frame after removing TMIN.
df_X_raw.shape

(4591256, 6)

In [13]:
# The station identifier is removed from the features rather than encoded
# (an earlier LabelBinarizer attempt on df_X['station'] was abandoned).
df_X_red = df_X_raw.drop('station', axis=1)

In [14]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [15]:
# Normalize features and target to the [0, 1] range.
X_raw = df_X_red.values
y_raw = y_raw.astype('float32')

# Use one scaler per array: refitting a single scaler on y overwrites the
# statistics learned from X, which makes it impossible to scale new feature
# rows the same way at prediction time.
scaler_X = MinMaxScaler(feature_range=(0, 1))
scaler_y = MinMaxScaler(feature_range=(0, 1))
X = scaler_X.fit_transform(X_raw)
y = scaler_y.fit_transform(y_raw).ravel()

# Backward compatibility: `scaler` previously ended up fitted on the target.
scaler = scaler_y

In [16]:
# NOTE(review): bare dict literal that is only echoed as cell output and is
# not passed to any estimator in this notebook; presumably a record of
# previously tuned hyper-parameters — confirm or remove.
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 10, 'bootstrap': True}

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 100}

In [17]:
# Fit an AdaBoost regressor on the scaled data (fit() returns the estimator).
model = AdaBoostRegressor(n_estimators=100, random_state=0).fit(X, y)

# Score on the very data the model was fitted on — not a hold-out estimate.
train_score = model.score(X, y)
print(train_score)

0.34205893804327847


In [19]:
# Persist the fitted model; the value of joblib.dump is shown as cell output.
ADABOOST_MODEL_PATH = '../models/max/adaboosting.pkl'
joblib.dump(model, ADABOOST_MODEL_PATH)

['../models/max/adaboosting.pkl']

## Bagging

## Predict

In [30]:
SUBMISSION_PATH = '../data/2018_test_org.csv'
def load_submission_file(path=None):
    """Read the submission template CSV.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. When omitted, the module-level ``SUBMISSION_PATH`` is
        used, so existing zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as parsed by ``pandas.read_csv`` with defaults.
    """
    csv_path = SUBMISSION_PATH if path is None else path
    return pd.read_csv(csv_path)

def prepare_submission_file(df_test):
    """Enrich the submission frame with station metadata and day of year.

    The station table from ``get_station_df`` is joined onto the ``'ID'``
    column of ``df_test``; afterwards a ``'day'`` column is derived from the
    ``'DATE'`` column. The enriched frame is returned.
    """
    with_coords = add_coordinates(
        df_test,
        get_station_df(),
        src_index='ID',
        foreign_index='station',
    )
    return add_day_of_year_column(with_coords, column_name='DATE')
    
def save_submission(df_src, PATH):
    """Write the two-column submission CSV and return the written frame.

    ``SUB_ID`` is the string form of ``DATE`` concatenated with ``ID``;
    ``DATA_VALUE`` is copied from ``df_src``. The file at ``PATH`` is written
    without the index column.
    """
    submission_ids = df_src['DATE'].map(str) + df_src['ID']
    submission = pd.DataFrame({
        'SUB_ID': submission_ids,
        'DATA_VALUE': df_src['DATA_VALUE'],
    })
    submission.to_csv(PATH, index=False)
    return submission

In [31]:
def generate_predictions_from_simple_random_forest_model():
    """Predict DATA_VALUE for the submission file and write the result CSV.

    Despite its name, this loads the AdaBoost model stored at
    '../models/max/adaboosting.pkl'. The submission frame is loaded and
    enriched with station coordinates and day of year, ``year`` is set to
    2018, predictions are stored in a ``DATA_VALUE`` column and the result is
    written to '../data/predictions/prediction_adaboost_MAX.csv'.

    Returns
    -------
    None
    """
    # joblib.load unpickles the file: only load model files you trust.
    model = joblib.load('../models/max/adaboosting.pkl')
    required_features = ['day', 'year', 'lat', 'long', 'elev']
    PREDICTION_FILE_PATH = '../data/predictions/prediction_adaboost_MAX.csv'

    df_test = prepare_submission_file(load_submission_file())
    df_test['year'] = 2018

    # FIXME(review): the notebook fits the model on MinMax-scaled features
    # and a MinMax-scaled target, but raw features are passed here and the
    # output is not inverse-transformed. The feature order used for training
    # is also not visible here — confirm it matches `required_features`.
    df_test['DATA_VALUE'] = model.predict(df_test[required_features])

    # save_submission writes the CSV; its return value is not needed here.
    save_submission(df_test, PREDICTION_FILE_PATH)

In [32]:
# Run the prediction pipeline; the submission CSV is written as a side effect.
generate_predictions_from_simple_random_forest_model()