In [155]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime, timedelta

from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer, normalize
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


import warnings
# Ignore every FutureWarning raised from this point on. The filter is installed
# after the imports above have already executed.
warnings.simplefilter(action='ignore', category=FutureWarning) # to suppress import warnings

# Preprocessing Data

### Getting the data ready for the tree-based regressor

In [137]:
# Importing the preprocessed data for a quicker start.

# Path of the preprocessed training table; read by get_original_df().
TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'
# Path of the station table; read by get_station_df() as a header-less, ';'-separated CSV.
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    """Load the training CSV at TRAIN_FILE and return it without the 'Unnamed: 0' column.

    Returns:
        pandas.DataFrame: the file contents minus the 'Unnamed: 0' column.
    """
    raw_frame = pd.read_csv(TRAIN_FILE)
    return raw_frame.drop(columns=['Unnamed: 0'])

def get_station_df():
    """Read the station table at STATIONS_FILE.

    The file is parsed as a ';'-separated CSV without a header row; the
    columns are named 'station', 'lat', 'long' and 'elev'.

    Returns:
        pandas.DataFrame: one row per line of the station file.
    """
    station_columns = ['station', 'lat', 'long', 'elev']
    return pd.read_csv(STATIONS_FILE, sep=';', header=None, names=station_columns)

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Attach the station columns to every row of `df_src`.

    Args:
        df_src: frame holding a station-id column named `src_index`.
        df_stations: station table holding a station-id column named
            `foreign_index` plus the columns to attach.
        src_index: name of the station-id column in `df_src`.
        foreign_index: name of the station-id column in `df_stations`.

    Returns:
        A new DataFrame with the rows of `df_src` (left join) and the
        remaining columns of `df_stations` appended. Rows whose station id is
        not found in `df_stations` get NaN in the appended columns.
    """
    # DataFrame.join already returns a new frame and never modifies its
    # operands, so no defensive copy of df_src is needed beforehand.
    station_lookup = df_stations.set_index(foreign_index)
    return df_src.join(station_lookup, on=src_index)


def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of `df_src` with an added 'day' column (1-based day of year).

    Args:
        df_src: frame holding a date column whose values, once converted to
            str, are formatted as YYYYMMDD (e.g. 20180131 or '20180131').
        column_name: name of that date column.

    Returns:
        A copy of `df_src` with an int64 'day' column where 1 is January 1st.
        `df_src` itself is not modified.

    Raises:
        ValueError: if a value cannot be parsed as a YYYYMMDD date.
    """
    df_out = df_src.copy()
    # Parse the whole column at once instead of calling strptime row by row;
    # this matters on tables with millions of rows.
    parsed = pd.to_datetime(df_out[column_name].astype(str), format='%Y%m%d')
    # Cast explicitly so the dtype does not depend on the pandas version.
    df_out['day'] = parsed.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Convert a date string to its 1-based day of the year.

    Args:
        date: date string to parse.
        format: strptime format describing `date`.

    Returns:
        int: 1 for January 1st, up to 365 or 366 for December 31st.
    """
    parsed = datetime.strptime(date, format)
    # tm_yday is the 1-based ordinal of the day within its year.
    return parsed.timetuple().tm_yday

def select_max(results):
    """Return the (k, score) pair with the highest score.

    Args:
        results: iterable of (k, score) pairs.

    Returns:
        tuple: (best_k, best_score). On ties the first pair with the maximal
        score wins. (0, 0) is returned when `results` holds no comparable
        score, e.g. when it is empty.
    """
    # Start below every real score so that all-negative score lists (R^2 and
    # silhouette scores can be negative) still yield their true maximum
    # instead of the placeholder (0, 0).
    lowest = float('-inf')
    best_k, best_score = 0, lowest
    for k, s in results:
        if s > best_score:
            best_k, best_score = k, s
    if best_score == lowest:
        return (0, 0)
    return (best_k, best_score)

In [156]:
# Load the preprocessed training table, using its first CSV column as the index.
# The path comes from TRAIN_FILE so it is defined in one place only.
df = pd.read_csv(TRAIN_FILE, index_col=0, low_memory=False)

In [157]:
# Shuffle the unique station ids reproducibly and cut them into four consecutive groups.
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)   # shuffles the array in place
stations_shuffled = stations  # same array object, kept under a second name
fraction = 4

# Slice boundaries 0, n/4, 2n/4, 3n/4, n (each rounded with np.round).
cuts = [int(np.round(len(stations) / fraction * k)) for k in range(fraction + 1)]
stations_train, stations_holdout14, stations_holdout15, stations_holdout16 = (
    stations_shuffled[lo:hi] for lo, hi in zip(cuts[:-1], cuts[1:])
)

# The per-group station filters (df[df['station'].isin(stations_...)]) are
# disabled, so every fold frame currently is the full table.
df_17 = df_14 = df_15 = df_16 = df
df.head()

Unnamed: 0,station,TMIN,lat,long,elev,year,day
0,AE000041196,128,25.333,55.517,34.0,2014,1
1,AE000041196,145,25.333,55.517,34.0,2014,2
2,AE000041196,140,25.333,55.517,34.0,2014,3
3,AE000041196,162,25.333,55.517,34.0,2014,6
4,AE000041196,115,25.333,55.517,34.0,2014,9


In [66]:
# Row counts of the four fold frames, printed on one line.
print(*map(len, (df_17, df_14, df_15, df_16)))

18683824 18683824 18683824 18683824


In [89]:
# Keep only the rows whose day-of-year lies in the window the model is evaluated on
# (the start of the year).
# NOTE(review): range(90) covers the values 0..89. If 'day' is 1-based, as
# date_to_nth_day produces it, this selects days 1-89 only - confirm whether
# day 90 was meant to be included.
testing_days = list(range(0, 90))

day_mask = df['day'].isin(testing_days)
df_train = df[day_mask]
print(df_train.shape)

(4591256, 7)


In [90]:
# Build one (train, test) pair per fold to check how well the model transfers to a
# different time frame. Each fold trains on three years and tests on all remaining
# rows, always restricted to the day-of-year window `testing_days`.
def _split_by_training_years(frame, training_years, days):
    """Split `frame` into (train, test) frames restricted to the given days.

    Args:
        frame: DataFrame with 'year' and 'day' columns.
        training_years: values of 'year' that belong to the training part.
        days: values of 'day' to keep in both parts.

    Returns:
        tuple: (train, test) where train holds the rows whose 'year' is in
        `training_years` and test holds every other row; both contain only
        rows whose 'day' is in `days`. Row order and index labels of `frame`
        are preserved.
    """
    in_window = frame[frame['day'].isin(days)]
    is_train = in_window['year'].isin(training_years)
    return in_window[is_train], in_window[~is_train]


testing_days = list(range(90))

training_years = [2014,2015,2016]
df_train17, df_test17 = _split_by_training_years(df_17, training_years, testing_days)
print(df_train17.shape,df_test17.shape)

training_years = [2017,2015,2016]
df_train14, df_test14 = _split_by_training_years(df_14, training_years, testing_days)
print(df_train14.shape,df_test14.shape)

training_years = [2017,2014,2016]
df_train15, df_test15 = _split_by_training_years(df_15, training_years, testing_days)
print(df_train15.shape,df_test15.shape)

training_years = [2017,2015,2014]
df_train16, df_test16 = _split_by_training_years(df_16, training_years, testing_days)
print(df_train16.shape,df_test16.shape)


(3481227, 7) (1110029, 7)
(3412382, 7) (1178874, 7)
(3428517, 7) (1162739, 7)
(3451642, 7) (1139614, 7)


In [91]:
# Train/test index pairs for cross-validation later on, in the order 2017, 2016, 2015, 2014.
# NOTE(review): these are index *labels* of df; they only equal row positions
# if df's index is 0..n-1 - confirm before passing them as a `cv` argument.
split = [
    [train_part.index.values, test_part.index.values]
    for train_part, test_part in (
        (df_train17, df_test17),
        (df_train16, df_test16),
        (df_train15, df_test15),
        (df_train14, df_test14),
    )
]

In [99]:
# Separate the target column (TMIN) from the features, once for the day-window
# subset (df_train) and once for the full table (df, the *_cv variables).
df_X_raw = df_train.drop(columns='TMIN')
df_X_raw_cv = df.drop(columns='TMIN')
sy = df_train['TMIN']
sy_cv = df['TMIN']
# Series.reshape is deprecated/removed in pandas; reshape the underlying
# ndarray into a single column instead.
y_raw = sy.values.reshape(-1, 1)
y_raw_cv = sy_cv.values.reshape(-1, 1)

In [100]:
# Display (rows, columns) of the feature frame obtained after removing TMIN.
df_X_raw.shape

(4591256, 6)

In [101]:
# The station id is not encoded: the column is removed from both feature frames.
# A LabelBinarizer-based encoding is kept below, disabled:
#LB = LabelBinarizer()
#df_X['station'] = LB.fit_transform(df_X[['station']])
df_X_red, df_X_red_cv = (
    frame.drop(columns=['station']) for frame in (df_X_raw, df_X_raw_cv)
)

In [102]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [103]:
# Scale features and target to the range [0, 1].
X_raw = df_X_red.values
X_raw_cv = df_X_red_cv.values
y_raw = y_raw.astype('float32')
y_raw_cv = y_raw_cv.astype('float32')

# One scaler per array. Re-fitting a single MinMaxScaler four times discards
# the parameters of the earlier fits, which makes it impossible to scale new
# feature rows consistently or to map predictions back to TMIN units with
# inverse_transform. The scaled arrays are numerically the same as before.
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))
x_scaler_cv = MinMaxScaler(feature_range=(0, 1))
y_scaler_cv = MinMaxScaler(feature_range=(0, 1))

X = x_scaler.fit_transform(X_raw)
X_cv = x_scaler_cv.fit_transform(X_raw_cv)
y = y_scaler.fit_transform(y_raw).ravel()
y_cv = y_scaler_cv.fit_transform(y_raw_cv).ravel()

# Backward compatibility: `scaler` previously ended up fitted on y_raw_cv.
scaler = y_scaler_cv

In [104]:
# Hyper-parameter set displayed for reference only: the dict is not bound to a
# name and is not passed to the model fitted below, which uses different values.
# NOTE(review): presumably the result of an earlier parameter search - confirm.
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 10, 'bootstrap': True}

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 100}

In [107]:
# Fit gradient-boosted regression trees on the scaled day-window features and target.
gbr_params = dict(
    n_estimators=125,
    learning_rate=0.2,
    max_depth=5,
    max_features=3,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
    verbose=2,
)
model = GradientBoostingRegressor(**gbr_params)
model.fit(X, y)

# Score on the very data the model was fitted on (not a held-out estimate).
print(model.score(X,y))

      Iter       Train Loss   Remaining Time 
         1           0.0023            5.46m
         2           0.0019            4.14m
         3           0.0017            3.60m
         4           0.0016            3.54m
         5           0.0015            3.46m
         6           0.0014            3.99m
         7           0.0013            3.74m
         8           0.0012            3.58m
         9           0.0012            3.45m
        10           0.0011            3.32m
        11           0.0011            3.42m
        12           0.0010            3.29m
        13           0.0010            3.50m
        14           0.0010            3.45m
        15           0.0009            3.38m
        16           0.0009            3.61m
        17           0.0009            3.84m
        18           0.0009            3.72m
        19           0.0008            3.66m
        20           0.0008            3.74m
        21           0.0008            3.66m
        2

In [115]:
# Persist the fitted model; the prediction step reads it back from this path.
model_path = '../models/max/boosting_.pkl'
joblib.dump(model, model_path)

['../models/max/boosting_.pkl']

## Predict

In [116]:
SUBMISSION_PATH = '../data/2018_test_org.csv'
def load_submission_file():
    """Read the CSV at SUBMISSION_PATH with pandas' default settings.

    Returns:
        pandas.DataFrame: the contents of the submission template file.
    """
    return pd.read_csv(SUBMISSION_PATH)

def prepare_submission_file(df_test):
    """Add the station columns and a day-of-year column to the submission rows.

    Args:
        df_test: submission frame with an 'ID' column (station id) and a
            'DATE' column.

    Returns:
        A new frame: `df_test` joined with the station table on 'ID', plus a
        'day' column derived from 'DATE'.
    """
    with_coordinates = add_coordinates(
        df_test,
        get_station_df(),
        src_index='ID',
        foreign_index='station',
    )
    return add_day_of_year_column(with_coordinates, column_name='DATE')
    
def save_submission(df_src, PATH):
    """Write the two-column submission CSV and return the frame that was written.

    Args:
        df_src: frame with 'DATE', 'ID' and 'DATA_VALUE' columns.
        PATH: destination path of the CSV file.

    Returns:
        pandas.DataFrame with the columns 'SUB_ID' (str(DATE) followed by ID)
        and 'DATA_VALUE'; the file is written without the index column.
    """
    submission_ids = df_src['DATE'].apply(str) + df_src['ID']
    df_submission = pd.DataFrame({
        'SUB_ID': submission_ids,
        'DATA_VALUE': df_src['DATA_VALUE'],
    })
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [117]:
def generate_predictions_from_simple_random_forest_model():
    """Predict DATA_VALUE for the submission rows and write the submission CSV.

    Loads the persisted model from '../models/max/boosting_.pkl', prepares the
    submission file (station columns + day of year, year fixed to 2018),
    predicts one value per row and saves the result to
    '../data/predictions/prediction_boosting__MAX.csv'.

    Returns:
        None
    """
    # joblib.load unpickles the file - only load model files from a trusted source.
    model = joblib.load('../models/max/boosting_.pkl')
    # The column order must match the matrix the model was fitted on. That
    # matrix is the training table without 'TMIN' and 'station', i.e.
    # lat, long, elev, year, day. The model sees a bare array, so a different
    # order silently feeds every value into the wrong feature.
    required_features = [
        'lat',
        'long',
        'elev',
        'year',
        'day',
        ]
    PREDICTION_FILE_PATH = '../data/predictions/prediction_boosting__MAX.csv'

    df_predict = prepare_submission_file(load_submission_file())
    df_predict['year'] = 2018

    # NOTE(review): the model was fitted on MinMax-scaled features and a
    # MinMax-scaled target, but raw features are passed here and the raw model
    # output is written out. The fitted scalers are not stored with the model,
    # so this cannot be fixed inside this function alone: persist them next to
    # the model, then apply transform() to the features and
    # inverse_transform() to the predictions.
    df_predict['DATA_VALUE'] = model.predict(df_predict[required_features].values)

    # Save the predictions in submission format.
    save_submission(df_predict, PREDICTION_FILE_PATH)

In [118]:
# Run the prediction pipeline; it writes '../data/predictions/prediction_boosting__MAX.csv'.
generate_predictions_from_simple_random_forest_model()