In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot
from datetime import datetime, timedelta

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.externals import joblib
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

### Getting the data ready for Random Forest

In [184]:
# importing the preprocessed data for a quicker start

# Preprocessed training export; read by get_original_df().
TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'
# Station metadata file; read by get_station_df().
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    """Load the preprocessed training export without its saved index column.

    Reads the CSV at ``TRAIN_FILE`` and returns it with the ``'Unnamed: 0'``
    column removed.
    """
    return pd.read_csv(TRAIN_FILE).drop(['Unnamed: 0'], axis=1)

def get_station_df():
    """Read the station metadata file into a DataFrame.

    The file at ``STATIONS_FILE`` is parsed as a header-less,
    semicolon-separated table; its columns are named station, lat, long, elev.
    """
    column_names = ['station', 'lat', 'long', 'elev']
    return pd.read_csv(STATIONS_FILE, sep=';', header=None, names=column_names)

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Attach the station metadata columns to every row of ``df_src``.

    Args:
        df_src: frame holding the station key in column ``src_index``.
        df_stations: station table holding the key in column ``foreign_index``.
        src_index: name of the key column in ``df_src``.
        foreign_index: name of the key column in ``df_stations``.

    Returns:
        A new DataFrame: ``df_src`` left-joined with the remaining columns of
        ``df_stations``. Neither input is modified.
    """
    station_lookup = df_stations.set_index(foreign_index)
    # DataFrame.join always builds a new frame, so the input stays untouched
    # without paying for an extra up-front copy of df_src.
    return df_src.join(station_lookup, on=src_index)

def add_cluster_ids(df_src, cluster_file_paths=()):
    """Left-join the cluster assignments from one or more CSV files onto ``df_src``.

    Args:
        df_src: frame whose ``'ID'`` column holds the station key.
        cluster_file_paths: iterable of CSV paths. Each file must contain a
            ``'station'`` column; its other columns are appended to the result.
            The default is an immutable empty tuple (instead of a shared
            mutable list), which yields a plain copy of ``df_src``.

    Returns:
        A new DataFrame; ``df_src`` itself is not modified.
    """
    df_out = df_src.copy()
    for path in cluster_file_paths:
        df_cluster = pd.read_csv(path)
        df_out = df_out.join(df_cluster.set_index('station'), on='ID', how='left')
    return df_out

def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of ``df_src`` with a 1-based day-of-year column named ``'day'``.

    Args:
        df_src: frame whose ``column_name`` column holds dates written as
            ``YYYYMMDD`` (integers or strings; values are converted with ``str``).
        column_name: name of the date column.

    Returns:
        A new DataFrame with the additional integer column ``'day'``
        (1 for January 1st). ``df_src`` is not modified.

    Raises:
        ValueError: if a value cannot be parsed as ``%Y%m%d``.
    """
    df_out = df_src.copy()
    # Parse the whole column at once instead of calling strptime per row.
    parsed = pd.to_datetime(df_out[column_name].astype(str), format='%Y%m%d')
    df_out['day'] = parsed.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Return the 1-based day of the year for a date string.

    Args:
        date: the date as text, e.g. ``'20180301'``.
        format: ``strptime`` pattern describing ``date``.

    Returns:
        int: 1 for January 1st, up to 365 (366 in leap years).

    Raises:
        ValueError: if ``date`` does not match ``format``. The error is no
            longer swallowed by a bare ``except``, which used to print the
            value and then fail on an unbound local variable.
    """
    parsed = datetime.strptime(date, format)
    return parsed.timetuple().tm_yday

In [185]:
# Load the preprocessed export; its first CSV column is used as the row index.
# TRAIN_FILE is the same path that was previously repeated here as a literal.
df = pd.read_csv(TRAIN_FILE, index_col=0, low_memory=False)
df.head()

Unnamed: 0,station,TMIN,lat,long,elev,year,day
0,AE000041196,128,25.333,55.517,34.0,2014,1
1,AE000041196,145,25.333,55.517,34.0,2014,2
2,AE000041196,140,25.333,55.517,34.0,2014,3
3,AE000041196,162,25.333,55.517,34.0,2014,6
4,AE000041196,115,25.333,55.517,34.0,2014,9


In [186]:
# Load the per-station cluster assignments; the first CSV column becomes the index.
df_cluster = pd.read_csv('../data/cluster/station_clustertemperature.csv', index_col=0, low_memory=False)
df_cluster.head()

Unnamed: 0_level_0,cluster_id_2,cluster_id_4,cluster_id_6
station,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AE000041196,0,3,4
AEM00041194,0,3,4
AEM00041217,0,3,4
AEM00041218,0,3,4
AG000060390,0,1,1


In [187]:
# Attach the cluster ids of the matching station to every observation.
# Cluster columns left over from an earlier run of this cell are dropped first,
# so re-running the cell no longer fails on overlapping column names.
df = df.drop(columns=df_cluster.columns, errors='ignore').join(df_cluster, on='station', how='left')

In [188]:
# Preview the frame after the cluster columns were joined.
df.head()

Unnamed: 0,station,TMIN,lat,long,elev,year,day,cluster_id_2,cluster_id_4,cluster_id_6
0,AE000041196,128,25.333,55.517,34.0,2014,1,0,3,4
1,AE000041196,145,25.333,55.517,34.0,2014,2,0,3,4
2,AE000041196,140,25.333,55.517,34.0,2014,3,0,3,4
3,AE000041196,162,25.333,55.517,34.0,2014,6,0,3,4
4,AE000041196,115,25.333,55.517,34.0,2014,9,0,3,4


In [189]:
# Shuffle the station ids reproducibly and cut them into four consecutive groups.
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)   # shuffles in place
stations_shuffled = stations  # second name for the same shuffled array
fraction = 4

# Slice boundaries: 0, then k/fraction of the station count for k = 1..fraction.
_n_stations = len(stations)
_bounds = [0] + [int(np.round(_n_stations / fraction * k)) for k in range(1, fraction + 1)]
stations_train, stations_holdout14, stations_holdout15, stations_holdout16 = (
    stations_shuffled[lo:hi] for lo, hi in zip(_bounds[:-1], _bounds[1:])
)

# The per-station filters (df[df['station'].isin(...)]) are disabled, so all four
# names refer to the same, complete frame.
df_17 = df_14 = df_15 = df_16 = df

In [190]:
# Row counts of the four frames, printed on one line.
print(*(len(frame) for frame in (df_17, df_14, df_15, df_16)))

18683824 18683824 18683824 18683824


In [191]:
# Restrict the data to the early-year window that the final models are fitted on:
# only rows whose 'day' value lies in 0..51 are kept, across all years.
testing_days = list(range(52))

df_train = df.loc[df['day'].isin(testing_days)]
print(df_train.shape)

(4591256, 10)


In [235]:
# Build one train/test pair per held-out year, restricted to the early-year window.
# For each pair, three years form the training part and every other year the test part.
testing_days = list(range(51))


def _split_by_year(frame, train_years, days):
    """Return ``(train, test)`` rows of ``frame`` limited to ``days``.

    ``train`` holds the rows whose 'year' is in ``train_years``, ``test`` the rows
    of all other years. Both keep only rows whose 'day' is in ``days``.
    """
    in_window = frame['day'].isin(days)
    in_train_years = frame['year'].isin(train_years)
    return frame[in_train_years & in_window], frame[~in_train_years & in_window]


training_years = [2014, 2015, 2016]
df_train17, df_test17 = _split_by_year(df_17, training_years, testing_days)
print(df_train17.shape, df_test17.shape)

training_years = [2017, 2015, 2016]
df_train14, df_test14 = _split_by_year(df_14, training_years, testing_days)
print(df_train14.shape, df_test14.shape)

training_years = [2017, 2014, 2016]
df_train15, df_test15 = _split_by_year(df_15, training_years, testing_days)
print(df_train15.shape, df_test15.shape)

training_years = [2017, 2015, 2014]
df_train16, df_test16 = _split_by_year(df_16, training_years, testing_days)
print(df_train16.shape, df_test16.shape)


(1952163, 10) (621748, 10)
(1914389, 10) (659522, 10)
(1921719, 10) (652192, 10)
(1933462, 10) (640449, 10)


In [236]:
# Train/test index pairs handed to the CV search later on, one pair per held-out year.
# NOTE(review): these are index *labels*; they equal positional row numbers only if
# df carries a default 0..n-1 index -- confirm before relying on the CV scores.
split = [
    [train_part.index.values, test_part.index.values]
    for train_part, test_part in (
        (df_train17, df_test17),
        (df_train16, df_test16),
        (df_train15, df_test15),
        (df_train14, df_test14),
    )
]

In [237]:
# Separate the target (TMIN) from the features; elevation and coordinates are dropped too.
df_X_raw = df_train.drop(columns=['TMIN', 'elev', 'lat', 'long'])
df_X_raw_cv = df.drop(columns=['TMIN', 'elev', 'lat', 'long'])
sy = df_train['TMIN']
sy_cv = df['TMIN']
# Series.reshape was deprecated in pandas 0.19 and is absent from current releases;
# reshaping the underlying ndarray gives the same (n, 1) column vector.
y_raw = sy.values.reshape(-1, 1)
y_raw_cv = sy_cv.values.reshape(-1, 1)

In [238]:
# Sanity check: (rows, columns) of the training features before 'station' is dropped.
df_X_raw.shape

(4591256, 6)

In [239]:
# The station column is removed from the features rather than encoded
# (a LabelBinarizer-based encoding was tried here and left disabled).
df_X_red = df_X_raw.drop('station', axis=1)
df_X_red_cv = df_X_raw_cv.drop('station', axis=1)

In [240]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [241]:
# Build the numeric feature matrices and the float32 target vectors.
X_raw = df_X_red.values
X_raw_cv = df_X_red_cv.values
y_raw = y_raw.astype('float32')
y_raw_cv = y_raw_cv.astype('float32')

# Features are used unscaled. This cell used to call scaler.fit_transform(X_raw)
# although the only definition of `scaler` was commented out, which raises a
# NameError on a fresh kernel. Scaling is dropped rather than restored because
# X_cv and the inputs passed to model.predict() in the prediction cells are both
# unscaled; a scaled training matrix would not match either of them.
X = X_raw
X_cv = X_raw_cv
y = y_raw.ravel()
y_cv = y_raw_cv.ravel()



In [242]:
# Parameter set shown for reference; it matches the arguments passed to the
# RandomForestRegressor fitted further down.
# NOTE(review): presumably pasted from an earlier search's best_params_ -- confirm.
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 10, 'bootstrap': True}

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 100}

In [31]:
# Randomized hyper-parameter search for the random forest over the predefined folds.
param_grid = {
    "n_estimators": [50, 100, 150],
    "max_depth": [7, 10, 13],
    "max_features": [2, 3, 4],
    "min_samples_split": [10],
    "min_samples_leaf": [10, 15],
    "bootstrap": [True],
}

model = RandomForestRegressor(random_state=0)
# n_iter=25 samples 25 of the 54 possible combinations. random_state pins which
# ones are drawn, so the search no longer depends on the global NumPy RNG state.
grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=25,
    cv=split,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
grid.fit(X_cv, y_cv)

print(grid.best_score_)
print(grid.best_params_)

building tree 1 of 10


KeyboardInterrupt: 

In [None]:

# Fit the random forest with fixed hyper-parameters on X, y
# (the randomized search is not run in this cell).
model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    max_features=3,
    min_samples_split=10,
    min_samples_leaf=10,
    bootstrap=True,
    random_state=0,
    verbose=2,
)
model.fit(X, y)

building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.6s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  5.5min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
           verbose=2, warm_start=False)

In [None]:
# Fit the gradient boosting model on the same X, y as the random forest.
boost = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.15,
    max_depth=5,
    max_features=3,
    min_samples_split=100,
    min_samples_leaf=100,
    random_state=0,
    verbose=2,
)
boost.fit(X, y)

# score() is evaluated on the training data itself, i.e. this is the in-sample R^2.
print(boost.score(X, y))

      Iter       Train Loss   Remaining Time 
         1       12307.3693            5.06m
         2       10356.7172            5.41m
         3        9054.7328            5.58m
         4        7977.4248            5.31m
         5        7202.6330            5.19m
         6        6637.6004            5.05m
         7        6201.0322            4.95m
         8        5892.4709            4.82m
         9        5666.6661            4.78m
        10        5503.2164            4.78m
        11        5368.4790            4.74m
        12        5276.6127            4.64m
        13        5207.5102            4.54m
        14        5155.0206            4.50m
        15        5108.7731            4.65m
        16        5077.0640            4.66m
        17        5048.4421            4.57m
        18        5032.3667            4.56m
        19        5014.2602            4.53m
        20        4997.5841            4.47m
        21        4989.0457            4.40m
        2

In [None]:
# Persist the fitted boosting model; generate_predictions_from_clustered_boosting_model() loads this file.
joblib.dump(boost, '../models/max/boosting_clustered.pkl')

In [None]:
# Persist the fitted random forest; generate_predictions_from_clustered_random_forest_model() loads this file.
joblib.dump(model, '../models/max/simple_random_forest_clustered.pkl')

## Predict

In [None]:
SUBMISSION_PATH = '../data/2018_test_org.csv'
def load_submission_file():
    """Read the CSV at ``SUBMISSION_PATH`` and return it as a DataFrame."""
    return pd.read_csv(SUBMISSION_PATH)

def prepare_submission_file(df_test):
    """Enrich the submission frame with the columns the models are fed.

    Adds, in this order: the station metadata (joined on ``'ID'``), the
    ``'day'`` column derived from ``'DATE'``, and the cluster ids read from the
    station cluster file. Progress is printed before each step.

    Returns:
        The enriched DataFrame produced by the last helper call.
    """
    cluster_files = [
        '../data/cluster/station_clustertemperature.csv'
    ]
    station_table = get_station_df()

    print('\t * Add coordinates')
    enriched = add_coordinates(df_test, station_table, src_index='ID', foreign_index='station')

    print('\t * Add day of year')
    enriched = add_day_of_year_column(enriched, column_name='DATE')

    print('\t * Add cluster ids')
    return add_cluster_ids(enriched, cluster_file_paths=cluster_files)
    
def save_submission(df_src, PATH):
    """Write the two-column submission CSV and return the frame that was written.

    ``SUB_ID`` is the text form of ``DATE`` immediately followed by ``ID``;
    ``DATA_VALUE`` is copied from ``df_src``. The file at ``PATH`` is written
    without the index column.
    """
    df_submission = pd.DataFrame().assign(
        SUB_ID=df_src['DATE'].map(str) + df_src['ID'],
        DATA_VALUE=df_src['DATA_VALUE'],
    )
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [None]:
def generate_predictions_from_clustered_random_forest_model():
    """Predict the 2018 values with the saved random forest and write the submission CSV.

    Loads the pickled model, loads and enriches the submission file, sets
    ``year`` to 2018, stores the predictions in ``DATA_VALUE`` and writes the
    result via ``save_submission``. Features are passed to the model unscaled.
    Returns nothing.
    """
    # NOTE: joblib.load unpickles the file; only load model files from a trusted source.
    model = joblib.load('../models/max/simple_random_forest_clustered.pkl')
    # The model was fitted on a bare array whose columns follow the training
    # frame's order (year, day, cluster ids). The order here must be identical:
    # with 'day' first, the model would silently read days as years and vice versa.
    required_features = [
        'year',
        'day',
        'cluster_id_2',
        'cluster_id_4',
        'cluster_id_6'
        ]
    PREDICTION_FILE_PATH = '../data/predictions/prediction_simple_random_forest_clustered_MAX.csv'

    print('* Load submission file')
    df_test = load_submission_file()
    print('* Prepare submission file')
    df_test = prepare_submission_file(df_test)
    print('* Add year column')
    df_test['year'] = 2018

    # create predictions (.values: the model was fitted on an array without column names)
    df_test['DATA_VALUE'] = model.predict(df_test[required_features].values)

    # save predictions
    save_submission(df_test, PREDICTION_FILE_PATH)

In [None]:
def generate_predictions_from_clustered_boosting_model():
    """Predict the 2018 values with the saved boosting model and write the submission CSV.

    Loads the pickled model, loads and enriches the submission file, sets
    ``year`` to 2018, stores the predictions in ``DATA_VALUE`` and writes the
    result via ``save_submission``. Features are passed to the model unscaled.
    Returns nothing.
    """
    # NOTE: joblib.load unpickles the file; only load model files from a trusted source.
    model = joblib.load('../models/max/boosting_clustered.pkl')
    # The model was fitted on a bare array whose columns follow the training
    # frame's order (year, day, cluster ids). The order here must be identical:
    # with 'day' first, the model would silently read days as years and vice versa.
    required_features = [
        'year',
        'day',
        'cluster_id_2',
        'cluster_id_4',
        'cluster_id_6'
        ]
    PREDICTION_FILE_PATH = '../data/predictions/prediction_boosting_clustered_MAX.csv'

    print('* Load submission file')
    df_test = load_submission_file()
    print('* Prepare submission file')
    df_test = prepare_submission_file(df_test)
    print('* Add year column')
    df_test['year'] = 2018

    # create predictions (.values: the model was fitted on an array without column names)
    df_test['DATA_VALUE'] = model.predict(df_test[required_features].values)

    # save predictions
    save_submission(df_test, PREDICTION_FILE_PATH)

In [None]:
# Write the submission CSV for the boosting model (requires the pickled model on disk).
generate_predictions_from_clustered_boosting_model()

In [None]:
# Write the submission CSV for the random forest model (requires the pickled model on disk).
generate_predictions_from_clustered_random_forest_model()