In [155]:
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelBinarizer, normalize
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

warnings.simplefilter(action='ignore', category=FutureWarning) #to supress import warnings

# Preprocessing Data

### Getting the data ready for Random Forest

In [137]:
# importing the preprocessed data for a quicker start

# CSV read by get_original_df().
TRAIN_FILE = '../data/tmp/export_LSMT_MAX_yd.csv'
# Semicolon-separated, header-less station table read by get_station_df().
STATIONS_FILE = '../data/ghcnd-stations.csv'

def get_original_df():
    """Read TRAIN_FILE into a DataFrame and return it without the 'Unnamed: 0' column."""
    return pd.read_csv(TRAIN_FILE).drop(columns=['Unnamed: 0'])

def get_station_df():
    """Read STATIONS_FILE (';'-separated, no header row) as station/lat/long/elev columns."""
    column_names = ['station', 'lat', 'long', 'elev']
    return pd.read_csv(STATIONS_FILE, sep=';', header=None, names=column_names)

def add_coordinates(df_src, df_stations, src_index='station', foreign_index='station'):
    """Return a copy of `df_src` with the columns of `df_stations` joined on.

    Rows are matched by comparing `df_src[src_index]` with
    `df_stations[foreign_index]`; `df_src` itself is not modified.
    """
    stations_by_key = df_stations.set_index(foreign_index)
    return df_src.copy().join(stations_by_key, on=src_index)


def add_day_of_year_column(df_src, column_name='date'):
    """Return a copy of `df_src` with a 1-based day-of-year column named 'day'.

    Parameters
    ----------
    df_src : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Column holding dates written as YYYYMMDD (integers or strings).

    Returns
    -------
    pandas.DataFrame
        Copy of `df_src` with an added integer 'day' column (1 = January 1st).

    Raises
    ------
    ValueError
        If a value cannot be parsed as a YYYYMMDD date.
    """
    df_out = df_src.copy()
    # Parse the whole column at once instead of calling strptime once per row.
    parsed = pd.to_datetime(df_out[column_name].astype(str), format='%Y%m%d')
    # Cast so the dtype does not depend on the pandas version.
    df_out['day'] = parsed.dt.dayofyear.astype('int64')
    return df_out

def date_to_nth_day(date, format='%Y%m%d'):
    """Parse the string `date` with `format` and return its 1-based day of the year."""
    parsed = datetime.strptime(date, format)
    return parsed.timetuple().tm_yday

def select_max(results):
    """Return the (k, score) pair with the highest score.

    Parameters
    ----------
    results : iterable of (k, score)
        Candidate values and their scores. Scores may be negative.

    Returns
    -------
    tuple
        The best (k, score) pair; the first one wins on ties.
        (0, 0) is returned when `results` is empty.
    """
    pairs = list(results)
    if not pairs:
        return (0, 0)
    # No 0 sentinel: with one, an all-negative score list would yield (0, 0).
    best_k, best_score = max(pairs, key=lambda pair: pair[1])
    return (best_k, best_score)

In [156]:
# Load the preprocessed export via the shared TRAIN_FILE constant (same path as before);
# the first CSV column is used as the row index.
df = pd.read_csv(TRAIN_FILE, index_col=0, low_memory=False)

In [157]:
# Shuffle the station ids reproducibly and cut the list into `fraction` consecutive parts.
seed = 93598357
np.random.seed(seed)
stations = df.station.unique()
np.random.shuffle(stations)  # in place: `stations` itself is shuffled
stations_shuffled = stations
fraction = 4

# Slice boundaries at 1/4, 2/4, 3/4 and 4/4 of the station list.
bounds = [int(np.round(len(stations) / fraction * part)) for part in range(1, 5)]
stations_train = stations_shuffled[:bounds[0]]
stations_holdout14 = stations_shuffled[bounds[0]:bounds[1]]
stations_holdout15 = stations_shuffled[bounds[1]:bounds[2]]
stations_holdout16 = stations_shuffled[bounds[2]:bounds[3]]

# The per-station filters (df[df['station'].isin(...)]) are disabled, so all four
# names refer to the complete frame.
df_17 = df_14 = df_15 = df_16 = df
df.head()

Unnamed: 0,station,TMIN,lat,long,elev,year,day
0,AE000041196,128,25.333,55.517,34.0,2014,1
1,AE000041196,145,25.333,55.517,34.0,2014,2
2,AE000041196,140,25.333,55.517,34.0,2014,3
3,AE000041196,162,25.333,55.517,34.0,2014,6
4,AE000041196,115,25.333,55.517,34.0,2014,9


In [158]:
# One row per station with its mean lat/long/elev. The string 'mean' replaces np.mean,
# which recent pandas deprecates as an aggfunc; the result is the same.
df_pivot = df.pivot_table(index=['station'], values=['lat', 'long', 'elev'], aggfunc='mean')

In [66]:
# Row counts of the four per-year frames.
print(*map(len, (df_17, df_14, df_15, df_16)))

18683824 18683824 18683824 18683824


In [89]:
# Keep only the rows whose day-of-year lies in the start-of-year window.
# NOTE(review): 'day' looks 1-based (date_to_nth_day returns 1 for January 1st), so
# range(90) matches days 1-89 only; confirm whether range(1, 91) was intended.
testing_days = list(range(90))

in_window = df['day'].isin(testing_days)
df_train = df[in_window]
print(df_train.shape)

(4591256, 7)


In [90]:
# Build one train/test pair per held-out year (2017, 2014, 2015, 2016), each restricted
# to the start-of-year window. The listed years form the training set; every other
# year present in the frame forms the test set.
# NOTE(review): 'day' looks 1-based, so range(90) matches days 1-89 only; confirm
# whether range(1, 91) was intended.

def split_by_years(frame, training_years, days):
    """Split `frame` into (train, test) rows limited to the day-of-year values in `days`.

    Rows whose 'year' is in `training_years` go to the training part; rows of any
    other year go to the test part. Row order and index labels are preserved.
    """
    in_window = frame['day'].isin(days)
    in_training_years = frame['year'].isin(training_years)
    return frame[in_training_years & in_window], frame[~in_training_years & in_window]


testing_days = list(range(90))

training_years = [2014,2015,2016]
df_train17, df_test17 = split_by_years(df_17, training_years, testing_days)
print(df_train17.shape,df_test17.shape)

training_years = [2017,2015,2016]
df_train14, df_test14 = split_by_years(df_14, training_years, testing_days)
print(df_train14.shape,df_test14.shape)

training_years = [2017,2014,2016]
df_train15, df_test15 = split_by_years(df_15, training_years, testing_days)
print(df_train15.shape,df_test15.shape)

training_years = [2017,2015,2014]
df_train16, df_test16 = split_by_years(df_16, training_years, testing_days)
print(df_train16.shape,df_test16.shape)


(3481227, 7) (1110029, 7)
(3412382, 7) (1178874, 7)
(3428517, 7) (1162739, 7)
(3451642, 7) (1139614, 7)


In [167]:
# Scan cluster counts over the per-station features and record each silhouette score.
results = []
# Numeric station features only: the station id is an identifier, not a clustering
# feature, and KMeans cannot fit on a string column.
df_red = df_pivot.reset_index(drop=True)
for i in range(2,1000,100):
    kmeans = KMeans(n_clusters=i, random_state=1).fit(df_red)
    # Score the labels against the same rows the clustering was fitted on.
    score = silhouette_score(df_red, kmeans.labels_)
    print('[' + str(i) + '] :: ', score)
    results.append((i, score))

[2] ::  0.4595332552040136
[102] ::  0.3191301768579204
[202] ::  0.32503241190549376
[302] ::  0.32339583698769286
[402] ::  0.32549496925798277
[502] ::  0.32431625211802895
[602] ::  0.32379820298246115
[702] ::  0.3251032686035695
[802] ::  0.3260094183841668
[902] ::  0.32167168246097916


In [91]:
# CV folds as [train_index, test_index] pairs, one per held-out year (2017, 2016, 2015, 2014).
# NOTE(review): these are index *labels*, while scikit-learn's `cv` iterable expects
# positional indices; this presumably relies on df having a 0..n-1 index. Confirm.
split = [
    [train_part.index.values, test_part.index.values]
    for train_part, test_part in (
        (df_train17, df_test17),
        (df_train16, df_test16),
        (df_train15, df_test15),
        (df_train14, df_test14),
    )
]

In [99]:
# Separate the target (TMIN) from the features, both for the training window
# (df_train) and for the complete frame used in cross-validation (df).
df_X_raw = df_train.drop(columns='TMIN')
df_X_raw_cv = df.drop(columns='TMIN')
sy = df_train['TMIN']
sy_cv = df['TMIN']
# Series.reshape no longer exists in pandas; reshape the underlying NumPy array
# into a single-column 2-D array instead.
y_raw = sy.values.reshape(-1,1)
y_raw_cv = sy_cv.values.reshape(-1,1)

In [100]:
# Display the (rows, columns) of the training-window feature frame.
df_X_raw.shape

(4591256, 6)

In [101]:
# Drop the station identifier from both feature frames instead of encoding it
# (a LabelBinarizer encoding of 'station' was tried and left disabled).
df_X_red, df_X_red_cv = (frame.drop(columns='station') for frame in (df_X_raw, df_X_raw_cv))

In [102]:
#X_dict = df_X.to_dict('records')
#vec = DictVectorizer()
#X = vec.fit_transform(X_dict).toarray()
#X_dummies = pd.get_dummies(df_X)
#X = X_dummies.to_dict('records')

In [103]:
# Scale features and target to the [0, 1] range.
X_raw = df_X_red.values
X_raw_cv = df_X_red_cv.values
y_raw = y_raw.astype('float32')
y_raw_cv = y_raw_cv.astype('float32')

# One scaler per array. Refitting a single instance four times keeps only the last
# fit, which makes it impossible to transform new inputs or invert predictions later.
x_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))
x_scaler_cv = MinMaxScaler(feature_range=(0, 1))
y_scaler_cv = MinMaxScaler(feature_range=(0, 1))

X = x_scaler.fit_transform(X_raw)
X_cv = x_scaler_cv.fit_transform(X_raw_cv)
y = y_scaler.fit_transform(y_raw).ravel()
y_cv = y_scaler_cv.fit_transform(y_raw_cv).ravel()

# Backward compatibility: `scaler` previously ended up fitted on y_raw_cv.
scaler = y_scaler_cv

In [104]:
# NOTE(review): bare dict literal that is displayed but never assigned or used in the
# cells below; presumably hyper-parameters kept from an earlier search. Confirm or remove.
{'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 3, 'max_depth': 10, 'bootstrap': True}

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 3,
 'min_samples_leaf': 10,
 'min_samples_split': 10,
 'n_estimators': 100}

In [107]:
# Fit the gradient boosting regressor on the scaled training window.
gbr_params = dict(
    n_estimators=125,
    learning_rate=0.2,
    max_depth=5,
    max_features=3,
    min_samples_split=10,
    min_samples_leaf=10,
    random_state=0,
    verbose=2,
)
model = GradientBoostingRegressor(**gbr_params)
model.fit(X, y)

# Score on the very data the model was fitted on (not a held-out estimate).
print(model.score(X, y))

      Iter       Train Loss   Remaining Time 
         1           0.0023            5.46m
         2           0.0019            4.14m
         3           0.0017            3.60m
         4           0.0016            3.54m
         5           0.0015            3.46m
         6           0.0014            3.99m
         7           0.0013            3.74m
         8           0.0012            3.58m
         9           0.0012            3.45m
        10           0.0011            3.32m
        11           0.0011            3.42m
        12           0.0010            3.29m
        13           0.0010            3.50m
        14           0.0010            3.45m
        15           0.0009            3.38m
        16           0.0009            3.61m
        17           0.0009            3.84m
        18           0.0009            3.72m
        19           0.0008            3.66m
        20           0.0008            3.74m
        21           0.0008            3.66m
        2

In [171]:
# Grid search over the number of boosting stages, using the year-based folds in `split`.
param_test1 = {'n_estimators':range(100,201,25)}
gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.2, 
                                                               min_samples_split=10,
                                                               min_samples_leaf=10,
                                                               max_depth=5,
                                                               max_features=3, 
                                                               random_state=10), 
                        param_grid = param_test1, n_jobs=-1, cv=split, verbose=2)

gsearch1.fit(X_cv,y_cv)

# `grid_scores_` was removed from scikit-learn; `cv_results_` holds the per-candidate
# mean and standard deviation of the test score.
for params, mean_score, std_score in zip(gsearch1.cv_results_['params'],
                                         gsearch1.cv_results_['mean_test_score'],
                                         gsearch1.cv_results_['std_test_score']):
    print(params, mean_score, std_score)
print(gsearch1.best_score_)
print(gsearch1.best_params_)

gsearch1.best_params_

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
      Iter       Train Loss   Remaining Time 
[CV] n_estimators=100 ................................................
      Iter       Train Loss   Remaining Time 
         1           0.0000           12.58m
      Iter       Train Loss   Remaining Time 
         1           0.0000           20.32m
      Iter       Train Loss   Remaining Time 
         2           0.0000           16.31m
         1           0.0000           20.41m
         2           0.0000           18.66m
         1           0.0000           18.58m
         2           0.0000           18.77m
         3           0.0000           17.48m
         2           0.0000           17.60m
         3           0.0000           19.00m
         3           0

        43           0.0000            9.64m
        44           0.0000            9.55m
        44           0.0000            9.50m
        44           0.0000            9.28m
        44           0.0000            9.46m
        45           0.0000            9.33m
        45           0.0000            9.28m
        45           0.0000            9.22m
        45           0.0000            9.08m
        46           0.0000            9.12m
        46           0.0000            9.08m
        46           0.0000            9.02m
        46           0.0000            8.88m
        47           0.0000            8.92m
        47           0.0000            8.91m
        47           0.0000            8.68m
        47           0.0000            8.84m
        48           0.0000            8.91m
        48           0.0000            9.01m
        48           0.0000            8.80m
        48           0.0000            9.03m
        49           0.0000            9.00m
        49

        89           0.0000            1.81m
        90           0.0000            1.62m
        89           0.0000            1.83m
        90           0.0000            1.64m
        90           0.0000            1.64m
        91           0.0000            1.46m
        90           0.0000            1.67m
        91           0.0000            1.48m
        91           0.0000            1.48m
        92           0.0000            1.30m
        91           0.0000            1.51m
        92           0.0000            1.32m
        92           0.0000            1.32m
        93           0.0000            1.14m
        92           0.0000            1.34m
        93           0.0000            1.16m
        93           0.0000            1.16m
        94           0.0000           58.79s
        93           0.0000            1.18m
        94           0.0000           59.79s
        94           0.0000           59.50s
        95           0.0000           48.98s
        94

        31           0.0000           14.25m
        31           0.0000           14.05m
        31           0.0000           14.38m
        31           0.0000           14.61m
        32           0.0000           13.95m
        32           0.0000           14.18m
        32           0.0000           14.28m
        32           0.0000           14.48m
        33           0.0000           13.99m
        33           0.0000           13.81m
        33           0.0000           14.20m
        33           0.0000           14.36m
        34           0.0000           14.09m
        34           0.0000           14.00m
        34           0.0000           14.35m
        34           0.0000           14.60m
        35           0.0000           14.22m
        35           0.0000           14.04m
        35           0.0000           14.45m
        35           0.0000           14.55m
        36           0.0000           14.19m
        36           0.0000           14.04m
        36

        76           0.0000            8.88m
        77           0.0000            8.56m
        77           0.0000            8.68m
        77           0.0000            8.68m
        77           0.0000            8.75m
        78           0.0000            8.43m
        78           0.0000            8.58m
        78           0.0000            8.61m
        78           0.0000            8.60m
        79           0.0000            8.28m
        79           0.0000            8.43m
        79           0.0000            8.46m
        80           0.0000            8.13m
        79           0.0000            8.49m
        80           0.0000            8.28m
        80           0.0000            8.32m
        81           0.0000            8.01m
        80           0.0000            8.36m
        81           0.0000            8.15m
        81           0.0000            8.16m
        82           0.0000            7.86m
        81           0.0000            8.22m
        82

       121           0.0000           44.26s
       122           0.0000           32.97s
       123           0.0000           21.80s
       124           0.0000           10.80s
       122           0.0000           33.20s
       123           0.0000           21.99s
       124           0.0000           10.91s
       125           0.0000            0.00s
       123           0.0000           22.13s
       124           0.0000           11.00s
       125           0.0000            0.00s
       124           0.0000           11.05s
       125           0.0000            0.00s
       125           0.0000            0.00s
[CV] ................................. n_estimators=125, total=22.8min
[CV] n_estimators=150 ................................................
[CV] ................................. n_estimators=125, total=23.0min
[CV] n_estimators=150 ................................................
      Iter       Train Loss   Remaining Time 
[CV] ................................. n

        38           0.0000           20.03m
        39           0.0000           20.42m
        40           0.0000           20.24m
        39           0.0000           20.17m
        39           0.0000           19.88m
        40           0.0000           20.26m
        41           0.0000           19.96m
        40           0.0000           19.66m
        40           0.0000           20.00m
        41           0.0000           19.99m
        42           0.0000           19.72m
        41           0.0000           19.39m
        41           0.0000           19.74m
        43           0.0000           19.49m
        42           0.0000           19.85m
        42           0.0000           19.29m
        42           0.0000           19.61m
        43           0.0000           19.58m
        44           0.0000           19.39m
        43           0.0000           19.08m
        43           0.0000           19.39m
        44           0.0000           19.39m
        45

        84           0.0000           11.98m
        85           0.0000           11.95m
        85           0.0000           11.85m
        85           0.0000           11.67m
        85           0.0000           11.84m
        86           0.0000           11.66m
        86           0.0000           11.78m
        86           0.0000           11.49m
        86           0.0000           11.63m
        87           0.0000           11.48m
        87           0.0000           11.59m
        87           0.0000           11.31m
        87           0.0000           11.44m
        88           0.0000           11.31m
        88           0.0000           11.14m
        88           0.0000           11.43m
        88           0.0000           11.27m
        89           0.0000           11.12m
        89           0.0000           10.94m
        89           0.0000           11.24m
        89           0.0000           11.08m
        90           0.0000           10.93m
        90

       130           0.0000            3.70m
       130           0.0000            3.72m
       130           0.0000            3.68m
       132           0.0000            3.26m
       131           0.0000            3.53m
       131           0.0000            3.51m
       131           0.0000            3.49m
       133           0.0000            3.08m
       132           0.0000            3.35m
       132           0.0000            3.33m
       132           0.0000            3.31m
       134           0.0000            2.90m
       133           0.0000            3.14m
       133           0.0000            3.16m
       133           0.0000            3.13m
       135           0.0000            2.71m
       134           0.0000            2.95m
       134           0.0000            2.97m
       134           0.0000            2.94m
       136           0.0000            2.53m
       135           0.0000            2.76m
       135           0.0000            2.78m
       135

        21           0.0000           26.56m
        23           0.0000           26.17m
        22           0.0000           25.70m
        22           0.0000           26.24m
        22           0.0000           26.40m
        24           0.0000           26.05m
        23           0.0000           25.60m
        23           0.0000           26.15m
        23           0.0000           26.28m
        25           0.0000           25.88m
        24           0.0000           25.59m
        24           0.0000           26.17m
        24           0.0000           26.34m
        26           0.0000           25.70m
        25           0.0000           25.35m
        25           0.0000           25.86m
        25           0.0000           25.99m
        27           0.0000           25.35m
        26           0.0000           25.11m
        26           0.0000           25.72m
        26           0.0000           25.65m
        28           0.0000           25.16m
        27

        67           0.0000           18.03m
        67           0.0000           18.11m
        68           0.0000           17.65m
        69           0.0000           17.76m
        68           0.0000           17.85m
        68           0.0000           17.93m
        70           0.0000           17.53m
        69           0.0000           17.48m
        69           0.0000           17.66m
        69           0.0000           17.72m
        71           0.0000           17.35m
        70           0.0000           17.30m
        70           0.0000           17.44m
        70           0.0000           17.50m
        71           0.0000           17.08m
        72           0.0000           17.18m
        71           0.0000           17.24m
        71           0.0000           17.30m
        72           0.0000           16.99m
        73           0.0000           17.18m
        72           0.0000           17.37m
        72           0.0000           17.41m
        73

       113           0.0000           10.08m
       114           0.0000            9.99m
       113           0.0000           10.15m
       114           0.0000            9.88m
       115           0.0000            9.87m
       114           0.0000            9.96m
       115           0.0000            9.74m
       114           0.0000           10.03m
       115           0.0000            9.81m
       116           0.0000            9.76m
       115           0.0000            9.88m
       116           0.0000            9.61m
       116           0.0000            9.69m
       117           0.0000            9.64m
       116           0.0000            9.75m
       117           0.0000            9.48m
       117           0.0000            9.58m
       118           0.0000            9.35m
       117           0.0000            9.64m
       118           0.0000            9.53m
       118           0.0000            9.44m
       119           0.0000            9.20m
       119

       161           0.0000            2.21m
       158           0.0000            2.75m
       159           0.0000            2.60m
       159           0.0000            2.58m
       162           0.0000            2.06m
       159           0.0000            2.60m
       160           0.0000            2.45m
       160           0.0000            2.43m
       163           0.0000            1.90m
       160           0.0000            2.44m
       161           0.0000            2.29m
       161           0.0000            2.27m
       164           0.0000            1.75m
       161           0.0000            2.28m
       162           0.0000            2.13m
       165           0.0000            1.59m
       162           0.0000            2.11m
       162           0.0000            2.12m
       166           0.0000            1.43m
       163           0.0000            1.97m
       163           0.0000            1.96m
       163           0.0000            1.96m
       167

        28           0.0000           43.02m
        25           0.0000           44.83m
        25           0.0000           45.64m
        25           0.0000           46.13m
        26           0.0000           44.41m
        29           0.0000           43.04m
        26           0.0000           45.27m
        26           0.0000           45.60m
        27           0.0000           43.99m
        27           0.0000           44.70m
        30           0.0000           42.71m
        27           0.0000           45.22m
        28           0.0000           43.52m
        31           0.0000           42.33m
        28           0.0000           44.44m
        28           0.0000           44.80m
        29           0.0000           43.22m
        32           0.0000           41.99m
        29           0.0000           44.06m
        29           0.0000           44.59m
        30           0.0000           42.88m
        33           0.0000           41.61m
        30

KeyboardInterrupt: 

In [None]:
# Grid search over tree depth and minimum split size, using the year-based folds in `split`.
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(10,1001,200)}
# The target is continuous, so the regressor is used (the classifier was never imported
# and cannot be fitted on a continuous target). The grid searched is param_test2.
gsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.2, n_estimators=100, 
                                                              max_features='sqrt', subsample=0.8, 
                                                              random_state=10), 
                        param_grid = param_test2, n_jobs=-1, cv=split, verbose=2)


gsearch2.fit(X_cv,y_cv)

# `grid_scores_` was removed from scikit-learn; `cv_results_` holds the per-candidate
# mean and standard deviation of the test score.
for params, mean_score, std_score in zip(gsearch2.cv_results_['params'],
                                         gsearch2.cv_results_['mean_test_score'],
                                         gsearch2.cv_results_['std_test_score']):
    print(params, mean_score, std_score)
print(gsearch2.best_score_)
print(gsearch2.best_params_)

gsearch2.best_params_

In [115]:
# Persist the fitted `model`; the returned list of written file names is the cell output.
# NOTE(review): the MinMaxScaler fitted on X / y is not saved alongside the model.
joblib.dump(model, '../models/max/boosting_.pkl')

['../models/max/boosting_.pkl']

## Predict

In [116]:
SUBMISSION_PATH = '../data/2018_test_org.csv'


def load_submission_file():
    """Read the CSV at SUBMISSION_PATH and return it as a DataFrame."""
    return pd.read_csv(SUBMISSION_PATH)

def prepare_submission_file(df_test):
    """Return `df_test` enriched with station columns and a 'day' column.

    Station data from get_station_df() is joined on the 'ID' column, then the
    day of the year is derived from the 'DATE' column.
    """
    with_coordinates = add_coordinates(
        df_test, get_station_df(), src_index='ID', foreign_index='station'
    )
    return add_day_of_year_column(with_coordinates, column_name='DATE')
    
def save_submission(df_src, PATH):
    """Write the submission CSV to `PATH` and return the written frame.

    The frame has two columns: 'SUB_ID' (the 'DATE' value as text followed by
    the 'ID' value) and 'DATA_VALUE'. The row index is not written.
    """
    submission_id = df_src['DATE'].map(str) + df_src['ID']
    df_submission = pd.DataFrame({
        'SUB_ID': submission_id,
        'DATA_VALUE': df_src['DATA_VALUE'],
    })
    df_submission.to_csv(PATH, index=False)
    return df_submission

In [117]:
def generate_predictions_from_simple_random_forest_model():
    """Predict DATA_VALUE for the submission file and write the result as CSV.

    Despite its name, this loads the gradient boosting model stored at
    '../models/max/boosting_.pkl'. The submission rows are enriched with station
    columns and the day of the year, 'year' is set to 2018, and the predictions
    are written to '../data/predictions/prediction_boosting__MAX.csv'.

    Returns
    -------
    None
    """
    model = joblib.load('../models/max/boosting_.pkl')
    # Same column order as the training matrix (df without 'TMIN' and 'station'):
    # the model was fitted on a plain array, so columns are matched by position.
    required_features = [
        'lat',
        'long',
        'elev',
        'year',
        'day',
        ]
    PREDICTION_FILE_PATH = '../data/predictions/prediction_boosting__MAX.csv'

    df_test = load_submission_file()
    df_test = prepare_submission_file(df_test)
    df_test['year'] = 2018

    # create predictions
    # NOTE(review): the model was trained on MinMax-scaled features and a MinMax-scaled
    # target, but the fitted scalers are not available here. Inputs are therefore
    # unscaled and the predictions stay in the scaled target space. TODO: persist the
    # scalers with the model, transform the inputs and inverse-transform the output.
    df_predict = df_test
    df_predict['DATA_VALUE'] = model.predict(df_test[required_features].values)

    #save predictions
    save_submission(df_predict, PREDICTION_FILE_PATH)

In [118]:
# Run the prediction pipeline; it writes the submission CSV and returns nothing.
generate_predictions_from_simple_random_forest_model()