In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

# Preprocessing

In [None]:
# Number of most recent observations kept from the raw price series.
TOTAL_HISTORY_DEPTH = 350
# Length of each sliding window of past prices that forms one training sample.
HISTORY_HORIZON = 60

In [None]:
# Read the series from a header-less CSV and label its column 'price'.
# NOTE(review): assigning a one-element column list assumes the file has exactly one column.
df = pd.read_csv('data/train.csv', header=None)
df.columns = ['price']

In [None]:
# Keep only the trailing TOTAL_HISTORY_DEPTH prices and renumber them from 0.
# Note that, despite the name, the result is a Series rather than a DataFrame.
df_truncated = (
    df['price']
    .iloc[-TOTAL_HISTORY_DEPTH:]
    .reset_index(drop=True)
)

In [None]:
# Build the supervised dataset with a sliding window that advances one step at a time:
# each sample is HISTORY_HORIZON consecutive prices, and its target is the price
# immediately following that window.
n_samples = df_truncated.shape[0] - HISTORY_HORIZON
price_array = df_truncated.values
train_features_raw = [
    price_array[start:start + HISTORY_HORIZON].tolist()
    for start in range(n_samples)
]
target_values = [
    df_truncated.iloc[start + HISTORY_HORIZON]
    for start in range(n_samples)
]

In [None]:
# One row per training window; each column holds one position of the window.
train_df = pd.DataFrame(data=train_features_raw)
train_df.head()  # preview the first five rows (the default for head)

# Feature generation

In [None]:
# Extract various features from timeseries
def feature_generator(line, week_len=7, month_len=30):
    """Turn one window of prices into a flat feature vector.

    The trailing ``week_len`` values of ``line`` are treated as the "last week"
    and the trailing ``month_len`` values as the "last month".

    Parameters
    ----------
    line : list or 1-D array of numbers
        The price window to summarise. Must not be empty.
    week_len : int, optional
        Number of trailing values in the "last week" slice (default 7).
    month_len : int, optional
        Number of trailing values in the "last month" slice (default 30).

    Returns
    -------
    list
        Concatenation, in this order, of:
        * max, min, median and mean of the last week, the last month and the
          whole ``line`` (12 values);
        * the raw last-month values;
        * the consecutive differences of the last-month values;
        * each last-month value minus the last-month mean.
        When ``line`` has at least ``month_len`` values this is
        ``12 + 3 * month_len - 1`` entries (101 with the defaults).

    Raises
    ------
    ValueError
        If ``line`` is empty or a slice length is not positive.
    """
    if week_len < 1 or month_len < 1:
        # line[-0:] would silently select the whole series instead of nothing.
        raise ValueError("week_len and month_len must be positive")
    if len(line) == 0:
        raise ValueError("line must contain at least one value")

    last_week = line[-week_len:]
    last_month = line[-month_len:]

    feature_list = []
    # Max / min / median / mean over the last week, the last month and the full window
    for window in (last_week, last_month, line):
        feature_list.extend([np.max(window), np.min(window), np.median(window), np.mean(window)])

    # Raw last month values
    feature_list.extend(last_month)
    # Day-to-day diffs
    feature_list.extend(day_to_day_diff(last_month))
    # Diffs from average
    last_month_mean = np.mean(last_month)
    feature_list.extend([day_value - last_month_mean for day_value in last_month])
    return feature_list

def day_to_day_diff(line):
    """Return the differences between consecutive elements of ``line``.

    Element ``k`` of the result is ``line[k + 1] - line[k]``, so the output has
    ``len(line) - 1`` entries and is empty when ``line`` has fewer than two.
    """
    diffs = []
    for pos in range(1, len(line)):
        diffs.append(line[pos] - line[pos - 1])
    return diffs

In [None]:
# Compute one feature vector per training window (i.e. per row of train_df).
# NOTE(review): despite the name, list-valued results are expected to yield a
# Series of lists rather than a DataFrame — confirm for the pandas version in use.
features_df = train_df.apply(
    lambda window_row: feature_generator(window_row.tolist()),
    axis=1,
)

# Train model

In [None]:
# XGBoost for sure!
from sklearn.cross_validation import KFold
import xgboost as xgb

In [None]:
# Booster configuration passed to xgb.train.
# NOTE(review): "reg:linear" and "silent" are legacy XGBoost parameter names;
# check them against the installed XGBoost version.
params = {
    "objective": "reg:linear",   # regression objective
    "eta": 0.01,                 # learning rate
    "subsample": 0.9,            # fraction of rows sampled per tree
    "colsample_bytree": 0.9,     # fraction of columns sampled per tree
    "silent": 0,
    "max_depth": 5,              # maximum depth of each tree
}
# Upper bound on boosting rounds for the cross-validation runs.
num_round = 1000

In [None]:
# Targets as a Series so they can be selected with the fold index arrays.
y = pd.Series(target_values)
# Shuffled 4-fold split over the sample positions, seeded for reproducibility.
# NOTE(review): this is the legacy sklearn.cross_validation.KFold call form
# (sample count first, n_folds=...); it only works with the import used in this notebook.
# NOTE(review): samples are overlapping sliding windows, so shuffled folds may share
# prices between train and test — confirm this is acceptable for the evaluation.
kf = KFold(len(y), n_folds=4, shuffle=True, random_state=42)

In [None]:
# Cross-validate: fit on each training fold while evaluating on the held-out fold,
# stopping early after 30 rounds without improvement on the last evaluation set.
for fold_train_idx, fold_test_idx in kf:
    dtrain = xgb.DMatrix(features_df[fold_train_idx].tolist(), label=y[fold_train_idx].values)
    dtest = xgb.DMatrix(features_df[fold_test_idx].tolist(), label=y[fold_test_idx].values)
    evals = [(dtrain, 'train'), (dtest, 'test')]
    # bst is overwritten on every fold; only the printed evaluation log survives the loop.
    bst = xgb.train(params, dtrain, num_round, evals, early_stopping_rounds=30)

# Final training & save model

In [None]:
# Number of boosting rounds intended for the final fit on all samples.
FINAL_NUM_ROUND = 500

In [None]:
# Refit on every sample for a fixed number of rounds (no evaluation set, no early stopping).
# The round count must be the FINAL_NUM_ROUND constant: names are case-sensitive and
# a lowercase `final_num_round` is not defined anywhere in this notebook.
final_dtrain = xgb.DMatrix(features_df.tolist(), label=y.values)
bst = xgb.train(params, final_dtrain, FINAL_NUM_ROUND)

In [1]:
# Persist the final booster to the path configured in the project's settings module.
import settings
bst.save_model(settings.ML_MODEL_PATH)