Now we have seen how to build a model, let's use these concepts to build a predictive model on our data.

Specifically, we are going to try to predict the return of SOL in the next hour with a simple model that uses hourly volatility and close returns

In [111]:
data_location = 'sqlite:///../../../data/data.db'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

import pickle

def vol_ohlc(df, lookback=10):
    o = df.open
    h = df.high
    l = df.low
    c = df.close
    
    k = 0.34 / (1.34 + (lookback+1)/(lookback-1))
    cc = np.log(c/c.shift(1))
    ho = np.log(h/o)
    lo = np.log(l/o)
    co = np.log(c/o)
    oc = np.log(o/c.shift(1))
    oc_sq = oc**2
    cc_sq = cc**2
    rs = ho*(ho-co)+lo*(lo-co)
    close_vol = cc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    open_vol = oc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    window_rs = rs.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    result = (open_vol + k * close_vol + (1-k) * window_rs).apply(np.sqrt) * np.sqrt(252)
    result[:lookback-1] = np.nan
    
    return result

def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None
):
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
        scoring=scoring,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

ohlc = pd.read_sql('SELECT * FROM ohlc', data_location)
## Data formatting


tokens = ohlc.token.unique()

def df_merge(left, right):
    return pd.merge(left, right, on='ts', how='inner')

X = reduce(df_merge, [
    (lambda df: 
    (
        df
        .assign(
            vol=vol_ohlc(df).fillna(0),
            ret=df.close.pct_change()
        )[['ts', 'vol', 'ret']]
        .rename(columns={
            col: f'{col}_{token}' for col in ['ts', 'vol', 'ret'] if col != 'ts'
        })
    ))(ohlc[ohlc.token == token])
    for token in tokens
]).set_index('ts')

y = X.ret_SOL.shift(-1)[:-1]
X = X[:-1]

# my model here
from itertools import combinations

# feature engineering: 
# demean returns window 5:
ret = X.loc[:, [c for c in X.columns if 'ret' in c]]
vol = X.loc[:, [c for c in X.columns if 'vol' in c]]
# demean_ret = pd.concat([X.loc[:,'ret_'+str(t)].apply for t in tokens])
demean_ret = ret.apply(lambda x : x - x.rolling(5).mean())
demean_ret.columns = ['demean_'+x for x in ret.columns]
X = pd.concat([X, demean_ret], axis = 1)


# clustering: 
def get_cluster(ls):
    comb_list = list(combinations(ls, 2))
    dist_list = [np.abs(comb[0]-comb[1]) for comb in comb_list]
    return np.mean(dist_list)

X.loc[:,'ret_cluster'] = ret.apply(get_cluster, axis = 1)
X.loc[:,'vol_cluster'] = ret.apply(get_cluster, axis = 1)

    
pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.)),
    ('scale', StandardScaler()),
    # ('pca', PCA(n_components=10)),
    ('pca', PCA()),
    ('model', Ridge())
])

evaluate_model(pipeline, X, y)




test_size = 0.2
cv = TimeSeriesSplit(n_splits=int(y.shape[0] * test_size), test_size=1)
scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

search = GridSearchCV(pipeline, {
    'pca__n_components': [1, 5, 10, 20, 22],
    'model__alpha': [0.1, 0.5,  1.]
}, scoring=scorer, refit=True, cv=cv, n_jobs=-1)
search.fit(X, y)

# print(X.head(10))

search.best_params_
best_model = search.best_estimator_
evaluate_model(best_model, X, y)

-0.00856016244338903

In [113]:
best_model.get_params()

{'memory': None,
 'steps': [('impute', SimpleImputer(fill_value=0.0, strategy='constant')),
  ('scale', StandardScaler()),
  ('pca', PCA(n_components=22)),
  ('model', Ridge(alpha=0.1))],
 'verbose': False,
 'impute': SimpleImputer(fill_value=0.0, strategy='constant'),
 'scale': StandardScaler(),
 'pca': PCA(n_components=22),
 'model': Ridge(alpha=0.1),
 'impute__add_indicator': False,
 'impute__copy': True,
 'impute__fill_value': 0.0,
 'impute__missing_values': nan,
 'impute__strategy': 'constant',
 'impute__verbose': 0,
 'scale__copy': True,
 'scale__with_mean': True,
 'scale__with_std': True,
 'pca__copy': True,
 'pca__iterated_power': 'auto',
 'pca__n_components': 22,
 'pca__random_state': None,
 'pca__svd_solver': 'auto',
 'pca__tol': 0.0,
 'pca__whiten': False,
 'model__alpha': 0.1,
 'model__copy_X': True,
 'model__fit_intercept': True,
 'model__max_iter': None,
 'model__normalize': 'deprecated',
 'model__positive': False,
 'model__random_state': None,
 'model__solver': 'auto',
 

we can now take this model and build a server around it, so that other systems can also now make predictions on the hourly returns of SOL!