# Assignment 8: Solana Returns Prediction Model / Optimization

## The aim of the model is to optimize Solana Returns Model:

### Import Initial Packages:

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce


### Volatility Calc.:

In [None]:
def vol_ohlc(df, lookback=10):
    o = df.open
    h = df.high
    l = df.low
    c = df.close
    
    k = 0.34 / (1.34 + (lookback+1)/(lookback-1))
    cc = np.log(c/c.shift(1))
    ho = np.log(h/o)
    lo = np.log(l/o)
    co = np.log(c/o)
    oc = np.log(o/c.shift(1))
    oc_sq = oc**2
    cc_sq = cc**2
    rs = ho*(ho-co)+lo*(lo-co)
    close_vol = cc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    open_vol = oc_sq.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    window_rs = rs.rolling(lookback).sum() * (1.0 / (lookback - 1.0))
    result = (open_vol + k * close_vol + (1-k) * window_rs).apply(np.sqrt) * np.sqrt(252)
    result[:lookback-1] = np.nan
    
    return result

### Setting the Learning Curve:

In [None]:
def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring=None
):
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        return_times=True,
        scoring=scoring,
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

### Extracting Our Data:

In [None]:
connection_string = 'sqlite:///data/data.db'

In [None]:
ohlc = pd.read_sql('SELECT * FROM ohlc' , connection_string)
ohlc['ts'] = pd.to_datetime(ohlc['ts'])


### Understanding Our Data:

In [None]:
ohlc.shape

In [None]:
ohlc.head()

In [None]:
ohlc.info()

In [None]:
ohlc.describe()

### Preprocessing and Data Engineering:

In [None]:
tokens = ohlc.token.unique()

In [None]:
tokens

In [None]:
high_low = ohlc['high'] - ohlc['low']
high_cp = np.abs(ohlc['high'] - ohlc['close'].shift())
low_cp = np.abs(ohlc['low'] - ohlc['close'].shift())

df = pd.concat([high_low, high_cp, low_cp], axis=1)
true_range = np.max(ohlc, axis=1)

#### Additing Features:

* I added 4 additional features to enrich our model:<br>
1- Return close for period = 2, this will enrich the trend and understanding about returns.<br>
2- Volume price trend to identify the parity between the supply and demand of a crypto coin.<br>
3- USD_Volume return, this will enrich the trend and understanding about USD_volume returns.<br>
4- Average True Range shows how much a crypto coin moves, on average, during a given time frame.

In [None]:
def df_merge(left, right):
    return pd.merge(left, right, on='ts', how='inner')

X = reduce(df_merge, [
    (lambda df: 
    (
        df
        .assign(
            vol=vol_ohlc(df).fillna(0),
            ret=df.close.pct_change(),
            ret_period_2 = df.close.pct_change(2).fillna(0),
            volume_price_trend = (df.close.pct_change()*df.volume).fillna(0),
            USD_vol_ret = df.volumeUSD.pct_change().fillna(0),
            average_true_range = true_range.rolling(14).mean().fillna(0)
        )[['ts', 'vol', 'ret', 'ret_period_2', 'volume_price_trend', 'USD_vol_ret', 'average_true_range']]
        .rename(columns={
            col: f'{col}_{token}' for col in ['ts', 'vol', 'ret', 'ret_period_2', 'volume_price_trend', 'USD_vol_ret', 'average_true_range'] if col != 'ts'
        })
    ))(ohlc[ohlc.token == token])
    for token in tokens
]).set_index('ts')

In [None]:
X.tail()

In [None]:
y = X.ret_SOL.shift(-1)[:-1]
X = X[:-1]

In [None]:
X.shape

In [None]:
y.shape

In [None]:
y.describe()

In [None]:
from pandas.plotting import scatter_matrix, autocorrelation_plot


In [None]:
autocorrelation_plot(y[1:])


* No strong correlation for Solana.

In [None]:
pd.isnull(X).sum()

In [None]:
{col: y.corr(X[col]) for col in X.columns if X[col].dtype != 'object'}


### Setting up our transformer and creating our model:

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVC

from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, make_scorer

from sklearn.model_selection import learning_curve

#### Custom Transformer:

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.columns]

#### Model Construction:

In [None]:
def evaluate_model(model, X, y, test_size=0.2):
    cv = TimeSeriesSplit(n_splits=int(y.shape[0] * test_size), test_size=1)
    scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)
    
    return np.mean(cross_validate(model, X, y, cv=cv, scoring=scorer, n_jobs=-1)['test_score'])

#### Model pipeline:

##### With Simple Ridge:

In [None]:
pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.)),
    ('model', Ridge(alpha=0.1))
])

evaluate_model(pipeline, X, y)

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

title = "Learning curves for ridge regression"

plot_learning_curve(
    pipeline, title, X, y, axes=axes, cv=cv, n_jobs=4, scoring=scorer
)

##### With RFR:

In [1]:
# It will take time to run.

In [None]:
pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.)),
    ('model', RandomForestRegressor(n_estimators=100, random_state=0))
])

evaluate_model(pipeline, X, y)

In [None]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0.)),
    ('scale', StandardScaler()),
    ('pca', PCA()),
    ('model', Ridge())
])

test_size = 0.2
cv = TimeSeriesSplit(n_splits=int(y.shape[0] * test_size), test_size=1)
scorer = make_scorer(mean_squared_error, greater_is_better=False, squared=False)

search = GridSearchCV(pipeline, {
    'pca__n_components': [5, 10, 20, 40, 66],
    'model__alpha' : [0.1, 0.5]
}, scoring=scorer, refit=True, cv=cv, n_jobs=-1)
search.fit(X, y)

In [None]:
search.best_params_

In [None]:
best_model = search.best_estimator_


In [None]:
evaluate_model(best_model, X, y)


In [None]:
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

title = "Learning curves for ridge regression"

plot_learning_curve(
    best_model, title, X, y, axes=axes, cv=cv, n_jobs=4, scoring=scorer
)

* We were able to beat the class average cross-validated RMSE which was -0.008575141851714435.

  The above optimized model gives an average cross-validated RMSE of -0.00841268350558673.

* However, it is worth mentioning that we are better off committing to the first model (Cell 26) rather than tuning Ridge hyperparameters. In the tuned model, it has lower average cross-validated RMSE but its learning curve is worse than that of the first model.