In [38]:
import pandas as pd
import numpy as np
from solvexity.config.loader import load_config

# Load configuration
config = load_config('../config/configuration.yml')

df_raw = pd.read_csv('./data/feature_extraction.csv')
df_raw.head()

Unnamed: 0,timestamp,returns_btcusdt_1m_30,returns_btcusdt_1m_180,returns_btcusdt_5m_30,returns_btcusdt_5m_180,returns_btcusdt_15m_30,returns_btcusdt_15m_180,returns_btcusdt_1h_30,returns_btcusdt_1h_180,volatility_btcusdt_1m_30,...,skewness_btcusdt_1h_180,kurtosis_btcusdt_1m_30,kurtosis_btcusdt_1m_180,kurtosis_btcusdt_5m_30,kurtosis_btcusdt_5m_180,kurtosis_btcusdt_15m_30,kurtosis_btcusdt_15m_180,kurtosis_btcusdt_1h_30,kurtosis_btcusdt_1h_180,stopping_returns_btcusdt_1m_60
0,1640995200000,-0.003594,0.006908,-0.000849,-0.045105,-0.034953,-0.009167,-0.034346,-0.094251,0.000889,...,-0.088512,0.35363,0.440276,-0.70919,2.951686,0.235081,3.567445,5.638696,4.554796,0.008781
1,1641009600000,-0.000532,0.003557,-0.00068,-0.025934,0.01819,-0.004404,-0.00577,-0.080778,0.000544,...,-0.091479,0.001285,1.09105,-0.404089,2.072002,0.896315,3.635509,5.296725,4.433352,-0.002933
2,1641024000000,0.00439,0.009717,-0.005941,-0.006385,0.012528,0.005728,0.004253,-0.07659,0.000573,...,-0.075415,-0.816676,7.96069,0.266713,12.36428,8.422241,3.848065,4.08091,4.516914,-0.001048
3,1641038400000,-0.004422,-0.008287,-0.006643,0.014752,0.000521,-0.013853,-0.013225,-0.083972,0.000827,...,-0.06821,1.458643,4.244894,-0.194862,16.122893,6.481903,3.890128,4.255362,4.487127,0.005319
4,1641052800000,0.004609,0.00176,0.005006,0.013661,0.000188,-0.013675,-0.016832,-0.07207,0.001699,...,-0.074824,3.323061,6.268312,7.239923,19.551865,0.672424,4.025679,2.202504,4.355777,-0.000319


In [40]:
df = df_raw.copy().tail(1000)
# Get x_columns from config
x_columns = sorted([indicator.name for indicator in config.indicators.lookback])

x = df[x_columns]
y_column = [indicator.name for indicator in config.indicators.lookafter][0]
y = df[y_column] # train on the first y column

print(y_column, x_columns)

# Convert the timestamp from milliseconds to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

# Sort by timestamp to ensure correct time-based splitting
df = df.sort_values(by='timestamp')

# Define the number of weeks for training and testing
train_weeks = 8
test_weeks = 1

# Calculate the start and end dates
latest_date = df['timestamp'].max()
splits = []


# Generate 8 splits for cross-validation
for i in range(8):
    test_end_date = latest_date - pd.Timedelta(weeks=7 - i)
    test_start_date = test_end_date - pd.Timedelta(weeks=test_weeks)
    train_end_date = test_start_date
    train_start_date = df['timestamp'].min()

    train_set = df[(df['timestamp'] >= train_start_date) & (df['timestamp'] < train_end_date)]
    test_set = df[(df['timestamp'] >= test_start_date) & (df['timestamp'] < test_end_date)]

    splits.append((train_set, test_set))

# Display summary of the splits
split_summary = []
for i, (train, test) in enumerate(splits):
    print({
        'Split': i+1,
        'Train Start': train['timestamp'].min(),
        'Train End': train['timestamp'].max(),
        'Test Start': test['timestamp'].min(),
        'Test End': test['timestamp'].max(),
        'Train Size': len(train),
        'Test Size': len(test),
    })

y.describe()

stopping_returns_btcusdt_1m_60 ['kurtosis_btcusdt_15m_180', 'kurtosis_btcusdt_15m_30', 'kurtosis_btcusdt_1h_180', 'kurtosis_btcusdt_1h_30', 'kurtosis_btcusdt_1m_180', 'kurtosis_btcusdt_1m_30', 'kurtosis_btcusdt_5m_180', 'kurtosis_btcusdt_5m_30', 'mdd_btcusdt_15m_180', 'mdd_btcusdt_15m_30', 'mdd_btcusdt_1h_180', 'mdd_btcusdt_1h_30', 'mdd_btcusdt_1m_180', 'mdd_btcusdt_1m_30', 'mdd_btcusdt_5m_180', 'mdd_btcusdt_5m_30', 'returns_btcusdt_15m_180', 'returns_btcusdt_15m_30', 'returns_btcusdt_1h_180', 'returns_btcusdt_1h_30', 'returns_btcusdt_1m_180', 'returns_btcusdt_1m_30', 'returns_btcusdt_5m_180', 'returns_btcusdt_5m_30', 'skewness_btcusdt_15m_180', 'skewness_btcusdt_15m_30', 'skewness_btcusdt_1h_180', 'skewness_btcusdt_1h_30', 'skewness_btcusdt_1m_180', 'skewness_btcusdt_1m_30', 'skewness_btcusdt_5m_180', 'skewness_btcusdt_5m_30', 'volatility_btcusdt_15m_180', 'volatility_btcusdt_15m_30', 'volatility_btcusdt_1h_180', 'volatility_btcusdt_1h_30', 'volatility_btcusdt_1m_180', 'volatility_btc

count    1000.000000
mean       -0.000262
std         0.005128
min        -0.048446
25%        -0.002660
50%        -0.000138
75%         0.002187
max         0.039978
Name: stopping_returns_btcusdt_1m_60, dtype: float64

In [24]:
import matplotlib.pyplot as plt

# Plot the histogram of y
plt.figure(figsize=(10, 6))
plt.hist(y, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histogram of Stopping Returns (y)', fontsize=16)
plt.xlabel('Stopping Returns', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

  plt.show()


In [37]:
import decimal
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_pinball_loss
from sklearn.preprocessing import PolynomialFeatures
# from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA



class AddConstant(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # No fitting required for adding a constant, but method must be implemented
        return self

    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            X['const'] = 1.0
        else:
            # If X is a NumPy array, append a constant column
            const_column = np.ones((X.shape[0], 1))
            X = np.hstack((const_column, X))
        return X

    def fit_transform(self, X, y=None):
        # Use fit and transform together
        self.fit(X, y)
        return self.transform(X)

q_decimal = decimal.Decimal('0.95')
q = float(q_decimal)

imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
pca = PCA(n_components=0.95)
add_constant = AddConstant()


models = {
    'Baseline': QuantileRegressor(quantile=q, alpha=0.0, fit_intercept=False),
    'QuantileRegressor_alpha_0': QuantileRegressor(quantile=q, alpha=0.0, fit_intercept=False),
    'QuantileRegressor_alpha_0_05': QuantileRegressor(quantile=q, alpha=0.05, fit_intercept=False),
    'QuantileRegressor_alpha_0_50': QuantileRegressor(quantile=q, alpha=0.50, fit_intercept=False),
    'QuantileRegressor_alpha_1_00': QuantileRegressor(quantile=q, alpha=1.00, fit_intercept=False),
    # 'GrandientBoostingRegressor': GradientBoostingRegressor(loss='quantile', alpha=q, n_estimators=100, learning_rate=0.01),
}

cross_validation_results = {
    model_name: [] for model_name in models.keys()
}

for i, (train, test) in enumerate(splits):
    print(f'Split {i + 1}, Train Size: {len(train)}, Test Size: {len(test)}')
    x_train_imputed = imputer.fit_transform(train[x_columns])
    x_test_imputed = imputer.transform(test[x_columns])

    x_train_scaled = scaler.fit_transform(x_train_imputed)
    x_test_scaled = scaler.transform(x_test_imputed)

    x_train_interaction = poly.fit_transform(x_train_scaled)
    x_test_interaction = poly.transform(x_test_scaled)

    x_train_pca = pca.fit_transform(x_train_interaction)
    x_test_pca = pca.transform(x_test_interaction)

    x_train_pca_ext = add_constant.fit_transform(x_train_pca)
    x_test_pca_ext = add_constant.transform(x_test_pca)

    print(x_train_pca_ext.shape)


    y_train = train[y_column]
    y_test = test[y_column]

    for model_name, model in models.items():
        if model_name == 'Baseline':
            # model.fit(x_train_pca_df[['const']], y_train)
            # y_pred = model.predict(x_test_pca_df[['const']])
            model.fit(x_train_pca_ext[:, 0].reshape(-1, 1), y_train)
            y_pred = model.predict(x_test_pca_ext[:, 0].reshape(-1, 1))
        else:
            model.fit(x_train_pca_ext, y_train)
            y_pred = model.predict(x_test_pca_ext)
        # Calculate the mean absolute error
        loss_test = mean_pinball_loss(y_test, y_pred, alpha=0.0)
        cross_validation_results[model_name].append(loss_test)
        print(f'Model: {model_name}, Pinfall loss: {loss_test:.4f}')
        print(f"model.coef_: {model.coef_}")
# Display the cross-validation results
cross_validation_results_df = pd.DataFrame(cross_validation_results)
cross_validation_results_df.describe()





Split 1, Train Size: 663, Test Size: 42
(663, 102)
Model: Baseline, Pinfall loss: 0.0085
model.coef_: [0.00764914]
Model: QuantileRegressor_alpha_0, Pinfall loss: 0.0064
model.coef_: [ 5.98332752e-03 -8.83325163e-06 -2.89542631e-06  3.14214634e-05
  1.89972734e-05  5.28926402e-05  1.00555444e-05 -1.53160416e-04
 -3.89148327e-05  5.86579402e-05  5.39783342e-05 -6.32222526e-06
  7.14156377e-05  2.66799388e-05  6.81913345e-05 -5.53607277e-05
  9.26032892e-05 -5.01486946e-05  5.41826803e-05 -2.16095884e-05
 -1.18724267e-04  4.08554586e-05  1.19534605e-04 -1.98392224e-04
 -1.10984204e-05 -1.96246414e-04  8.45993635e-06 -7.97124488e-06
 -1.38775531e-04 -1.36198588e-04  1.75210268e-05  2.31431223e-04
  1.87414595e-04 -1.19418085e-04 -6.44712105e-05 -1.34723343e-05
  1.58976255e-04 -1.44760873e-04  5.28451973e-05  2.22852841e-04
 -2.51672508e-04  3.08654387e-04  1.43831328e-04  1.92562237e-04
 -3.41204515e-05 -3.09036920e-04 -2.18698133e-04 -2.10901973e-04
 -1.36474684e-04 -7.55052087e-05  2.5

Unnamed: 0,Baseline,QuantileRegressor_alpha_0,QuantileRegressor_alpha_0_05,QuantileRegressor_alpha_0_50,QuantileRegressor_alpha_1_00
count,8.0,8.0,8.0,8.0,8.0
mean,0.007992,0.007477,0.005462,0.001946,0.001946
std,0.000865,0.003034,0.001544,0.001098,0.001098
min,0.007269,0.004425,0.004444,0.001112,0.001112
25%,0.00753,0.006266,0.004699,0.001429,0.001429
50%,0.007638,0.007037,0.004928,0.001523,0.001523
75%,0.008107,0.007206,0.00533,0.001965,0.001965
max,0.009925,0.014599,0.009149,0.004481,0.004481


In [36]:
from sklearn.pipeline import Pipeline
import joblib

from solvexity.analytic.agent import generate_quantile_pipeline

pipeline_ = Pipeline(steps=[
    ('imputer', imputer),
    ('scaler', scaler),
    ('poly', poly),
    ('pca', pca),
    ('add_constant', add_constant),
    ('quantile', models['QuantileRegressor_alpha_0_50']) # modify this
])

data = {c: 1 for c in x_columns}
test_data = pd.DataFrame(data, index=[0])
pipeline_.predict(test_data)
models['QuantileRegressor_alpha_0_50'].coef_
# quantile_str = str(q_decimal).replace('.', '-')
# date_str = latest_date.strftime('%Y-%m-%d')

# pipeline = generate_quantile_pipeline(q_decimal)

# pipeline.fit(train[x_columns], train[y_column])

# joblib.dump(pipeline, f'pipeline_{quantile_str}_{date_str}.pkl')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])