In [None]:
import pandas as pd
import numpy as np
from solvexity.config.loader import load_config

# Project configuration; later cells read `config.indicators` from it.
config = load_config('../config/configuration.yml')

# Raw feature table; the trailing expression previews its first five rows.
df_raw = pd.read_csv(filepath_or_buffer='./data/feature_extraction.csv')
df_raw.head(n=5)

In [None]:
# Work on the last 1000 rows (file order) of a copy of the raw frame.
df = df_raw.copy().tail(1000)

# Feature columns: names of the configured lookback indicators, sorted.
x_columns = sorted(indicator.name for indicator in config.indicators.lookback)
x = df[x_columns]

# Target column: only the first configured lookafter indicator is modelled.
lookafter_names = [indicator.name for indicator in config.indicators.lookafter]
y_column = lookafter_names[0]
y = df[y_column]

print(y_column, x_columns)

# Parse the epoch-millisecond `timestamp` column and order rows chronologically
# so the date-window masks below select contiguous time ranges.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df = df.sort_values(by='timestamp')

# Walk-forward layout: every split tests on one `test_weeks`-long window and
# trains on all rows before that window (expanding training set).
# NOTE(review): `train_weeks` is not referenced anywhere in the visible
# notebook -- training always starts at the earliest row. Confirm whether a
# fixed-length rolling training window was intended.
train_weeks = 8
test_weeks = 1
n_splits = 8  # number of consecutive test windows, counted back from the newest row

# Loop-invariant bounds of the data.
latest_date = df['timestamp'].max()
train_start_date = df['timestamp'].min()
splits = []

for i in range(n_splits):
    # Split 0 is the oldest window; the last split ends at `latest_date`.
    # Windows step back in units of `test_weeks` so they stay contiguous and
    # non-overlapping (identical to the previous `7 - i` when test_weeks == 1
    # and n_splits == 8).
    test_end_date = latest_date - pd.Timedelta(weeks=test_weeks * (n_splits - 1 - i))
    test_start_date = test_end_date - pd.Timedelta(weeks=test_weeks)
    train_end_date = test_start_date

    # Half-open intervals [start, end): a row on a boundary belongs to the
    # later window, so train and test never share a row.
    # NOTE(review): because the upper bound is exclusive, rows stamped exactly
    # `latest_date` are in no train or test set. Left as-is; confirm intent.
    train_set = df[(df['timestamp'] >= train_start_date) & (df['timestamp'] < train_end_date)]
    test_set = df[(df['timestamp'] >= test_start_date) & (df['timestamp'] < test_end_date)]

    splits.append((train_set, test_set))

# Report each split's date coverage and size. Every row is printed (same
# output as before) and also collected in `split_summary`, which was
# previously created but never filled.
split_summary = []
for i, (train, test) in enumerate(splits):
    summary_row = {
        'Split': i+1,
        'Train Start': train['timestamp'].min(),
        'Train End': train['timestamp'].max(),
        'Test Start': test['timestamp'].min(),
        'Test End': test['timestamp'].max(),
        'Train Size': len(train),
        'Test Size': len(test),
    }
    split_summary.append(summary_row)
    print(summary_row)

# Cell output: summary statistics of the target series.
y.describe()

In [None]:
import matplotlib.pyplot as plt

# Distribution of the target series `y`, drawn through the explicit
# figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y, bins=30, color='blue', alpha=0.7, edgecolor='black')
ax.set_title('Histogram of Stopping Returns (y)', fontsize=16)
ax.set_xlabel('Stopping Returns', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
import decimal
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_pinball_loss
from sklearn.preprocessing import PolynomialFeatures
# from sklearn.ensemble import GradientBoostingRegressor
from sklearn.decomposition import PCA



class AddConstant(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds an all-ones column as the FIRST feature.

    The constant lets a regressor created with ``fit_intercept=False`` learn
    its intercept as an ordinary coefficient. The column is always placed at
    position 0, for both DataFrame and array input, so coefficients line up
    regardless of the container type and positional access such as
    ``X[:, 0]`` reliably selects the constant.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; present only to satisfy the estimator API.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with a leading column of ones.

        Parameters
        ----------
        X : pandas.DataFrame or array-like with two dimensions
            Feature matrix. The input object is not modified.

        Returns
        -------
        pandas.DataFrame or numpy.ndarray
            A DataFrame (constant column named ``'const'``) when ``X`` is a
            DataFrame, otherwise an ndarray with one extra leading column.
        """
        if isinstance(X, pd.DataFrame):
            out = X.copy()
            # A pre-existing 'const' column is replaced rather than
            # duplicated; DataFrame.insert would raise on a duplicate name.
            if 'const' in out.columns:
                out = out.drop(columns='const')
            out.insert(0, 'const', 1.0)
            return out

        # Accept any array-like (e.g. nested lists), not just ndarrays.
        X = np.asarray(X)
        const_column = np.ones((X.shape[0], 1))
        return np.hstack((const_column, X))

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X)``."""
        self.fit(X, y)
        return self.transform(X)

# Target quantile; kept as a Decimal as well as a float.
q_decimal = decimal.Decimal('0.95')
q = float(q_decimal)

# Preprocessing stages, applied in this order inside the CV loop:
# mean imputation -> standardisation -> pairwise interaction terms -> PCA
# -> explicit constant column.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
pca = PCA(n_components=0.95)
add_constant = AddConstant()

# Candidate models keyed by name; the value is QuantileRegressor's `alpha`
# (regularisation strength). 'Baseline' shares the unpenalised settings and is
# fitted on the constant column only by the CV loop.
# NOTE(review): with fit_intercept=False the constant column's coefficient is
# presumably penalised like any other weight when alpha > 0 -- confirm this is
# acceptable for the regularised variants.
l1_penalties = {
    'Baseline': 0.0,
    'QuantileRegressor_alpha_0': 0.0,
    'QuantileRegressor_alpha_0_05': 0.05,
    'QuantileRegressor_alpha_0_50': 0.50,
    'QuantileRegressor_alpha_1_00': 1.00,
}
models = {
    name: QuantileRegressor(quantile=q, alpha=penalty, fit_intercept=False)
    for name, penalty in l1_penalties.items()
}
# Disabled candidate, kept for reference:
# 'GrandientBoostingRegressor': GradientBoostingRegressor(loss='quantile', alpha=q, n_estimators=100, learning_rate=0.01),

# One list of per-split test losses for every model.
cross_validation_results = {name: [] for name in models}

# Walk-forward cross-validation: for every split, refit the preprocessing
# chain on the training window only, apply it to the test window, then fit
# and score each candidate model.
for i, (train, test) in enumerate(splits):
    print(f'Split {i + 1}, Train Size: {len(train)}, Test Size: {len(test)}')

    # A window with no rows cannot be fitted or scored; skip it instead of
    # letting the first transformer raise on an empty array.
    if train.empty or test.empty:
        print(f'Split {i + 1} skipped: empty train or test window')
        continue

    # Every transformer is fitted on train and only applied to test, so no
    # test-window statistics leak into preprocessing.
    x_train_imputed = imputer.fit_transform(train[x_columns])
    x_test_imputed = imputer.transform(test[x_columns])

    x_train_scaled = scaler.fit_transform(x_train_imputed)
    x_test_scaled = scaler.transform(x_test_imputed)

    x_train_interaction = poly.fit_transform(x_train_scaled)
    x_test_interaction = poly.transform(x_test_scaled)

    x_train_pca = pca.fit_transform(x_train_interaction)
    x_test_pca = pca.transform(x_test_interaction)

    # For ndarray input AddConstant puts the ones column at index 0.
    x_train_pca_ext = add_constant.fit_transform(x_train_pca)
    x_test_pca_ext = add_constant.transform(x_test_pca)

    print(x_train_pca_ext.shape)


    y_train = train[y_column]
    y_test = test[y_column]

    for model_name, model in models.items():
        if model_name == 'Baseline':
            # Intercept-only reference model: fit on the constant column alone.
            model.fit(x_train_pca_ext[:, 0].reshape(-1, 1), y_train)
            y_pred = model.predict(x_test_pca_ext[:, 0].reshape(-1, 1))
        else:
            model.fit(x_train_pca_ext, y_train)
            y_pred = model.predict(x_test_pca_ext)
        # Score with the pinball loss at the SAME quantile the models were
        # fitted for. In mean_pinball_loss, `alpha` is the quantile level
        # (not a regularisation strength); alpha=0.0 would only penalise
        # over-prediction and rank a 0.95-quantile model incorrectly.
        loss_test = mean_pinball_loss(y_test, y_pred, alpha=q)
        cross_validation_results[model_name].append(loss_test)
        print(f'Model: {model_name}, Pinball loss: {loss_test:.4f}')
        print(f"model.coef_: {model.coef_}")
# Display the cross-validation results
cross_validation_results_df = pd.DataFrame(cross_validation_results)
cross_validation_results_df.describe()





In [None]:
from sklearn.pipeline import Pipeline
import joblib

from solvexity.analytic.agent import generate_quantile_pipeline

# Chain the preprocessing objects and one regressor into a single Pipeline.
# Nothing is refitted here: the steps carry whatever state the last
# cross-validation split left in them.
selected_model = models['QuantileRegressor_alpha_0_50']  # modify this
pipeline_steps = [
    ('imputer', imputer),
    ('scaler', scaler),
    ('poly', poly),
    ('pca', pca),
    ('add_constant', add_constant),
    ('quantile', selected_model),
]
pipeline_ = Pipeline(steps=pipeline_steps)

# Smoke test: a single row with every feature set to 1.
data = dict.fromkeys(x_columns, 1)
test_data = pd.DataFrame(data, index=[0])
pipeline_.predict(test_data)

# Cell output: coefficients of the selected regressor.
selected_model.coef_
# quantile_str = str(q_decimal).replace('.', '-')
# date_str = latest_date.strftime('%Y-%m-%d')

# pipeline = generate_quantile_pipeline(q_decimal)

# pipeline.fit(train[x_columns], train[y_column])

# joblib.dump(pipeline, f'pipeline_{quantile_str}_{date_str}.pkl')