In [None]:
import pandas as pd
import numpy as np
# Read the feature table from disk and preview its first rows.
features_path = './data/features.csv'
df_raw = pd.read_csv(features_path)
df_raw.head()

In [None]:
# Work on a copy so that df_raw itself is never modified by this cell.
df = df_raw.copy()

# Every predictor is named <statistic>_<interval>_<window>.
# NOTE(review): the interval/window tokens presumably denote the bar size and
# the lookback length -- confirm against the feature-generation code.
# The nesting order (interval, then window, then statistic) determines the
# column order of the design matrix.
stat_names = ['returns', 'volatility', 'mdd', 'skewness', 'kurtosis']
interval_tags = ['1m', '5m', '15m', '1h']
window_tags = ['30', '180']
feature_names = [
    f'{stat}_{interval}_{window}'
    for interval in interval_tags
    for window in window_tags
    for stat in stat_names
]

# Predictors, target, and the predictor labels reused by later cells.
x = df[feature_names]
y = df['stopping_returns_1m_60']
x_columns = x.columns

In [None]:
import matplotlib.pyplot as plt

# Distribution of the target y, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(y, bins=30, color='blue', alpha=0.7, edgecolor='black')
ax.set_title('Histogram of Stopping Returns (y)', fontsize=16)
ax.set_xlabel('Stopping Returns', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Split BEFORE fitting any preprocessing step. Fitting the imputer and the
# scaler on all rows would let the held-out rows influence the column means
# and standard deviations (data leakage). The partition depends only on the
# number of rows and random_state, so it selects the same rows as splitting
# an already-transformed matrix would.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Learn the imputation means and the scaling statistics from training rows only.
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()
X_train = scaler.fit_transform(imputer.fit_transform(x_train_raw))

# Apply the already-fitted transforms to the held-out rows (no refitting).
X_test = scaler.transform(imputer.transform(x_test_raw))

# Later cells expect the transformed matrix for ALL rows, in the original row
# order; build it with the train-fitted transforms so nothing is refitted.
X_imputed = imputer.transform(x)
X_scaled = scaler.transform(X_imputed)

# Check data shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
import statsmodels.api as sm

# Fit Quantile Regression for the 5th and 95th quantiles
quantiles = [0.05, 0.95]

# X_train is a bare ndarray, so statsmodels would label the coefficients
# x1, x2, ... and the summaries / coefficient plots would not show which
# feature is which. Wrapping it in a DataFrame restores the feature names;
# reusing y_train's index keeps endog and exog row labels aligned.
# Assumes X_train still has one column per entry of x_columns.
X_train_design = sm.add_constant(
    pd.DataFrame(X_train, columns=x_columns, index=y_train.index)
)

# One fitted model per quantile, keyed by the quantile level.
quantile_models = {}
for q in quantiles:
    quantile_models[q] = sm.QuantReg(y_train, X_train_design).fit(q=q)

# Extract and display the summary of the models
summary_5 = quantile_models[0.05].summary()
summary_95 = quantile_models[0.95].summary()

summary_5, summary_95

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Fitted models for the lower and upper quantile.
fit_05 = quantile_models[0.05]
fit_95 = quantile_models[0.95]

# Slope estimates and their p-values; the leading entry (the intercept) is dropped.
coef_5, pvals_5 = fit_05.params[1:], fit_05.pvalues[1:]
coef_95, pvals_95 = fit_95.params[1:], fit_95.pvalues[1:]

# Keep only the coefficients whose p-value is below 0.05.
significant_5 = coef_5[pvals_5 < 0.05]
significant_95 = coef_95[pvals_95 < 0.05]

# One horizontal bar chart per quantile: (coefficients, bar colour, title).
panels = [
    (significant_5, 'blue',
     'Significant Features in 5th Quantile Regression (Extreme Negative Returns)'),
    (significant_95, 'green',
     'Significant Features in 95th Quantile Regression (Extreme Positive Returns)'),
]
for kept, bar_color, panel_title in panels:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(kept.index, kept.values, color=bar_color, alpha=0.7)
    ax.set_xlabel('Coefficient Value')
    ax.set_title(panel_title)
    ax.grid(axis='x')
    ax.invert_yaxis()  # first coefficient at the top
    plt.show()

# Pseudo R-squared of each fit, shown as the cell output.
pseudo_r2_5 = fit_05.prsquared
pseudo_r2_95 = fit_95.prsquared

pseudo_r2_5, pseudo_r2_95

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with degree-2 interaction terms only
# (no squared terms, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interaction = poly.fit_transform(X_scaled)

# Label the expanded columns from the original feature names, then build the
# DataFrame and prepend the constant column in a single expression.
interaction_feature_names = poly.get_feature_names_out(x_columns)
X_interaction_df = sm.add_constant(
    pd.DataFrame(X_interaction, columns=interaction_feature_names)
)

In [None]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensions, keeping 95% variance.
# The components are learned from the TRAINING rows only, so the held-out rows
# do not shape the projection; all rows are then projected with that fit.
# Assumes the row labels of y are unique (true for a default RangeIndex).
pca = PCA(n_components=0.95)
train_positions = y.index.get_indexer(y_train.index)  # labels -> row positions
pca.fit(X_interaction[train_positions])
X_pca = pca.transform(X_interaction)

# Convert to DataFrame, carrying the original row labels so that rows can be
# selected by label below instead of treating labels as positions.
X_pca_df = pd.DataFrame(
    X_pca,
    columns=[f'PC{i+1}' for i in range(X_pca.shape[1])],
    index=y.index,
)

# Add constant for quantile regression
X_pca_df = sm.add_constant(X_pca_df)

# Align with y_train by label; .iloc with index labels is only correct when
# the index happens to be 0..n-1.
X_pca_train = X_pca_df.loc[y_train.index]

# Fit Quantile Regression with PCA components, one model per quantile.
pca_models = {}
for q in quantiles:
    pca_models[q] = sm.QuantReg(y_train, X_pca_train).fit(q=q)

# Extract and display the summary of the models
pca_summary_5 = pca_models[0.05].summary()
pca_summary_95 = pca_models[0.95].summary()

pca_summary_5, pca_summary_95

In [None]:
# Fitted PCA-based models for the lower and upper quantile.
pca_fit_05 = pca_models[0.05]
pca_fit_95 = pca_models[0.95]

# p-values and coefficients of the components; the leading entry (the constant) is dropped.
pvals_pca_5, pvals_pca_95 = pca_fit_05.pvalues[1:], pca_fit_95.pvalues[1:]
coef_pca_5, coef_pca_95 = pca_fit_05.params[1:], pca_fit_95.params[1:]

# Keep only the components whose p-value is below 0.05.
significant_pca_5 = coef_pca_5[pvals_pca_5 < 0.05]
significant_pca_95 = coef_pca_95[pvals_pca_95 < 0.05]

# One horizontal bar chart per quantile: (coefficients, bar colour, title).
pca_panels = [
    (significant_pca_5, 'blue',
     'Significant PCA Components in 5th Quantile Regression (Extreme Negative Returns)'),
    (significant_pca_95, 'green',
     'Significant PCA Components in 95th Quantile Regression (Extreme Positive Returns)'),
]
for kept, bar_color, panel_title in pca_panels:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(kept.index, kept.values, color=bar_color, alpha=0.7)
    ax.set_xlabel('Coefficient Value')
    ax.set_title(panel_title)
    ax.grid(axis='x')
    ax.invert_yaxis()  # first component at the top
    plt.show()