In [None]:
import pandas as pd
import numpy as np
from typing import Callable
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
# from cuml.ensemble import RandomForestRegressor as cuRF

In [None]:
# --- Configuration ---
# NOTE: both constants are assigned again in a later cell (where
# TOP_N_FEATURES_TO_SELECT becomes 40); on a top-to-bottom run the later
# values are the ones the feature-selection code actually reads.
TARGET_COLUMN = 'btc_price' # Name of the column used as the regression target
TOP_N_FEATURES_TO_SELECT = 10 # Number of top-ranked features kept per model

In [None]:
def load_and_sort_csv(path):
    """Read a CSV indexed by its 'Date' column and return it sorted ascending.

    The 'Date' column is parsed as datetimes and used as the index; the index
    is then rewritten as 'YYYY-MM-DD' strings before sorting, so the returned
    frame carries a string index rather than a DatetimeIndex.

    Args:
      path: location of the CSV file (anything `pd.read_csv` accepts).

    Returns:
      DataFrame sorted by its (string) date index in ascending order.
    """
    frame = pd.read_csv(path, index_col='Date', parse_dates=['Date'])
    # ISO-formatted date strings sort lexicographically in chronological order.
    frame.index = frame.index.strftime('%Y-%m-%d')
    ordered = frame.sort_index(ascending=True)
    return ordered
# Location of the processed dataset. The '/content/drive/MyDrive' prefix looks
# like a mounted Google Drive (e.g. Colab) — TODO confirm, and adjust this
# path when running in any other environment.
path = '/content/drive/MyDrive/Main Sharing 1/TMA/Data/processed_dataset.csv'

In [None]:
# Load the dataset once; later cells read `df` to build the feature matrix and target.
df = load_and_sort_csv(path)

In [None]:
# --- 2. Feature Selection Functions (custom loop + lr scheduling) ---

def get_feature_importances_regressor(
    model,
    X,
    y,
    model_name="Regressor Model",
    num_loops: int = 1,
    init_params: dict = None,
    loop_params_fn: Callable[[int, dict], dict] = None,
    top_n: int = 10
):
    """
    Fits a regressor `num_loops` times, optionally re-parameterising it before
    each fit, and returns the feature importances of the final fit.

    NOTE: this function only calls `model.fit(X, y)` repeatedly; it passes no
    warm-start argument. Whether a later fit continues from an earlier one or
    retrains from scratch therefore depends entirely on how the estimator
    itself is configured. The estimator passed in is mutated in place
    (`set_params` and `fit`).

    Args:
      model: an estimator exposing get_params / set_params / fit and, once
             fitted, `feature_importances_`.
      X: training features. A DataFrame supplies the feature names through
         `.columns`; any other array-like gets generated names
         'feature_0', 'feature_1', ...
      y: training target. A single-column 2-D target (DataFrame or array)
         is flattened to 1-D before fitting.
      model_name: label used in printed messages and error text.
      num_loops: number of times to call .fit(); must be >= 1.
      init_params: parameters applied once before the first fit. Keys the
                   model does not expose via get_params() are ignored.
      loop_params_fn: function(loop_idx, current_params) -> new_params,
                      called before every fit. Keys the model does not
                      expose are dropped (and reported on stdout).
      top_n: how many of the highest-ranked features to print. Only affects
             the printed preview, not the returned frame.

    Returns:
      DataFrame with 'feature' and 'importance' columns covering all
      features, sorted by importance in descending order.

    Raises:
      ValueError: if num_loops is smaller than 1.
      RuntimeError: if the fitted model has no `feature_importances_`.
    """
    if num_loops < 1:
        # Without at least one fit there is nothing to read importances from.
        raise ValueError(f"num_loops must be >= 1, got {num_loops}")

    print(f"\n--- {model_name} Feature Selection (with {num_loops} loops) ---")

    # 1. Apply the initial parameters the estimator actually understands.
    if init_params:
        known_params = model.get_params()  # fetched once, not once per key
        supported_init = {k: v for k, v in init_params.items() if k in known_params}
        if supported_init:
            model.set_params(**supported_init)

    # 2. Flatten a single-column 2-D target to 1-D (pandas or NumPy).
    if getattr(y, 'ndim', 1) > 1 and y.shape[1] == 1:
        y_fit = y.iloc[:, 0] if hasattr(y, 'iloc') else np.ravel(y)
    else:
        y_fit = y

    # 3. Loop training
    for loop in range(num_loops):
        train_params = {}
        if loop_params_fn:
            current_params = model.get_params()
            # Snapshot the supported names before handing the dict to user
            # code, so a callback that mutates its argument cannot widen it.
            supported = set(current_params)
            candidate = loop_params_fn(loop, current_params)
            train_params = {k: v for k, v in candidate.items() if k in supported}
            dropped = set(candidate) - supported
            if dropped:
                print(f"    ▶️ Dropping unsupported params for {model_name}: {dropped}")
            if train_params:
                model.set_params(**train_params)

        print(f"  Loop {loop+1}/{num_loops}: training with params: {train_params}")
        model.fit(X, y_fit)

    # 4. Get importances (from the last fit only)
    try:
        importances = model.feature_importances_
    except AttributeError as err:
        raise RuntimeError(
            f"{model_name} has no attribute 'feature_importances_'"
        ) from err

    if hasattr(X, 'columns'):
        feature_names = X.columns
    else:
        feature_names = [f"feature_{i}" for i in range(len(importances))]

    feat_imp = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)

    shown = max(0, min(len(feat_imp), top_n))
    print(f"Top {shown} features for {model_name}:")
    print(feat_imp.head(shown))
    return feat_imp



In [None]:
TARGET_COLUMN = 'btc_price'
TOP_N_FEATURES_TO_SELECT = 40

# The target is the single TARGET_COLUMN series; every other column is a feature.
X, y = df.drop(columns=TARGET_COLUMN), df[TARGET_COLUMN]

# One print call, one line per fact (leading blank line kept for readability).
print(
    f"\nShape of X (features): {X.shape}",
    f"Shape of y (target): {y.shape}",
    f"Features being used: {X.columns.tolist()}",
    sep="\n",
)


Shape of X (features): (1987, 61)
Shape of y (target): (1987,)
Features being used: ['PiCycle_cbbi', 'RUPL_cbbi', 'RHODL_cbbi', 'Puell_cbbi', '2YMA_cbbi', 'Trolololo_cbbi', 'MVRV_cbbi', 'ReserveRisk_cbbi', 'Woobull_cbbi', 'Confidence_cbbi', 'CSCSI20', 'active_addresses_blockchain', 'hash_rate_blockchain', 'miner_revenue_blockchain', 'difficulty_blockchain', 'estimated_transaction_volume_usd_blockchain', 'Gold_Price', 'Gold_Share', 'Gold_Volatility', 'Oil_Crude_Price', 'Oil_Brent_Price', 'Oil_Volatility', 'DJI', 'GSPC', 'IXIC', 'NYFANG', 'CBOE_Volatility', 'EM_ETF', 'DXY', 'EURUSD', 'btc_sma_14', 'btc_ema_14', 'btc_rsi_14', 'btc_macd', 'btc_macd_signal', 'btc_macd_diff', 'btc_bb_high', 'btc_bb_low', 'btc_bb_mid', 'btc_bb_width', 'btc_atr_14', 'btc_trading_volume', 'btc_volatility_index', 'Fear Greed', 'positive_sentiment', 'negative_sentiment', 'bullish_sentiment', 'bearish_sentiment', 'risk_uncertainty_sentiment', 'problem_malicious_sentiment', 'active_trading_sentiment', 'long_term_i

In [None]:
# Define your scheduling function
def schedule(loop, params):
    """Per-loop parameter schedule: add 20 trees each loop and decay the
    learning rate by 5% once every 3 loops.

    Args:
      loop: zero-based index of the training loop about to run.
      params: the model's current parameters; must contain numeric
              'n_estimators' and 'learning_rate' entries.

    Returns:
      dict with the updated 'n_estimators' and 'learning_rate'.
    """
    # `params` already carries the rate produced by the previous loop, so one
    # 5% step is applied only when a new block of 3 loops begins. Multiplying
    # by 0.95 ** (loop // 3) on every call compounded the decay: starting from
    # 0.1 the rate fell to ~0.054 after 10 loops instead of 0.1 * 0.95**3.
    learning_rate = params['learning_rate']
    if loop > 0 and loop % 3 == 0:
        learning_rate = learning_rate * 0.95
    return {
        'n_estimators': params['n_estimators'] + 20,
        'learning_rate': learning_rate
    }

In [None]:
# --- XGBoost Feature Selection ---
# Squared-error regressor using the histogram tree method on a CUDA device.
# The n_estimators / learning_rate given here are starting values only:
# init_params below resets them before the first fit and `schedule` then
# adjusts them ahead of every loop.
xgb_regressor = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    objective='reg:squarederror',
    tree_method='hist',
    device='cuda',
    random_state=42,
)
xgb_importances = get_feature_importances_regressor(
    model=xgb_regressor,
    X=X,
    y=y,
    model_name="XGBoost Regressor",
    num_loops=10,
    init_params={'n_estimators': 50, 'learning_rate': 0.1},
    loop_params_fn=schedule,
)

# --- LightGBM Feature Selection ---
# L1 (MAE) objective; switch to 'regression' for MSE. Training runs on the GPU
# backend; gpu_platform_id / gpu_device_id are optional selectors (LightGBM
# documents gpu_platform_id as the OpenCL platform id and gpu_device_id as the
# device within that platform). verbose=-1 keeps LightGBM's logging quiet.
lgb_regressor = lgb.LGBMRegressor(
    n_estimators=100,
    objective='regression_l1',
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
    random_state=42,
    verbose=-1,
)
lgb_importances = get_feature_importances_regressor(
    model=lgb_regressor,
    X=X,
    y=y,
    model_name="LightGBM Regressor",
    num_loops=10,
    init_params={'n_estimators': 50, 'learning_rate': 0.1},
    loop_params_fn=schedule,
)

# --- Random Forest Feature Selection ---
# 100 trees on all available cores; max_features='sqrt' restricts the features
# considered at each split. No init_params or schedule are passed, so the
# forest is fitted exactly once (num_loops defaults to 1).
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
)
rf_importances = get_feature_importances_regressor(
    model=rf_regressor,
    X=X,
    y=y,
    model_name="Random Forest Regressor",
)

# --- Using Selected Features ---
# You can now take the top features from any of these models (or an intersection/union)
# to create a reduced dataset for your Deep Learning model.
def _select_top_features(importances, features, label, top_n):
    """Pick the `top_n` highest-ranked feature names and slice `features` to them.

    Args:
      importances: DataFrame with a 'feature' column, already sorted from most
                   to least important.
      features: the full feature DataFrame to take the columns from.
      label: short model label used in the printed messages.
      top_n: maximum number of features to keep.

    Returns:
      (names, reduced) — the selected feature names and the matching column
      subset of `features`. An empty `importances` yields an empty list and a
      zero-column frame, so both results are always defined.
    """
    names = importances['feature'].head(top_n).tolist()
    print(f"\nTop {top_n} features from {label} Regressor to consider for DL model: {names}")
    reduced = features[names]
    print(f"Shape of dataset with {label} selected features: {reduced.shape}")
    return names, reduced


# One call per model instead of three copy-pasted stanzas. The names are bound
# unconditionally so that later cells which combine the three lists cannot hit
# a NameError when an importance table happens to be empty.
selected_features_xgb, X_selected_xgb = _select_top_features(
    xgb_importances, X, "XGBoost", TOP_N_FEATURES_TO_SELECT
)
selected_features_lgb, X_selected_lgb = _select_top_features(
    lgb_importances, X, "LightGBM", TOP_N_FEATURES_TO_SELECT
)
selected_features_rf, X_selected_rf = _select_top_features(
    rf_importances, X, "Random Forest", TOP_N_FEATURES_TO_SELECT
)


--- XGBoost Regressor Feature Selection (with 10 loops) ---
  Loop 1/10: training with params: {'n_estimators': 70, 'learning_rate': 0.1}
  Loop 2/10: training with params: {'n_estimators': 90, 'learning_rate': 0.1}
  Loop 3/10: training with params: {'n_estimators': 110, 'learning_rate': 0.1}
  Loop 4/10: training with params: {'n_estimators': 130, 'learning_rate': 0.095}
  Loop 5/10: training with params: {'n_estimators': 150, 'learning_rate': 0.09025}
  Loop 6/10: training with params: {'n_estimators': 170, 'learning_rate': 0.0857375}
  Loop 7/10: training with params: {'n_estimators': 190, 'learning_rate': 0.07737809374999999}
  Loop 8/10: training with params: {'n_estimators': 210, 'learning_rate': 0.06983372960937499}
  Loop 9/10: training with params: {'n_estimators': 230, 'learning_rate': 0.06302494097246093}
  Loop 10/10: training with params: {'n_estimators': 250, 'learning_rate': 0.05403600876626369}
Top 10 features for XGBoost Regressor:
                 feature  importanc

In [None]:
# Features that made the top-N list of all three models. The result is built
# by filtering the XGBoost ranking instead of converting a set to a list: set
# iteration order for strings can change between interpreter runs, which made
# the feature order irreproducible. dict.fromkeys drops any repeated name
# while keeping the ranking order.
shared_lgb_rf = set(selected_features_lgb) & set(selected_features_rf)
common_features = [
    name for name in dict.fromkeys(selected_features_xgb) if name in shared_lgb_rf
]
type(common_features)

list

In [None]:
# Show the shared features, one name per line (prints nothing for an empty list).
if common_features:
    print(*common_features, sep="\n")


Trolololo_cbbi
market_narrative_sentiment
Woobull_cbbi
Oil_Crude_Price
btc_macd_signal
volume_sentiment
2YMA_cbbi
RUPL_cbbi
positive_sentiment
Fear Greed
btc_macd
btc_macd_diff
btc_rsi_14
community_social_sentiment
long_term_investment_sentiment
EM_ETF
btc_volatility_index
risk_uncertainty_sentiment
active_trading_sentiment
bearish_sentiment
btc_trading_volume
btc_bb_width
price_sentiment
CBOE_Volatility
bullish_sentiment
marketcap_sentiment
