In [5]:
# save_btc_csv.py
import os
from pathlib import Path
import pandas as pd
import numpy as np
from binance.client import Client
from dotenv import load_dotenv

# Output file written by main(); the path is resolved relative to the working directory.
CSV_PATH = "btc_prices_and_metrics.csv"
# Trading pair requested from Binance.
SYMBOL = "BTCUSDT"
# Candle size: one kline per day.
INTERVAL = Client.KLINE_INTERVAL_1DAY
START = "20 years ago UTC"  # Binance will return data from the earliest available date (2017-08-17 for BTCUSDT spot)

# Halving dates kept timezone-naive so they compare cleanly with the naive
# dates used by the RL script.
HALVING_DATES = pd.to_datetime(
    ["2012-11-28", "2016-07-09", "2020-05-11", "2024-04-20"]
)


def compute_days_since_halving(d: pd.Timestamp) -> int:
    """Return the number of whole days between `d` and the latest halving on or before it.

    Both `d` and HALVING_DATES are expected to be timezone-naive. When `d`
    precedes every listed halving, the first halving is used as the reference,
    so the result is negative.
    """
    reference = HALVING_DATES[0]
    # Walk the dates in listed order; the last one not after `d` wins.
    for halving in HALVING_DATES:
        if halving <= d:
            reference = halving
    return (d - reference).days

def main():
    """Download daily klines from Binance and write date/close/days_since_halving to CSV_PATH."""
    # Load API credentials (not strictly required for public kline data).
    load_dotenv("../../envs/.env")
    client = Client(os.getenv("copy_key"), os.getenv("copy_secret"))

    kline_columns = [
        "open_time", "open", "high", "low", "close", "volume",
        "close_time", "quote_asset_volume", "number_of_trades",
        "taker_buy_base", "taker_buy_quote", "ignore",
    ]
    raw = pd.DataFrame(
        client.get_historical_klines(SYMBOL, INTERVAL, START),
        columns=kline_columns,
    )

    # --- Keep only the columns the RL script needs ---
    # Timestamps are parsed as UTC first and then stripped of the timezone so
    # the resulting dates are "naive".
    open_time_utc = pd.to_datetime(raw["open_time"], unit="ms", utc=True)
    df = pd.DataFrame(
        {
            "date": open_time_utc.dt.tz_localize(None),
            "close": pd.to_numeric(raw["close"], errors="coerce"),
        }
    )
    df = df.sort_values("date").reset_index(drop=True)

    # Metric required by the RL script (it recomputes it too, but having it
    # in the CSV avoids validation failures).
    df["days_since_halving"] = df["date"].apply(compute_days_since_halving)

    # (Optional) Placeholders for future FEATURES:
    # for col in ["m2_growth", "fed_funds_rate", "active_addresses"]:
    #     df[col] = np.nan

    # Save the CSV
    out_path = Path(CSV_PATH).resolve()
    df.to_csv(out_path, index=False)

    print(f"✅ Guardado {len(df)} filas en: {out_path}")
    print(f"Rango de fechas: {df['date'].min().date()} → {df['date'].max().date()}")


if __name__ == "__main__":
    main()


✅ Guardado 2916 filas en: /home/bebo/Documents/Cristina01/00/btc_prices_and_metrics.csv
Rango de fechas: 2017-08-17 → 2025-08-10


In [6]:
# pip install pandas numpy gymnasium stable-baselines3==2.3.2 torch scikit-learn

import math
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Optional, Tuple

import gymnasium as gym
from gymnasium import spaces

from sklearn.preprocessing import StandardScaler
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# ---------- Config you will likely edit ----------
CSV_PATH = "btc_prices_and_metrics.csv"  # <- your file from Binance + metrics
DATE_COL = "date"    # name of the date column in the CSV
PRICE_COL = "close"  # name of the price column used for entry/exit prices

# Extra columns to include in the observation (each must exist in the CSV).
# 'days_since_halving' is always computed from the dates and added to the
# observation, so it does not need to be listed here.
FEATURE_COLS = [
    # examples (edit to match your CSV):
    # "m2_growth",
    # "fed_funds_rate",
    # "active_addresses",
    # "tx_volume_usd",
]

# Holding period, measured in rows of the price series (the environment exits
# at entry index + HOLD_DAYS). With daily rows, 120 is roughly 4 months.
HOLD_DAYS = 120

# Train/test split by date: rows strictly before TEST_START are train,
# rows on or after it are test.
TEST_START = "2023-01-01"

# -------------------------------------------------

# Bitcoin halving dates as a plain list of timezone-naive Timestamps,
# in chronological order.
HALVING_DATES = pd.to_datetime(
    ["2012-11-28", "2016-07-09", "2020-05-11", "2024-04-20"]
).to_list()


def compute_days_since_halving(d: pd.Timestamp) -> int:
    """Days elapsed since the most recent halving on or before `d`.

    If `d` is earlier than every listed halving, the distance to the first
    halving is returned instead (a negative number).
    """
    # Search from the end of the list so the first match is the last listed
    # halving that does not come after `d`.
    ref = next(
        (halving for halving in reversed(HALVING_DATES) if halving <= d),
        HALVING_DATES[0],
    )
    return (d - ref).days


@dataclass
class EpisodeResult:
    """Summary of one evaluation episode, as built by run_single_episode."""

    # True when the final step's info carried a non-None entry index.
    entered: bool
    # Row index at which the position was opened (None if never entered).
    entry_idx: Optional[int]
    # Row index at which the position was closed (None if never entered).
    exit_idx: Optional[int]
    # Fractional return of the trade, (exit_price / entry_price) - 1; 0.0 when no trade happened.
    pct_return: float


class BTCEnterOnlyEnv(gym.Env):
    """
    Single-trade "enter once, hold, exit" environment over a price series.

    Every episode starts at `start_idx` and walks forward one row per step.

    Action space:
      0 = WAIT (do nothing today)
      1 = ENTER (only if not in position and a full hold window still fits
          before `end_idx`; otherwise it is treated as WAIT).
          Once entered, the position is held for `hold_days` steps and the
          episode ends on the exit row.

    Observation:
      One row of the (optionally scaled) feature matrix for the current row:
        - the columns listed in `feature_cols`
        - days_since_halving (read from the input frame)
        - days_to_series_end (rows remaining before the end of this env's window)

    Reward:
      0 every step until the auto-exit row.
      On the exit row: (price_exit / price_entry) - 1
      If the window ends without entering: reward = 0
    """

    metadata = {"render_modes": []}

    def __init__(
        self,
        df: pd.DataFrame,
        feature_cols: List[str],
        hold_days: int = 120,
        scaler: Optional[StandardScaler] = None,
        start_idx: int = 0,
        end_idx: Optional[int] = None,
    ):
        """
        Args:
            df: Frame with at least the PRICE_COL column, a
                'days_since_halving' column and every column in `feature_cols`.
            feature_cols: Extra columns to expose in the observation.
            hold_days: Number of rows a position is held before the auto-exit.
            scaler: Already-fitted scaler applied to the feature matrix
                (None leaves the features unscaled).
            start_idx: Row index at which every episode starts.
            end_idx: Exclusive upper bound of the window (defaults to len(df)).

        Raises:
            ValueError: If `start_idx` is not a valid row index or `end_idx`
                exceeds the number of rows.
        """
        super().__init__()
        self.raw_df = df.reset_index(drop=True).copy()
        self.feature_cols = list(feature_cols)
        self.hold_days = hold_days
        self.scaler = scaler
        self.start_idx = start_idx

        n_rows = len(self.raw_df)
        self.end_idx = n_rows if end_idx is None else end_idx

        # Fail early with a clear message instead of an IndexError/KeyError
        # deep inside reset()/step().
        if not 0 <= self.start_idx < n_rows:
            raise ValueError(
                f"start_idx={self.start_idx} is outside the data (0..{n_rows - 1})."
            )
        if self.end_idx > n_rows:
            raise ValueError(
                f"end_idx={self.end_idx} exceeds the number of rows ({n_rows})."
            )

        # internal state
        self.t = None
        self.in_position = False
        self.entry_idx = None
        self.exit_idx = None
        self.entry_price = None

        # Build features matrix (unnormalized)
        if self.feature_cols:
            feats = self.raw_df.loc[:, self.feature_cols].copy()
        else:
            feats = pd.DataFrame(index=self.raw_df.index)
        # days_since_halving
        feats["days_since_halving"] = self.raw_df["days_since_halving"].astype(float)
        # days_to_series_end counts down to the end of THIS env's window
        # (end_idx), not to the end of the whole frame. A scaler fit on the
        # train slice uses a countdown to the end of that slice, so the train
        # env must do the same; otherwise its values are shifted by the length
        # of the test window and no longer match what the agent sees at test time.
        feats["days_to_series_end"] = (self.end_idx - 1 - np.arange(n_rows)).astype(float)

        self.full_feature_cols = list(feats.columns)
        self.X = feats.values.astype(np.float32)

        # Normalize with provided scaler (fit outside for train/test consistency)
        if self.scaler is not None:
            # Cast back explicitly so observations always match the declared dtype.
            self.X = np.asarray(self.scaler.transform(self.X), dtype=np.float32)

        # Observation space
        # NOTE(review): the +/-5 bounds are nominal; observations are not
        # clipped, so unscaled or extreme features can fall outside them.
        obs_dim = self.X.shape[1]
        self.observation_space = spaces.Box(low=-5.0, high=5.0, shape=(obs_dim,), dtype=np.float32)
        # Action space: WAIT or ENTER
        self.action_space = spaces.Discrete(2)

    def _get_obs(self) -> np.ndarray:
        """Return the feature row for the current time index."""
        return self.X[self.t]

    def reset(self, *, seed: Optional[int] = None, options: Optional[dict] = None):
        """Rewind to `start_idx`, clear any open position, and return (obs, info)."""
        super().reset(seed=seed)
        # Every episode starts at the same row; whether a full hold window
        # still fits is checked at entry time in step().
        self.t = self.start_idx
        self.in_position = False
        self.entry_idx = None
        self.exit_idx = None
        self.entry_price = None
        return self._get_obs(), {}

    def step(self, action: int):
        """Advance one row.

        Returns:
            (obs, reward, terminated, truncated, info). `truncated` is always
            False. On the terminating step `obs` is all zeros and `info`
            carries 'entry_idx', 'exit_idx' and 'pct_return'.
        """
        terminated = False
        reward = 0.0
        info = {}

        # Only a flat agent can act; while a position is open the action is ignored.
        if not self.in_position and action == 1:
            # Enter only if the exit row still lies inside the window;
            # otherwise the request is silently treated as WAIT.
            if self.t + self.hold_days < self.end_idx:
                self.in_position = True
                self.entry_idx = self.t
                self.exit_idx = self.t + self.hold_days
                self.entry_price = float(self.raw_df.loc[self.entry_idx, PRICE_COL])

        # advance time
        self.t += 1

        # If we just reached or passed planned exit day, finalize
        if self.in_position and self.t >= self.exit_idx:
            exit_price = float(self.raw_df.loc[self.exit_idx, PRICE_COL])
            reward = (exit_price / self.entry_price) - 1.0
            terminated = True
            info["entry_idx"] = self.entry_idx
            info["exit_idx"] = self.exit_idx
            info["pct_return"] = reward

        # If end of data and never entered, terminate with zero reward
        if self.t >= self.end_idx - 1 and not terminated:
            terminated = True
            info["entry_idx"] = self.entry_idx
            info["exit_idx"] = self.exit_idx
            info["pct_return"] = 0.0

        # The terminal observation is a zero vector of the same shape.
        obs = self._get_obs() if not terminated else np.zeros_like(self._get_obs(), dtype=np.float32)
        return obs, float(reward), terminated, False, info


def load_and_prepare(csv_path: str) -> pd.DataFrame:
    """Load the price/metrics CSV and return it sorted by date with features ready.

    Steps: parse DATE_COL as datetime, sort chronologically, coerce every
    FEATURE_COLS column to numeric and forward-fill its gaps, then (re)compute
    'days_since_halving' from the dates.

    Args:
        csv_path: Path of the CSV to read.

    Returns:
        The prepared DataFrame with a fresh 0..n-1 index.

    Raises:
        ValueError: If DATE_COL, PRICE_COL or any FEATURE_COLS column is missing.
    """
    df = pd.read_csv(csv_path)
    # Basic checks
    if DATE_COL not in df.columns or PRICE_COL not in df.columns:
        raise ValueError(f"CSV must contain '{DATE_COL}' and '{PRICE_COL}' columns.")

    df[DATE_COL] = pd.to_datetime(df[DATE_COL])
    df = df.sort_values(DATE_COL).reset_index(drop=True)

    # Coerce metrics to numeric and forward-fill missing values. This runs
    # after the chronological sort, so each gap is filled from the most recent
    # earlier observation only (no look-ahead). Gaps at the very start of a
    # column have nothing to fill from and remain NaN.
    for c in FEATURE_COLS:
        if c not in df.columns:
            raise ValueError(f"Feature column '{c}' not found in CSV.")
        df[c] = pd.to_numeric(df[c], errors="coerce").ffill()

    df["days_since_halving"] = df[DATE_COL].apply(compute_days_since_halving)
    return df


def make_scaler(train_df: pd.DataFrame) -> Tuple[StandardScaler, List[str]]:
    """Fit a StandardScaler on the training slice's feature matrix.

    The matrix columns are, in order: every FEATURE_COLS column,
    'days_since_halving', and 'days_to_series_end' (a countdown from
    len(train_df) - 1 to 0).

    Returns:
        (fitted scaler, list of column names in matrix order).
    """
    n_rows = len(train_df)

    # Collect columns by name; insertion order defines the matrix layout.
    columns = {}
    for name in FEATURE_COLS:
        columns[name] = train_df[name].astype(float).to_numpy()
    columns["days_since_halving"] = train_df["days_since_halving"].astype(float).to_numpy()
    columns["days_to_series_end"] = (n_rows - 1 - np.arange(n_rows)).astype(float)

    matrix = np.column_stack(list(columns.values())).astype(np.float32)
    fitted = StandardScaler().fit(matrix)
    return fitted, list(columns)


def date_to_index(df: pd.DataFrame, dt: str) -> int:
    """Return the position of the first row whose DATE_COL value is >= `dt`.

    `df` must already be sorted by DATE_COL. If every date is earlier than
    `dt`, the result equals len(df).
    """
    target = pd.to_datetime(dt).to_datetime64()
    dates = df[DATE_COL].values.astype("datetime64[ns]")
    # side="left" yields the leftmost insertion point, i.e. the first index >= target.
    position = np.searchsorted(dates, target, side="left")
    return int(position)


def train_and_eval():
    """Train a DQN agent on the pre-TEST_START window and report test results.

    Loads CSV_PATH, splits it at TEST_START, fits the feature scaler on the
    train slice only, trains DQN on the train window, then prints the agent's
    single test-episode return next to random-entry baselines.

    Raises:
        ValueError: If TEST_START leaves an empty train or test window.
    """
    df = load_and_prepare(CSV_PATH)

    split_idx = date_to_index(df, TEST_START)

    # An empty slice would otherwise surface as a cryptic scaler error (empty
    # train) or an IndexError on the first test reset (empty test).
    if split_idx <= 0 or split_idx >= len(df):
        raise ValueError(
            f"TEST_START={TEST_START!r} leaves an empty train or test window "
            f"(split index {split_idx} of {len(df)} rows)."
        )

    # Fit scaler on train slice only
    scaler, _ = make_scaler(df.iloc[:split_idx].reset_index(drop=True))

    # Build envs
    train_env = DummyVecEnv([
        lambda: BTCEnterOnlyEnv(
            df=df,
            feature_cols=FEATURE_COLS,
            hold_days=HOLD_DAYS,
            scaler=scaler,
            start_idx=0,
            end_idx=split_idx,  # up to but not including test period
        )
    ])

    test_env = BTCEnterOnlyEnv(
        df=df,
        feature_cols=FEATURE_COLS,
        hold_days=HOLD_DAYS,
        scaler=scaler,
        start_idx=split_idx,
        end_idx=None,
    )

    # DQN hyperparams (tune as needed)
    model = DQN(
        "MlpPolicy",
        train_env,
        learning_rate=3e-4,
        buffer_size=50_000,
        learning_starts=1_000,
        batch_size=256,
        tau=1.0,
        gamma=0.99,
        train_freq=4,
        target_update_interval=2_000,
        exploration_fraction=0.2,
        exploration_final_eps=0.05,
        verbose=1,
        tensorboard_log=None,
        seed=42,
        policy_kwargs=dict(net_arch=[256, 256]),
    )

    # Train
    timesteps = 200_000  # adjust as needed
    model.learn(total_timesteps=timesteps)

    # Evaluate on test env (single-episode sweep over the test window)
    ep_rewards, ep_returns = run_single_episode(test_env, model)
    print("\n=== Test Episode ===")
    print(f"Entry idx: {ep_returns.entry_idx}, Exit idx: {ep_returns.exit_idx}")
    print(f"Test % return (RL): {ep_returns.pct_return:.4f}")

    # Compare to random-entry baseline on the same test window
    baseline = random_entry_baseline(test_env)
    print(f"Random-entry % return (1 trial): {baseline:.4f}")

    # Optional: multiple random trials, each with its own seeded generator
    trials = [random_entry_baseline(test_env, rng=np.random.default_rng(123 + i)) for i in range(50)]
    print(f"Random-entry mean over 50 trials: {np.mean(trials):.4f} (std {np.std(trials):.4f})")


def run_single_episode(env: BTCEnterOnlyEnv, model: DQN) -> Tuple[list, EpisodeResult]:
    """Play one deterministic episode of `model` on `env`.

    Returns:
        (per-step rewards, EpisodeResult built from the final step's info).
    """
    step_rewards = []
    obs, info = env.reset()

    while True:
        action, _ = model.predict(obs, deterministic=True)
        obs, step_reward, terminated, truncated, info = env.step(int(action))
        step_rewards.append(step_reward)
        if terminated or truncated:
            break

    # The last info dict holds the trade summary (or None indices if no trade).
    entry = info.get("entry_idx")
    summary = EpisodeResult(
        entered=entry is not None,
        entry_idx=entry,
        exit_idx=info.get("exit_idx"),
        pct_return=float(info.get("pct_return", 0.0)),
    )
    return step_rewards, summary


def random_entry_baseline(env: BTCEnterOnlyEnv, rng: Optional[np.random.Generator] = None) -> float:
    """
    Baseline: pick a uniformly random row in the env's window that still
    allows a full hold period, enter there, and return the same
    (exit_price / entry_price) - 1 the environment would pay out.

    Args:
        env: Environment supplying the window (`start_idx`, `end_idx`),
            `hold_days` and the price frame (`raw_df`).
        rng: Random generator; defaults to a fresh generator seeded with 0.

    Returns:
        The fractional return of the sampled trade, or 0.0 when the window
        has no row from which a full hold period fits.
    """
    if rng is None:
        rng = np.random.default_rng(0)

    # Entry rows allowed by the env: entry + hold_days must be < end_idx.
    valid_start = env.start_idx
    valid_end = env.end_idx - env.hold_days - 1
    # Strictly less: when valid_end == valid_start exactly one entry row
    # exists, and the env would accept an entry on it.
    if valid_end < valid_start:
        return 0.0

    # Upper bound of integers() is exclusive, hence the +1.
    entry_idx = int(rng.integers(valid_start, valid_end + 1))
    exit_idx = entry_idx + env.hold_days
    entry_price = float(env.raw_df.loc[entry_idx, PRICE_COL])
    exit_price = float(env.raw_df.loc[exit_idx, PRICE_COL])
    return (exit_price / entry_price) - 1.0


# Entry point: runs the full train + evaluation pipeline when this code is
# executed as the main module.
if __name__ == "__main__":
    train_and_eval()


Using cpu device
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.989    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8524     |
|    time_elapsed     | 0        |
|    total_timesteps  | 481      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.977    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 8157     |
|    time_elapsed     | 0        |
|    total_timesteps  | 961      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.966    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 897      |
|    time_elapsed     | 1        |
|    total_timesteps  | 1442     |
| train/              |          |
|    learning_rate    | 0.0003   |
|  