In [None]:
# dataset_builder.py  ─────────────────────────────────────────────────────────
# Notebook-only shell magic (a SyntaxError under plain Python); install the
# package from a terminal, or run this line in its own notebook cell:
# !pip install ta
import json
import os
import time
from datetime import datetime, timedelta, timezone

import ccxt
import numpy as np
import pandas as pd
import requests
import ta
from ta.momentum import RSIIndicator, StochasticOscillator, WilliamsRIndicator
from ta.trend import ADXIndicator, CCIIndicator, EMAIndicator
from ta.volatility import AverageTrueRange, BollingerBands
from ta.volume import MFIIndicator, OnBalanceVolumeIndicator, VolumeWeightedAveragePrice
from textblob import TextBlob
from tqdm import tqdm

# Market to export and the timeframes to build, keyed by a short tag.
SYMBOL = "BTC/USDT"
TIMEFRAMES = {
    "short": "15m",
    "medium": "1h",
    "long": "4h",
}

# CryptoPanic API token, read from the CP_KEY environment variable.
API_KEY_CP = os.getenv("CP_KEY")

# Length, in days, of each download window.
WINDOW_DAYS = 180

# Candle limit passed to each OHLCV request, keyed by the same tags.
LIMITS = {
    "short": 2000,
    "medium": 2000,
    "long": 1500,
}

# Output directory for the generated CSV files; created if missing.
DATA_DIR = "csv_data"
os.makedirs(DATA_DIR, exist_ok=True)

# Bybit client with ccxt's built-in rate limiter switched on.
ex = ccxt.bybit({"enableRateLimit": True})

# ───────────────────────────────── news sentiment hourly──────────────────────
def fetch_news(start_ts: int, end_ts: int) -> pd.DataFrame:
    """Fetch CryptoPanic BTC posts and average their TextBlob polarity per hour.

    Args:
        start_ts: Range start as a Unix timestamp in seconds (sent as ``from``).
        end_ts: Range end as a Unix timestamp in seconds (sent as ``to``).

    Returns:
        A frame with ``timestamp`` (epoch milliseconds floored to the hour)
        and ``sentiment`` (mean polarity, in [-1, 1], of the posts published
        in that hour). When the response holds no usable posts the frame is
        empty but still has both columns.
    """

    def _utc_iso(seconds: int) -> str:
        # Naive-UTC ISO string, identical to what utcfromtimestamp() produced,
        # without relying on that deprecated call.
        moment = datetime.fromtimestamp(seconds, tz=timezone.utc)
        return moment.replace(tzinfo=None).isoformat()

    # Let requests build and escape the query string instead of formatting
    # the token and dates into the URL by hand.
    params = {
        "auth_token": API_KEY_CP,
        "currencies": "BTC",
        "public": "true",
        "regions": "en",
        "filter": "rising",
        "from": _utc_iso(start_ts),
        "to": _utc_iso(end_ts),
    }
    res = requests.get(
        "https://cryptopanic.com/api/v1/posts/", params=params, timeout=10
    ).json()

    # NOTE(review): only the "results" of this single response are read; if
    # the API paginates, later pages are never requested - confirm.
    sentiments = []
    for post in res.get("results") or []:
        published = post.get("published_at")
        if not published:
            continue  # cannot bucket a post without a publication time
        # pandas accepts a trailing "Z" (datetime.fromisoformat rejects it
        # before Python 3.11); utc=True also treats naive strings as UTC
        # rather than local time.
        seconds = int(pd.to_datetime(published, utc=True).timestamp())
        # A key that is present but null comes back as None, so "or" is
        # needed; a .get() default alone would not cover that case.
        text = f'{post.get("title") or ""} {post.get("body") or ""}'
        pol = TextBlob(text).sentiment.polarity  # [-1; 1]
        sentiments.append({"timestamp": seconds * 1000, "pol": pol})

    if not sentiments:
        return pd.DataFrame(columns=["timestamp", "sentiment"])

    df = pd.DataFrame(sentiments)
    # Floor to the hour (in milliseconds) and average within each hour.
    df["timestamp"] = (df["timestamp"] // 3_600_000) * 3_600_000
    return df.groupby("timestamp")["pol"].mean().rename("sentiment").reset_index()

# ──────────────────────────────── prices + indicators ───────────────────────
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add return, indicator, lag, rolling and calendar columns to ``df``.

    The frame is modified in place and also returned. It must provide
    ``close``, ``high``, ``low`` and ``volume`` columns plus a datetime-like
    ``timestamp`` column (its ``.dt`` accessor is used). Shifted and rolling
    columns are NaN for their leading rows.
    """
    close, high, low, volume = df["close"], df["high"], df["low"], df["volume"]
    stamp = df["timestamp"].dt

    # Indicators that are reused below are computed once into locals.
    bands = BollingerBands(close, 20, 2)
    band_mid = bands.bollinger_mavg()
    band_up = bands.bollinger_hband()
    band_low = bands.bollinger_lband()
    band_width = (band_up - band_low) / band_mid

    rsi = RSIIndicator(close, 14).rsi()
    adx = ADXIndicator(high, low, close, 14).adx()
    stochastic = StochasticOscillator(high, low, close, 14, 3)
    vwap_indicator = VolumeWeightedAveragePrice(high, low, close, volume, 14)

    # Insertion order here is the column order of the resulting frame.
    columns = {
        # simple and log returns
        "return": close.pct_change(),
        "log_ret": np.log(close / close.shift(1)),
        # moving averages
        "EMA_20": EMAIndicator(close, 20).ema_indicator(),
        "EMA_50": EMAIndicator(close, 50).ema_indicator(),
        "MA_20": close.rolling(20).mean(),
        "MA_50": close.rolling(50).mean(),
        # volatility
        "ATR": AverageTrueRange(high, low, close, 14).average_true_range(),
        "BB_mid": band_mid,
        "BB_up": band_up,
        "BB_low": band_low,
        "BB_width": band_width,
        # momentum / trend oscillators
        "RSI": rsi,
        "ADX": adx,
        "CCI": CCIIndicator(high, low, close, 20).cci(),
        "WillR": WilliamsRIndicator(high, low, close, 14).williams_r(),
        "Stoch_K": stochastic.stoch(),
        "Stoch_D": stochastic.stoch_signal(),
        # volume-based indicators
        "OBV": OnBalanceVolumeIndicator(close, volume).on_balance_volume(),
        "VWAP": vwap_indicator.volume_weighted_average_price(),
        "MFI": MFIIndicator(high, low, close, volume, 14).money_flow_index(),
    }

    # lagged closes
    for lag in (1, 2, 3):
        columns[f"lag_close_{lag}"] = close.shift(lag)

    # rolling mean / standard deviation of the close
    for window in (5, 10, 20):
        rolling_close = close.rolling(window)
        columns[f"roll_mean_{window}"] = rolling_close.mean()
        columns[f"roll_std_{window}"] = rolling_close.std()

    # interaction terms
    columns["RSI_x_Vol"] = rsi * volume
    columns["ADX_x_BBW"] = adx * band_width

    # calendar features scaled by their largest value
    columns["hour"] = stamp.hour / 23
    columns["weekday"] = stamp.weekday / 6
    columns["month"] = stamp.month / 12

    for name, values in columns.items():
        df[name] = values
    return df

def fetch_window(symbol, timeframe, since, limit, retries=5, delay=1):
    """Fetch one batch of OHLCV candles from ``ex``, retrying on failure.

    Args:
        symbol: Market symbol forwarded to ``ex.fetch_ohlcv``.
        timeframe: Timeframe string forwarded to ``ex.fetch_ohlcv``.
        since: Start of the request, forwarded as ``since``.
        limit: Maximum number of candles, forwarded as ``limit``.
        retries: Total number of attempts before giving up.
        delay: Seconds to sleep between failed attempts.

    Returns:
        Whatever ``ex.fetch_ohlcv`` returns on the first successful attempt.

    Raises:
        RuntimeError: If every attempt fails. The last underlying exception
            is attached as ``__cause__`` so the real reason is not lost.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return ex.fetch_ohlcv(symbol, timeframe, since=since, limit=limit)
        except Exception as err:  # any failure counts as a failed attempt
            last_error = err
            # No point sleeping once the final attempt has already failed.
            if attempt + 1 < retries:
                time.sleep(delay)
    raise RuntimeError("fetch failed") from last_error

# Timezone-aware UTC bounds: .timestamp() on a naive datetime is interpreted
# in the machine's local zone, which would shift every request by the local
# UTC offset.
START, END = datetime(2018, 1, 1, tzinfo=timezone.utc), datetime.now(timezone.utc)

# The news range does not depend on the timeframe, so download it only once.
news_df = fetch_news(int(START.timestamp()), int(END.timestamp()))
news_df["timestamp"] = pd.to_datetime(news_df["timestamp"], unit="ms")

for tag, tf in TIMEFRAMES.items():
    print(f"▸ {tf}: {START.date()} → {END.date()}")
    limit = LIMITS[tag]
    cur = START
    all_df = []
    # main loop over download windows
    pbar = tqdm(total=(END - START).days // WINDOW_DAYS + 1, desc=f"{tf}")
    while cur < END:
        nxt = min(cur + timedelta(days=WINDOW_DAYS), END)
        since = int(cur.timestamp() * 1000)
        until = int(nxt.timestamp() * 1000)
        # A single request returns at most `limit` candles, which covers only
        # part of a window on the shorter timeframes. Keep requesting from
        # just after the last candle received until the window is exhausted.
        while since < until:
            raw = fetch_window(SYMBOL, tf, since, limit)
            if not raw:
                break  # nothing (more) available for this window
            df = pd.DataFrame(
                raw, columns=["timestamp", "open", "high", "low", "close", "volume"]
            )
            in_window = df[(df["timestamp"] >= since) & (df["timestamp"] < until)]
            if not in_window.empty:
                all_df.append(in_window)
            resume = int(df["timestamp"].max()) + 1
            if resume <= since:
                break  # no forward progress; avoid looping forever
            since = resume
        cur = nxt
        pbar.update(1)
    pbar.close()

    if not all_df:
        # pd.concat raises on an empty list; there is nothing to save anyway.
        print("  no candles downloaded, skipping")
        continue

    price_df = pd.concat(all_df, ignore_index=True).drop_duplicates("timestamp")
    price_df["timestamp"] = pd.to_datetime(price_df["timestamp"], unit="ms")

    # news for the same date range
    # NOTE(review): news timestamps are hourly buckets, so only bars that open
    # exactly on an hour present in news_df can match; every other bar gets
    # the 0.0 fill below - confirm this is intended.
    full = price_df.merge(news_df, on="timestamp", how="left").sort_values("timestamp")
    # Assign the result back: a chained in-place fillna on a column selection
    # is deprecated in pandas 2.x and has no effect under Copy-on-Write.
    full["sentiment"] = full["sentiment"].astype(float).fillna(0.0)

    full = add_features(full).dropna().reset_index(drop=True)
    full.to_csv(f"{DATA_DIR}/{SYMBOL.replace('/','_')}_{tf}.csv", index=False)
    print("  saved", full.shape)


SyntaxError: invalid syntax (4202375370.py, line 3)