<a href="https://colab.research.google.com/github/4L3M4R/cerbero/blob/main/cerbero.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install ta yfinance vaderSentiment transformers torch feedparser

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting feedparser
  Downloading feedparser-6.0.12-py3-none-any.whl.metadata (2.7 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading feedparser-6.0.12-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.5/81.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: ta, sgmllib3k
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=7cf56496

In [None]:
# ===============================================
#               1. IMPORTS
# ===============================================

import os
import requests
import pandas as pd
import numpy as np
import yfinance as yf
import feedparser
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Initialize shared resources (sentiment analyzers and the FinBERT model).
# NOTE(review): the vaderSentiment package appears to ship its own lexicon, so
# the NLTK download below may be unnecessary — confirm before removing.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"

nltk.download('vader_lexicon')
vader_analyzer = SentimentIntensityAnalyzer()
# Tokenizer and classifier are loaded from the same checkpoint so they match.
finbert_tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)

# TA-Lib / ta
from ta.volatility import AverageTrueRange, BollingerBands
from ta.momentum import ROCIndicator
from ta.trend import MACD
from ta.momentum import RSIIndicator


In [None]:
# ===============================================
#          0. CONFIGURACIÓN Y PARÁMETROS
# ===============================================

# Asset list: every non-blank line of activos.txt has the form
# "symbol:source:search_name".
activos = {}
with open("activos.txt", "r") as f:
    for line_no, raw_line in enumerate(f, start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # describe no asset and must not break the unpacking below.
            continue
        # maxsplit=2 keeps any additional ":" inside search_name.
        parts = line.split(":", 2)
        if len(parts) != 3:
            raise ValueError(
                f"activos.txt línea {line_no}: se esperaba "
                f"'symbol:source:search_name', se obtuvo {line!r}"
            )
        symbol, source, search_name = parts
        activos[symbol.strip()] = {
            "source": source.strip().lower(),
            "search_name": search_name.strip()
        }

# General configuration: "key: value" pairs, one per line of config.txt.
# Lines without a ":" are ignored.
config = {}
with open("config.txt", "r") as f:
    for line in f:
        if ":" in line:
            # Split on the first ":" only, so values that contain ":" themselves
            # (times, URLs, ...) are kept whole instead of raising ValueError.
            key, _, value = line.strip().partition(":")
            config[key.strip()] = value.strip()

# Run parameters, with defaults used when the key is absent from config.txt.
granularity = config.get("frecuencia", "1h")
limit = int(config.get("limit", "100"))
period = config.get("period", "5d")

In [None]:
# --- Funciones delta / manejo de ficheros ---
def cargar_df_existente(symbol):
    """Load the stored history for ``symbol`` from ``{symbol}_datos.txt``.

    Returns:
        A ``(df, existing_dates)`` tuple. ``df`` is the tab-separated file read
        into a DataFrame with ``timestamp`` parsed as UTC datetimes (unparseable
        values become NaT); ``existing_dates`` is the set of calendar dates
        present in that column. When the file does not exist, an empty
        DataFrame and an empty set are returned.
    """
    history_path = f"{symbol}_datos.txt"

    # No history yet: nothing to load and no dates to exclude.
    if not os.path.exists(history_path):
        return pd.DataFrame(), set()

    history = pd.read_csv(history_path, sep="\t")
    history["timestamp"] = pd.to_datetime(
        history["timestamp"], utc=True, errors="coerce"
    )
    known_dates = set(history["timestamp"].dt.date)
    return history, known_dates

def filtrar_nuevas_fechas(df_new, existing_dates):
    """Return the rows of ``df_new`` whose calendar date is not already stored.

    Args:
        df_new: DataFrame with a ``timestamp`` column. The column is converted
            in place to UTC datetimes (this side effect is kept from the
            original implementation); no other column is added to ``df_new``.
        existing_dates: collection of ``datetime.date`` values already present
            in the stored history.

    Returns:
        A copy of the rows whose ``timestamp`` date is not in
        ``existing_dates``, with the same columns as ``df_new``.
    """
    df_new["timestamp"] = pd.to_datetime(df_new["timestamp"], utc=True)
    # Build the mask from a temporary Series rather than a scratch column, so
    # the caller's frame does not end up with a leftover "date_only" column.
    is_new_date = ~df_new["timestamp"].dt.date.isin(existing_dates)
    return df_new[is_new_date].copy()

def save_df(df, symbol):
    """Write ``df`` to ``{symbol}_datos.txt`` as tab-separated text (no index).

    Prints the destination file name and the number of rows written.
    """
    out_path = f"{symbol}_datos.txt"
    row_count = len(df)
    df.to_csv(out_path, sep="\t", index=False)
    print(f"Guardado en {out_path} | Filas totales: {row_count}")
# --- Helper that records what each run did ---
def registrar_log(message, log_file="run_summary.log"):
    """Append ``message`` to ``log_file`` as one timestamped line.

    Each line has the form ``[YYYY-MM-DD HH:MM:SS] message`` where the time is
    the local time at the moment of the call. The file is opened in append
    mode, so it is created if missing and earlier entries are kept.
    """
    from datetime import datetime as _datetime

    stamp = _datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"[{stamp}] {message}\n"
    with open(log_file, "a") as handle:
        handle.write(entry)


In [None]:
def descargar_datos_bitget(symbol, granularity, limit, timeout=30):
    """Download candles for ``symbol`` from Bitget's USDT-futures history endpoint.

    Args:
        symbol: instrument symbol sent to the API as ``symbol``.
        granularity: candle size sent to the API as ``granularity``.
        limit: maximum number of candles requested.
        timeout: seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the whole run.

    Returns:
        A DataFrame sorted by ``timestamp`` (UTC) with the columns
        ``timestamp, open, high, low, close, volume, quoteVolume, symbol``,
        or ``None`` when the request fails, the response is not usable, or the
        API returns no rows. Every failure is printed and written to the run log.
    """
    url = "https://api.bitget.com/api/v2/mix/market/history-candles"
    params = {"symbol": symbol, "productType": "USDT-FUTURES", "granularity": granularity, "limit": limit}
    print(f"Descargando desde Bitget: {symbol}...")

    # Network failures (DNS, refused connection, timeout) are reported the same
    # way as an HTTP error, so one bad asset does not abort the whole loop.
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error Bitget {symbol}: {exc}")
        registrar_log(f"Error Bitget {symbol}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error Bitget {symbol}: {response.text}")
        registrar_log(f"Error Bitget {symbol}: {response.text}")
        return None

    # A 200 response with a non-JSON body raises a ValueError subclass.
    try:
        data = response.json().get("data", [])
    except ValueError as exc:
        print(f"Error Bitget {symbol}: {exc}")
        registrar_log(f"Error Bitget {symbol}: {exc}")
        return None

    if not data:
        print(f"No hay datos para {symbol}")
        registrar_log(f"No hay datos para {symbol}")
        return None

    df = pd.DataFrame(data, columns=["timestamp", "open", "high", "low", "close", "volume", "quoteVolume"])
    # Millisecond epochs do not fit in 32 bits; request int64 explicitly rather
    # than relying on the platform's default integer width.
    df["timestamp"] = pd.to_datetime(df["timestamp"].astype("int64"), unit='ms', utc=True)
    df["symbol"] = symbol
    df = df.sort_values("timestamp")

    registrar_log(f"Descargados {len(df)} registros desde Bitget para {symbol}")
    return df

def descargar_datos_yf(symbol, interval, period, limit):
    """Download price bars for ``symbol`` from Yahoo Finance via ``yf.download``.

    Args:
        symbol: ticker passed to ``yf.download``.
        interval: bar size passed to ``yf.download``.
        period: look-back window passed to ``yf.download``.
        limit: maximum number of rows to keep; the most recent ones are kept.

    Returns:
        A DataFrame with lower-cased price columns, a ``timestamp`` column
        taken from the downloaded index and a ``symbol`` column, or ``None``
        when the download is empty.
    """
    print(f"Descargando desde Yahoo Finance: {symbol}...")
    data = yf.download(tickers=symbol, interval=interval, period=period)
    if data.empty:
        print(f"No se encontraron datos para {symbol}")
        registrar_log(f"No se encontraron datos para {symbol} en Yahoo Finance")
        return None

    # Flatten (field, ticker) MultiIndex columns down to the lower-cased field name.
    if isinstance(data.columns, pd.MultiIndex):
        data.columns = [col[0].lower() for col in data.columns]
    else:
        data.columns = [col.lower() for col in data.columns]

    # The index name depends on the interval (e.g. "Date" for daily bars,
    # "Datetime" for intraday ones). Naming the axis before reset_index always
    # produces the "timestamp" column that the rest of the pipeline reads;
    # renaming only "Date" left intraday downloads without it.
    data = data.rename_axis("timestamp").reset_index()
    data["symbol"] = symbol
    # Keep the newest rows: head() would discard the latest bars whenever the
    # download is longer than `limit`, so they would never reach the history.
    data = data.tail(limit)

    registrar_log(f"Descargados {len(data)} registros desde Yahoo Finance para {symbol}")
    return data

In [None]:
# --- Calculo de indicadores ---
def calcular_features(df):
    """Add price/volume features and technical indicators to ``df`` in place.

    Args:
        df: DataFrame with ``open``, ``high``, ``low``, ``close`` and ``volume``
            columns; they are coerced to numeric (bad values become NaN).
            Rows are used in their current order. NOTE(review): the rolling
            and shifted features presumably expect chronological order —
            confirm the caller guarantees it.

    Returns:
        The same DataFrame object, with the feature columns added and every
        NaN or infinite value replaced by 0.
    """
    for col in ['open','high','low','close','volume']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    prev_close = df['close'].shift(1)
    prev_volume = df['volume'].shift(1)

    df['gap_apertura_pct'] = (df['open'] - prev_close) / prev_close * 100
    df['volatilidad_diaria'] = (df['high'] - df['low']) / df['low'] * 100
    df['vol_5d'] = df['volatilidad_diaria'].rolling(5).std()
    df['vol_10d'] = df['volatilidad_diaria'].rolling(10).std()
    df['vol_rel_5d'] = df['volume'] / df['volume'].rolling(5).mean()
    df['return_pct'] = (df['close'] - prev_close) / prev_close * 100
    df['close_open_pct'] = (df['close'] - df['open']) / df['open'] * 100
    # Same formula as volatilidad_diaria; kept as its own column for consumers.
    df['range_pct'] = (df['high'] - df['low']) / df['low'] * 100
    df['volume_change_pct'] = (df['volume'] - prev_volume) / prev_volume * 100

    print(f"[DEBUG] Filas antes de indicadores: {len(df)}")
    df['rsi_14'] = RSIIndicator(close=df['close'], window=14).rsi()
    macd = MACD(close=df['close'], window_slow=26, window_fast=12, window_sign=9)
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['macd_diff'] = macd.macd_diff()
    df['atr_14'] = AverageTrueRange(high=df['high'], low=df['low'], close=df['close'], window=14).average_true_range()
    df['momentum_12'] = ROCIndicator(close=df['close'], window=12).roc()
    bb = BollingerBands(close=df['close'], window=20, window_dev=2)
    df['bb_upper'] = bb.bollinger_hband()
    df['bb_lower'] = bb.bollinger_lband()
    df['bb_pctb'] = bb.bollinger_pband()

    # Ratios divide by volume/price values that can be 0, which yields +/-inf.
    # fillna() alone leaves those in place, so convert them to NaN first.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df


In [None]:
def procesar_activo(symbol, source, search_name, granularity, period, limit):
    """Download, de-duplicate, enrich and persist the data for one asset.

    Steps: load the stored history, download fresh rows (Bitget when
    ``source == "bitget"``, Yahoo Finance otherwise), write the raw download
    and the newly added rows to ``logs/``, append the new rows to the history,
    recompute the features over the whole history and save it.

    Args:
        symbol: asset identifier; also used to build the output file names.
        source: data source name; ``"bitget"`` selects Bitget.
        search_name: accepted for interface compatibility; not used here.
        granularity: bar size forwarded to the downloader.
        period: look-back window forwarded to the Yahoo Finance downloader.
        limit: maximum number of rows forwarded to the downloader.

    Returns:
        None. Outcomes are printed and appended to the run log.
    """
    df_existing, existing_dates = cargar_df_existente(symbol)

    # Download from the configured source.
    if source == "bitget":
        df_new = descargar_datos_bitget(symbol, granularity, limit)
    else:
        df_new = descargar_datos_yf(symbol, granularity, period, limit)

    if df_new is None or df_new.empty:
        print(f"No se descargaron datos para {symbol}")
        registrar_log(f"{symbol} - No se descargaron datos")
        return

    # One UTC timestamp for the whole run: the stamped columns and both log
    # file names stay consistent even if the run crosses midnight, and the
    # deprecated Timestamp.utcnow() is avoided.
    run_time = pd.Timestamp.now(tz="UTC")
    run_date = run_time.date()

    os.makedirs("logs", exist_ok=True)
    # Keep a copy of everything downloaded in this run.
    df_new["download_date"] = run_time
    df_new.to_csv(f"logs/{symbol}_descargadas_{run_date}.csv", index=False)

    # Keep only rows whose date is not yet in the history.
    # NOTE(review): the comparison is per calendar date, so with intraday bars
    # any later rows of a date that is already stored are dropped — confirm
    # this is intended.
    df_to_add = filtrar_nuevas_fechas(df_new, existing_dates)

    if df_to_add.empty:
        print(f"No hay nuevas fechas para {symbol}")
        registrar_log(f"{symbol} - No hay nuevas filas para agregar")
        return

    # Keep a copy of the rows that are actually appended.
    df_to_add["added_date"] = run_time
    df_to_add.to_csv(f"logs/{symbol}_nuevas_agregadas_{run_date}.csv", index=False)

    # History + new rows (or just the new rows on the first run).
    if df_existing.empty:
        df_final = df_to_add
    else:
        df_final = pd.concat([df_existing, df_to_add], ignore_index=True)

    # Recompute the features over the full history.
    df_final = calcular_features(df_final)

    # Persist the updated history.
    save_df(df_final, symbol)

    # Summary line for the run log.
    registrar_log(f"{symbol} - Descargadas {len(df_new)} filas, agregadas {len(df_to_add)} filas, total histórico {len(df_final)}")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# ===============================================
#             3. EJECUCIÓN PRINCIPAL
# ===============================================

# Process every configured asset, in the order it appears in `activos`.
for symbol, info in activos.items():
    source, search_name = info["source"], info["search_name"]
    procesar_activo(
        symbol=symbol,
        source=source,
        search_name=search_name,
        granularity=granularity,
        period=period,
        limit=limit,
    )
