<a href="https://colab.research.google.com/github/4L3M4R/cerbero/blob/main/cerbero-pre.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================
#             CERBERO PREMARKET
#   Descarga noticias y calcula sentiment
#   (Actualiza el día anterior en los archivos *_datos.txt)
# ===============================================

import os
import pandas as pd
import feedparser
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from datetime import datetime, timedelta
from urllib.parse import quote

# ===============================================
#             CONFIGURACIÓN
# ===============================================

# Asset list: activos.txt holds one "symbol:source:search_name" entry per line.
# Result maps each symbol to {"source": <lower-cased source>, "search_name": <str>}.
activos = {}
with open("activos.txt", "r") as f:
    for line_number, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. trailing ones) carry no asset; skip them
            # instead of failing on tuple unpacking.
            continue
        # maxsplit=2 keeps any further ":" inside the search name intact.
        parts = line.split(":", 2)
        if len(parts) != 3:
            raise ValueError(
                f"activos.txt line {line_number}: expected "
                f"'symbol:source:search_name', got {line!r}"
            )
        symbol, source, search_name = parts
        activos[symbol.strip()] = {
            "source": source.strip().lower(),
            "search_name": search_name.strip()
        }

# Sentiment analyzers shared by the functions below.
# NOTE(review): SentimentIntensityAnalyzer is imported from the vaderSentiment
# package rather than nltk — confirm whether this nltk download is still needed.
nltk.download('vader_lexicon')
vader_analyzer = SentimentIntensityAnalyzer()

# Tokenizer and classification model are loaded from the same checkpoint.
_finbert_checkpoint = "yiyanghkust/finbert-tone"
finbert_tokenizer = AutoTokenizer.from_pretrained(_finbert_checkpoint)
finbert_model = AutoModelForSequenceClassification.from_pretrained(_finbert_checkpoint)

# ===============================================
#           FUNCIONES AUXILIARES
# ===============================================

def registrar_log(message, log_file="run_summary_pre.log"):
    """Append ``message`` to ``log_file`` as a single timestamped line.

    Each line has the form ``[YYYY-MM-DD HH:MM:SS] message`` using the local
    time returned by ``datetime.now()``. The file is opened in append mode, so
    it is created when missing and earlier entries are preserved.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    entry = f"[{stamp}] {message}\n"
    with open(log_file, "a") as handle:
        handle.write(entry)

def descargar_noticias_y_calcular_sentiment_df(df, symbol, search_name):
    """
    Download news for an asset, score the headlines, and update ``df``.

    The Google News RSS search feed is queried with ``search_name``. Each
    headline gets a VADER compound score; the FinBERT score is a placeholder
    fixed at 0. The scored headlines are written to
    ``logs/<symbol>_nuevas_agregadas_<UTC date>.csv`` and the averages are
    stored in ``df``:

    * ``df`` has no rows: a single row dated yesterday (UTC) is created,
      keeping whatever columns ``df`` already had.
    * otherwise: every row whose date equals the latest date in
      ``df["timestamp"]`` is updated; if no such row can be found, a new row
      dated yesterday (UTC) is appended.

    Parameters:
        df: existing per-asset data (may be empty). When it has rows it is
            modified in place (its "timestamp" column is converted to naive
            datetimes) — assumes a "timestamp" column is present in that case.
        symbol: asset symbol, used in log messages and the output file name.
        search_name: free-text query sent to the news feed.

    Returns:
        The updated DataFrame (nothing is saved to the main data file here).
        When no news is found, ``df`` is returned unchanged.
    """
    import os
    import pandas as pd
    from urllib.parse import quote
    import feedparser

    os.makedirs("logs", exist_ok=True)

    # ========================
    # Download news
    # ========================
    query = quote(search_name)
    feed = feedparser.parse(f"https://news.google.com/rss/search?q={query}")
    # Entries are dict-like; .get avoids an AttributeError when a feed item
    # lacks one of the fields. A missing title becomes "" so it can be scored.
    noticias = [
        {
            "timestamp": entry.get("published"),
            "title": entry.get("title") or "",
            "link": entry.get("link"),
        }
        for entry in feed.entries
    ]

    noticias_df = pd.DataFrame(noticias)
    if noticias_df.empty:
        registrar_log(f"{symbol} - No se encontraron noticias")
        return df

    # Score headlines with VADER (FinBERT is a placeholder for now)
    noticias_df["vader_sentiment"] = noticias_df["title"].apply(
        lambda title: vader_analyzer.polarity_scores(title)["compound"]
    )
    noticias_df["finbert_sentiment"] = 0

    # Averages over all downloaded headlines
    vader_promedio = noticias_df["vader_sentiment"].mean()
    finbert_promedio = noticias_df["finbert_sentiment"].mean()

    # Save the processed headlines
    today = pd.Timestamp.now(tz="UTC").date()
    noticias_df.to_csv(f"logs/{symbol}_nuevas_agregadas_{today}.csv", index=False)
    registrar_log(f"{symbol} - Guardadas {len(noticias_df)} noticias con sentiment")

    # Values written to the main DataFrame, computed once for every branch.
    sentiment_values = {
        "vader": vader_promedio,
        "finbert": finbert_promedio,
        # The small epsilon avoids division by zero while finbert is 0.
        "sentiment_ratio": vader_promedio / (finbert_promedio + 1e-9),
        "sentiment_combined": (vader_promedio + finbert_promedio) / 2,
    }
    yesterday = (today - pd.Timedelta(days=1)).strftime("%Y-%m-%d")

    # ========================
    # Update the main DataFrame
    # ========================
    if df.empty:
        # Minimal row that keeps any columns the (row-less) input already had.
        new_row = {col: None for col in df.columns}
        new_row.update({"timestamp": yesterday, **sentiment_values})
        return pd.DataFrame([new_row])

    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    df["timestamp"] = df["timestamp"].dt.tz_localize(None)
    # Temporary helper column used to locate the rows of the latest date.
    df["date_only"] = df["timestamp"].dt.date
    last_date = df["date_only"].max()
    last_idx = df[df["date_only"] == last_date].index

    if len(last_idx) == 0:
        # No row matches the latest date (e.g. no timestamp could be parsed):
        # append a minimal row that carries all existing columns.
        new_row = {col: None for col in df.columns}
        new_row.update({"timestamp": yesterday, **sentiment_values})
        df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
    else:
        # Update the row(s) of the latest date
        for column, value in sentiment_values.items():
            df.loc[last_idx, column] = value

    df.drop(columns=["date_only"], inplace=True, errors="ignore")

    return df

def calcular_target_df(df, symbol=None):
    """
    Compute the numeric target and its category from ``return_pct``.

    Thresholds and mode are read from ``config.txt`` (``key = value`` lines;
    lines without "=" and unknown keys are ignored). Recognised keys:
    ``high_threshold``, ``medium_threshold``, ``low_threshold``,
    ``below_threshold`` (floats) and ``mode``. With ``mode`` equal to
    "daily" (the default) only the last row is processed; any other value
    processes every row.

    All existing columns are kept and these are added/updated:
        - target (numeric; currently a copy of return_pct)
        - target_category ("Very High", "High", "Medium", "Low", "Negative")
        - target_num (the same levels as the strings "4" .. "0")

    Parameters:
        df: DataFrame to update in place; a ``return_pct`` column filled with
            NA is created when missing.
        symbol: unused; kept for interface compatibility.

    Returns:
        The same DataFrame, updated.

    Raises:
        FileNotFoundError: if config.txt does not exist.
        ValueError: if a threshold value is not a valid float.
        KeyError: if a needed threshold is missing from config.txt and a
            non-missing return has to be categorised.
    """
    import pandas as pd

    # ========================
    # Read thresholds and mode
    # ========================
    threshold_keys = (
        "high_threshold",
        "medium_threshold",
        "low_threshold",
        "below_threshold",
    )
    target_config = {}
    mode = "daily"
    with open("config.txt", "r") as f:
        for line in f:
            # partition splits on the first "=" only, so lines containing
            # several "=" (e.g. comments) no longer break tuple unpacking.
            key, separator, value = line.partition("=")
            if not separator:
                continue
            key = key.strip()
            value = value.strip()
            if key in threshold_keys:
                target_config[key] = float(value)
            elif key == "mode":
                mode = value.lower()

    # ========================
    # Create output columns if missing
    # ========================
    if "target_category" not in df.columns:
        df["target_category"] = None
    if "target_num" not in df.columns:
        df["target_num"] = None
    if "target" not in df.columns:
        df["target"] = pd.NA

    # ========================
    # Categorisation
    # ========================
    # Checked from highest to lowest; the first threshold reached wins.
    levels = (
        ("high_threshold", "Very High", "4"),
        ("medium_threshold", "High", "3"),
        ("low_threshold", "Medium", "2"),
        ("below_threshold", "Low", "1"),
    )

    def _level(r):
        """Return (label, code) for a return value, or None when missing."""
        if pd.isna(r):
            return None
        for key, label, code in levels:
            if r >= target_config[key]:
                return label, code
        return "Negative", "0"

    def categorizar(r):
        level = _level(r)
        return None if level is None else level[0]

    def categorizar_num(r):
        level = _level(r)
        return None if level is None else level[1]

    # ========================
    # Numeric target
    # ========================
    def calcular_target(r):
        if pd.isna(r):
            return pd.NA
        # The target is simply return_pct; adjust here for another formula.
        return r

    # ========================
    # Make sure return_pct exists
    # ========================
    if "return_pct" not in df.columns:
        df["return_pct"] = pd.NA

    # Select rows according to the mode
    if mode == "daily":
        idx = df.index[-1:]  # last row only
    else:
        idx = df.index       # every row

    # Apply the calculations
    returns = df.loc[idx, "return_pct"]
    df.loc[idx, "target"] = returns.apply(calcular_target)
    df.loc[idx, "target_category"] = returns.apply(categorizar)
    df.loc[idx, "target_num"] = returns.apply(categorizar_num)

    return df




# ===============================================
#           EJECUCIÓN PRINCIPAL
# ===============================================


# For every configured asset: load its tab-separated history (if any),
# refresh the sentiment columns, recompute the targets and save once.
for symbol, info in activos.items():
    filename = f"{symbol}_datos.txt"

    # Start from the stored data when the file exists, otherwise from scratch.
    if os.path.exists(filename):
        df = pd.read_csv(filename, sep="\t")
    else:
        df = pd.DataFrame()

    # Download news and update the sentiment in memory.
    df = descargar_noticias_y_calcular_sentiment_df(
        df, symbol, info["search_name"]
    )

    # Compute targets; the function reads thresholds and mode from config.txt.
    df = calcular_target_df(df, symbol)

    # Write the whole DataFrame back a single time.
    df.to_csv(filename, sep="\t", index=False)

print("✅ Completed premarket (sentiment del día anterior actualizado)")
