In [9]:
from pathlib import Path
import sys

def find_project_root(start_path: Path, marker: str = "src") -> Path:
    """Return the closest directory, at or above ``start_path``, that contains ``marker``.

    Args:
        start_path: Location where the upward search begins; it is resolved
            to an absolute path first.
        marker: Name of the file or directory whose presence identifies the
            project root.

    Returns:
        The first directory (starting with ``start_path`` itself, then each
        ancestor in turn) in which ``marker`` exists.

    Raises:
        FileNotFoundError: If no directory on the way up contains ``marker``.
    """
    origin = start_path.resolve()
    # Nearest candidate first, so the innermost match wins.
    candidates = (origin, *origin.parents)
    root = next((folder for folder in candidates if (folder / marker).exists()), None)
    if root is None:
        raise FileNotFoundError("No se encontró el root del proyecto.")
    return root

# Detect the project root by searching upward from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
print("Project root:", PROJECT_ROOT)

# Add the root to the module search path. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

Project root: C:\Users\aleja\OneDrive\Documentos\Proyectos\Python\Scraping


In [10]:
import pandas as pd
import numpy as np

In [11]:
# JSON file with the news records, addressed relative to the detected project root.
dataset_path = PROJECT_ROOT / "src" / "data" / "resultados" / "dataset_clasificado.json"

df_news = pd.read_json(dataset_path)

# Quick sanity check: frame dimensions, then the first rows as the cell output.
print("Shape noticias:", df_news.shape)
df_news.head()

Shape noticias: (1306, 6)


Unnamed: 0,titulo,subtitulo,fecha,fuente,sentimiento_label,sentimiento_score
0,México ‘está esencialmente dirigido por cártel...,A solo unos días de que asuma la presidencia d...,07-01-2025,el_financiero,POS,0.695107
1,¿Por las malas? Trump usará ‘fuerza económica’...,Esto dejó de sonar como una broma. El presiden...,07-01-2025,el_financiero,NEG,0.799018
2,Trump se pone ‘belicón’: No descarta acción mi...,"El presidente electo Donald Trump, que quiere ...",07-01-2025,el_financiero,NEU,0.740869
3,Primera ministra de Dinamarca contesta a Trump...,"La primera ministra de Dinamarca, Mette Freder...",07-01-2025,el_financiero,POS,0.912642
4,¿Trump teme ser sentenciado? Pide a la Corte S...,El presidente electo Donald Trump solicitó el ...,08-01-2025,el_financiero,NEG,0.64595


In [12]:
# Parse `fecha` with an explicit day-first format (dd-mm-YYYY) so a value such as
# 07-01-2025 is read as 7 January rather than left to format inference.
df_news["fecha"] = pd.to_datetime(df_news["fecha"], format="%d-%m-%Y")
# Order the rows chronologically and rebuild a 0..n-1 index.
df_news = df_news.sort_values("fecha").reset_index(drop=True)

# Report the first and last date present in the news frame.
print("Rango noticias:")
print(df_news["fecha"].min(), "->", df_news["fecha"].max())

Rango noticias:
2025-01-07 00:00:00 -> 2026-01-30 00:00:00


In [13]:
# Directory containing the raw CSV files that are passed to load_index below.
raws_path = PROJECT_ROOT / "src" / "data" / "raws"

def load_index(path: Path, prefix: str):
    """Read a price CSV and return its dates and closing values as a tidy frame.

    The file is read with its second physical line skipped. Column names are
    stripped of surrounding whitespace, any row whose ``Price`` cell is the
    literal ``"Date"`` is discarded, ``Price`` is parsed as a datetime and
    ``Close`` is converted to a number (unparseable values become NaN).

    Args:
        path: Location of the CSV file.
        prefix: Label used to name the closing-price column (``close_<prefix>``).

    Returns:
        A DataFrame with columns ``fecha`` and ``close_<prefix>``, sorted by
        ``fecha`` and carrying a fresh 0..n-1 index.

    Raises:
        ValueError: If the file does not provide both ``Price`` and ``Close``.
    """
    df = pd.read_csv(path, skiprows=[1])
    df.columns = df.columns.str.strip()

    # Guard clause: both columns are required by everything that follows.
    if not {"Price", "Close"}.issubset(df.columns):
        raise ValueError(
            f"Estructura inesperada en {path}. Columnas detectadas: {df.columns}"
        )

    is_data_row = df["Price"] != "Date"  # drop the leftover "Date" label row
    tidy = (
        df.loc[is_data_row, ["Price", "Close"]]
        .assign(
            Price=lambda frame: pd.to_datetime(frame["Price"]),
            Close=lambda frame: pd.to_numeric(frame["Close"], errors="coerce"),
        )
        .sort_values("Price")
        .rename(columns={"Price": "fecha", "Close": f"close_{prefix}"})
        .reset_index(drop=True)
    )
    return tidy

In [14]:
# Load one price history per index; the second argument becomes the suffix of
# the resulting closing-price column (close_sp500, close_nasdaq, close_dji).
df_sp500 = load_index(raws_path / "SP500_historico.csv", "sp500")
df_nasdaq = load_index(raws_path / "NASDAQ_historico.csv", "nasdaq")
df_dji = load_index(raws_path / "DJI_historico.csv", "dji")

# Sanity check: dimensions of each loaded frame.
print("SP500:", df_sp500.shape)
print("NASDAQ:", df_nasdaq.shape)
print("DJI:", df_dji.shape)

SP500: (275, 2)
NASDAQ: (275, 2)
DJI: (275, 2)


In [15]:
# Build the modelling frame from the news and add the two lookup dates used as
# join keys by merge_index: the news date itself (t) and the date seven
# calendar days later (t+7). The result is ordered by the t date.
df_model = df_news.assign(
    fecha_t=df_news["fecha"],
    fecha_t7=df_news["fecha"] + pd.Timedelta(days=7),
).sort_values("fecha_t")

In [16]:
def _attach_forward_close(df_left, df_quotes, left_key, close_col, out_col):
    """Add ``out_col``: the first close quoted on or after each row's ``left_key`` date."""
    # A private name for the right-hand date key means the join never depends
    # on pandas' `_x`/`_y` suffixing of a shared `fecha` column.
    tmp_key = "__index_fecha__"
    quotes = df_quotes[["fecha", close_col]].rename(
        columns={"fecha": tmp_key, close_col: out_col}
    )
    # Drop a stale copy of the output column so calling this on an already
    # merged frame replaces the values instead of producing suffixed duplicates.
    left = df_left.drop(columns=[out_col], errors="ignore")
    merged = pd.merge_asof(
        # merge_asof needs sorted keys; a stable sort keeps same-day rows in
        # their incoming order.
        left.sort_values(left_key, kind="stable"),
        quotes,
        left_on=left_key,
        right_on=tmp_key,
        direction="forward",
    )
    return merged.drop(columns=[tmp_key])


def merge_index(df_base, df_index, prefix):
    """Attach an index's closing value at ``fecha_t`` and at ``fecha_t7`` to each row.

    Both lookups are forward as-of joins: a row receives the close of the first
    ``df_index`` date that is greater than or equal to its lookup date, and NaN
    when no such date exists.

    Args:
        df_base: Frame with datetime columns ``fecha_t`` and ``fecha_t7``. All
            of its other columns are carried through unchanged; a ``fecha``
            column is not required.
        df_index: Frame with a datetime column ``fecha`` and a column
            ``close_<prefix>``. Only these two columns are used.
        prefix: Index label; selects ``close_<prefix>`` and names the outputs.

    Returns:
        A new DataFrame, ordered by ``fecha_t7`` with a fresh index, containing
        the columns of ``df_base`` plus ``close_<prefix>_t`` and
        ``close_<prefix>_t7``. If ``df_base`` already has those two columns
        they are recomputed, so the call is safe to repeat.

    Raises:
        KeyError: If ``df_index`` lacks ``fecha`` or ``close_<prefix>``, or
            ``df_base`` lacks ``fecha_t`` / ``fecha_t7``.
    """
    close_col = f"close_{prefix}"
    quotes = df_index.sort_values("fecha")

    # Close at t, then close at t+7, using the same lookup for both.
    df_out = _attach_forward_close(df_base, quotes, "fecha_t", close_col, f"{close_col}_t")
    df_out = _attach_forward_close(df_out, quotes, "fecha_t7", close_col, f"{close_col}_t7")

    return df_out

In [17]:
# Attach the closing values of each index in turn; every call adds a
# close_<index>_t and a close_<index>_t7 column to the frame.
# NOTE(review): this cell rebinds `df_model` from itself, so re-running it
# without first re-running the cell that builds `df_model` feeds an
# already-merged frame back into merge_index.
df_model = merge_index(df_model, df_sp500, "sp500")
df_model = merge_index(df_model, df_nasdaq, "nasdaq")
df_model = merge_index(df_model, df_dji, "dji")

print("Shape después de merges:", df_model.shape)

Shape después de merges: (1306, 14)


In [18]:
# Columns kept in the final table. The helper lookup dates (fecha_t, fecha_t7)
# are intentionally left out.
final_columns = [
    # news fields
    "titulo", "subtitulo", "fecha", "fuente",
    "sentimiento_label", "sentimiento_score",
    # index closes at t and t+7
    "close_sp500_t", "close_sp500_t7",
    "close_nasdaq_t", "close_nasdaq_t7",
    "close_dji_t", "close_dji_t7",
]

# Select, order chronologically and renumber the rows in one pass.
df_final = df_model.loc[:, final_columns].sort_values("fecha", ignore_index=True)

print("Shape final:", df_final.shape)
df_final.head()

Shape final: (1306, 12)


Unnamed: 0,titulo,subtitulo,fecha,fuente,sentimiento_label,sentimiento_score,close_sp500_t,close_sp500_t7,close_nasdaq_t,close_nasdaq_t7,close_dji_t,close_dji_t7
0,México ‘está esencialmente dirigido por cártel...,A solo unos días de que asuma la presidencia d...,2025-01-07,el_financiero,POS,0.695107,5909.029785,5842.910156,19489.679688,19044.390625,42528.359375,42518.28125
1,¿Por las malas? Trump usará ‘fuerza económica’...,Esto dejó de sonar como una broma. El presiden...,2025-01-07,el_financiero,NEG,0.799018,5909.029785,5842.910156,19489.679688,19044.390625,42528.359375,42518.28125
2,Trump se pone ‘belicón’: No descarta acción mi...,"El presidente electo Donald Trump, que quiere ...",2025-01-07,el_financiero,NEU,0.740869,5909.029785,5842.910156,19489.679688,19044.390625,42528.359375,42518.28125
3,Primera ministra de Dinamarca contesta a Trump...,"La primera ministra de Dinamarca, Mette Freder...",2025-01-07,el_financiero,POS,0.912642,5909.029785,5842.910156,19489.679688,19044.390625,42528.359375,42518.28125
4,Trump prevé declarar emergencia económica para...,Los bonos y las acciones extendieron una ola d...,2025-01-08,el_financiero,NEU,0.844226,5918.25,5949.910156,19478.880859,19511.230469,42635.199219,43221.550781
