# Leer parámetros desde env (TICKERS, START_DATE, END_DATE). 


In [27]:
# Instalar librerías necesarias (si no están en la imagen base)
!pip install pandas yfinance psycopg2-binary lxml 



# Imports y Configuración de DB

In [28]:
import os
import time
from datetime import datetime, timezone
from urllib.parse import quote_plus

import pandas as pd
import yfinance as yf
from sqlalchemy import create_engine, text

# --- Database connection parameters (read from the environment) ---
PG_USER = os.getenv('PG_USER')
PG_PASSWORD = os.getenv('PG_PASSWORD')
PG_HOST = os.getenv('PG_HOST')
PG_PORT = os.getenv('PG_PORT')
PG_DB = os.getenv('PG_DB')

# --- Business parameters ---
# NOTE(review): the variable is named TICKERS but the rest of the notebook treats the
# value as a single symbol — confirm it never carries a comma-separated list.
TICKER = os.getenv('TICKERS')
START_DATE = os.getenv('START_DATE')
END_DATE = os.getenv('END_DATE')

# Fail fast with a readable message instead of building a URL full of "None" values.
required_params = {
    'PG_USER': PG_USER,
    'PG_HOST': PG_HOST,
    'PG_PORT': PG_PORT,
    'PG_DB': PG_DB,
    'TICKERS': TICKER,
}
missing_params = [name for name, value in required_params.items() if not value]
if missing_params:
    raise EnvironmentError(
        f"Faltan variables de entorno requeridas: {', '.join(missing_params)}"
    )

# SQLAlchemy connection string. User and password are URL-encoded so that special
# characters such as '@', ':' or '/' do not corrupt the URL.
db_url = (
    f"postgresql://{quote_plus(PG_USER)}:{quote_plus(PG_PASSWORD or '')}"
    f"@{PG_HOST}:{PG_PORT}/{PG_DB}"
)
engine = create_engine(db_url)

print(f"Configuración cargada: Ticker={TICKER}, Rango={START_DATE} a {END_DATE}")

Configuración cargada: Ticker=NVDA, Rango=2020-01-01 a 2024-12-31


# Extracción (Yahoo Finance)

In [29]:
print(f"Descargando datos para {TICKER}...")

# Retry the download a few times with a linear backoff (5s, 10s, ...).
max_retries = 3
for attempt in range(max_retries):
    try:
        # Daily bars. NOTE(review): the recorded run ends on 2024-12-30 for
        # END_DATE=2024-12-31, so `end` appears to be exclusive — confirm that is intended.
        df_raw = yf.download(
            tickers=TICKER,
            start=START_DATE,
            end=END_DATE,
            interval='1d',
            progress=False,
            auto_adjust=False  # keep raw prices and adjusted close as separate columns
        )

        # yfinance can signal a failed download by returning an empty frame rather than
        # raising; treat that as an error so the retry logic actually applies.
        if df_raw.empty:
            raise ValueError(f"yfinance no devolvió datos para {TICKER}")

        # yfinance may return MultiIndex columns even for a single ticker; keep only
        # the first level (the price field names).
        if isinstance(df_raw.columns, pd.MultiIndex):
            df_raw.columns = df_raw.columns.get_level_values(0)

        # Move the date index into a regular column.
        df_raw = df_raw.reset_index()

        print(f"Filas descargadas: {len(df_raw)}")
        break
    except Exception as e:
        print(f"Error al descargar datos (intento {attempt + 1}): {e}")
        # Out of attempts: propagate immediately instead of sleeping first.
        if attempt == max_retries - 1:
            raise
        time.sleep(5 * (attempt + 1))


Descargando datos para NVDA...
Filas descargadas: 1257


# Aumento de metadatos

In [30]:
# Read the clock once so every metadata column of this run refers to the same instant.
# datetime.utcnow() is deprecated; take an aware UTC "now" and drop the tzinfo so the
# stored value stays a naive UTC timestamp, as before.
run_started_utc = datetime.now(timezone.utc).replace(tzinfo=None)

df_raw['ingested_at_utc'] = run_started_utc
df_raw['ticker'] = TICKER
# run_id is derived from the same UTC instant (it previously used the local clock, which
# could disagree with ingested_at_utc on hosts not running in UTC).
df_raw['run_id'] = run_started_utc.strftime('run_%Y%m%d_%H%M')
df_raw['source_name'] = 'yahoo_finance'

# Renombrar columnas a minúsculas

In [31]:
# Build the yfinance-label -> snake_case mapping from the list of source labels:
# lower-case each label and replace spaces with underscores ('Adj Close' -> 'adj_close').
yahoo_labels = ('Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume')
column_mapping = {
    label: label.lower().replace(' ', '_')
    for label in yahoo_labels
}

# Apply the mapping; labels not present in the mapping are left untouched.
df_raw = df_raw.rename(columns=column_mapping)
df_raw.head()

Price,date,adj_close,close,high,low,open,volume,ingested_at_utc,ticker,run_id,source_name
0,2020-01-02,5.971078,5.99775,5.99775,5.918,5.96875,237536000,2025-12-06 02:20:34.780610,NVDA,run_20251206_0220,yahoo_finance
1,2020-01-03,5.875504,5.90175,5.94575,5.8525,5.8775,205384000,2025-12-06 02:20:34.780610,NVDA,run_20251206_0220,yahoo_finance
2,2020-01-06,5.900144,5.9265,5.93175,5.78175,5.808,262636000,2025-12-06 02:20:34.780610,NVDA,run_20251206_0220,yahoo_finance
3,2020-01-07,5.971576,5.99825,6.04425,5.90975,5.955,314856000,2025-12-06 02:20:34.780610,NVDA,run_20251206_0220,yahoo_finance
4,2020-01-08,5.982776,6.0095,6.051,5.95375,5.994,277108000,2025-12-06 02:20:34.780610,NVDA,run_20251206_0220,yahoo_finance


# Carga a Postgres

In [32]:
# Load target and the date window covered by this batch.
current_ticker = TICKER

# An empty frame would produce NaT bounds for the DELETE below; stop with a clear error.
if df_raw.empty:
    raise ValueError(f"No hay filas para cargar para {current_ticker}")

min_date = df_raw['date'].min()
max_date = df_raw['date'].max()


table_name = 'prices_daily'
schema = os.getenv('RAW_SCHEMA', 'raw')
# NOTE(review): schema and table_name are interpolated into the SQL text below because
# identifiers cannot be bound as parameters; RAW_SCHEMA must come from trusted config.

# Ensure the target schema and table exist, retrying with a linear backoff (5s, 10s, ...).
print(f"Iniciando carga para {current_ticker} ({min_date} a {max_date})")
max_retries = 3
for attempt in range(max_retries):
    try:
        print(f"Asegurando existencia de tabla {schema}.{table_name}...")
        with engine.begin() as conn:
            conn.execute(text(f"CREATE SCHEMA IF NOT EXISTS {schema};"))
            conn.execute(text(f"""
                CREATE TABLE IF NOT EXISTS {schema}.{table_name} (
                    date DATE,
                    ticker VARCHAR(20),
                    open NUMERIC,
                    high NUMERIC,
                    low NUMERIC,
                    close NUMERIC,
                    adj_close NUMERIC,
                    volume BIGINT,
                    source_name VARCHAR(50),
                    ingested_at_utc TIMESTAMP,
                    run_id VARCHAR(50)
                );
            """))
        break
    except Exception as e:
        print(f"Error al asegurar tabla (intento {attempt + 1}): {e}")
        # Out of attempts: propagate immediately instead of sleeping first.
        if attempt == max_retries - 1:
            raise
        time.sleep(5 * (attempt + 1))


print(f"Cargando datos en {schema}.{table_name}")

# Replace the rows for this ticker and date window, retrying on failure.
for attempt in range(max_retries):
    try:
        # One transaction for DELETE + INSERT: either both are committed or both are
        # rolled back, so a failed insert can never leave the window empty.
        with engine.begin() as conn:
            # Target the same schema/table that was created above (not a hardcoded name).
            delete_query = text(f"""
                DELETE FROM {schema}.{table_name}
                WHERE ticker = :ticker
                AND date >= :min_date
                AND date <= :max_date
            """)

            result = conn.execute(delete_query, {
                'ticker': current_ticker,
                'min_date': min_date,
                'max_date': max_date
            })
            deleted_rows = result.rowcount

            # Insert through the same connection so the rows join the open transaction.
            df_raw.to_sql(
                name=table_name,
                con=conn,
                schema=schema,
                if_exists='append',  # keep existing history; only this window was cleared
                index=False,
                chunksize=1000  # insert in batches to bound memory use
            )

        # Report only after the transaction has been committed.
        print(f"Filas eliminadas previamente: {deleted_rows}")
        print(f"Filas insertadas: {len(df_raw)}")
        break
    except Exception as e:
        print(f"Error durante la insercion (intento {attempt + 1}): {e}")
        # Out of attempts: propagate immediately instead of sleeping first.
        if attempt == max_retries - 1:
            raise
        # Wait before retrying.
        time.sleep(5 * (attempt + 1))



Iniciando carga para NVDA (2020-01-02 00:00:00 a 2024-12-30 00:00:00)
Asegurando existencia de tabla raw.prices_daily...
Cargando datos en raw.prices_daily
Filas eliminadas previamente: 1257
Filas insertadas: 1257


# Carga de fechas de earnings (earnings dates)

In [34]:

# Columns kept for the cleaned earnings table.
earnings_columns = ['earnings_date', 'ticker', 'eps_estimate', 'reported_eps']

ticker_obj = yf.Ticker(TICKER)
earnings_df = ticker_obj.get_earnings_dates(limit=20)  # request up to 20 earnings rows

if earnings_df is None or earnings_df.empty:
    # yfinance may return None when no earnings data is available; continue with an
    # empty, correctly-shaped frame instead of failing on reset_index().
    print(f"Sin fechas de earnings disponibles para {TICKER}")
    earnings_df = pd.DataFrame(columns=earnings_columns)
else:
    # The earnings date arrives as the index; move it into a column and normalise names.
    earnings_df = earnings_df.reset_index().rename(
        columns={
            'Earnings Date': 'earnings_date',
            'EPS Estimate': 'eps_estimate',
            'Reported EPS': 'reported_eps',
        }
    )

    # Keep only the calendar date (no time of day) so it can be joined with daily prices.
    earnings_df['earnings_date'] = pd.to_datetime(earnings_df['earnings_date']).dt.date
    earnings_df['ticker'] = TICKER

# Only the key columns are needed downstream.
df_earnings_clean = earnings_df[earnings_columns].copy()
df_earnings_clean.head(10)

Unnamed: 0,earnings_date,ticker,eps_estimate,reported_eps
0,2026-02-25,NVDA,1.52,
1,2025-11-19,NVDA,1.26,1.3
2,2025-08-27,NVDA,1.01,1.05
3,2025-05-28,NVDA,0.75,0.81
4,2025-02-26,NVDA,0.85,0.89
5,2024-11-20,NVDA,0.75,0.81
6,2024-08-28,NVDA,0.64,0.68
7,2024-05-22,NVDA,0.56,0.61
8,2024-02-21,NVDA,0.46,0.52
9,2023-11-21,NVDA,0.34,0.4
