# Treino do Recomendador Simples de Ações

Executar este notebook gera os artefatos em `artifacts/` para a API.


In [14]:
import os, json, math, time, datetime as dt
from pathlib import Path
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from joblib import dump

BASE = '../'
ART = f"{BASE}/artifacts"
os.makedirs(ART, exist_ok=True)

US_TICKERS = ["AAPL","MSFT","AMZN","GOOGL","META","NVDA","TSLA","JPM","V","PG","KO","PEP","NFLX","AMD","INTC","DIS"]
BR_TICKERS = ["PETR4.SA","VALE3.SA","ITUB4.SA","BBDC4.SA","ABEV3.SA","WEGE3.SA","BBAS3.SA","B3SA3.SA","RAIL3.SA","PRIO3.SA","LREN3.SA","GGBR4.SA"]
TICKERS = US_TICKERS + BR_TICKERS

def region_for(t):
    return "BR" if t.endswith(".SA") else "US"

def _normalize_ohlcv(df: pd.DataFrame) -> pd.DataFrame:
    if df is None or df.empty:
        return pd.DataFrame()
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(-1)
    df = df.rename(columns={c: c.strip() for c in df.columns})
    if "Close" not in df.columns:
        if "Adj Close" in df.columns:
            df["Close"] = df["Adj Close"]
        else:
            return pd.DataFrame()
    if "Volume" not in df.columns:
        df["Volume"] = np.nan
    keep = [c for c in ["Open","High","Low","Close","Adj Close","Volume"] if c in df.columns]
    return df[keep].copy()

def _reset_index_naive_utc(df: pd.DataFrame) -> pd.DataFrame:
    idx = df.index
    if hasattr(idx, "tz") and idx.tz is not None:
        df.index = idx.tz_convert("UTC").tz_localize(None)
    df = df.reset_index()
    date_col = "Date" if "Date" in df.columns else df.columns[0]
    df = df.rename(columns={date_col: "date"})
    return df

def fetch_single(ticker: str, start: dt.date, end: dt.date, interval="1d", tries=3, sleep_s=0.6) -> pd.DataFrame:
    for _ in range(tries):
        try:
            t = yf.Ticker(ticker)
            df = t.history(start=start, end=end, interval=interval, auto_adjust=False, actions=False, repair=True)
            df = _normalize_ohlcv(df)
            if not df.empty:
                df = _reset_index_naive_utc(df)
                df["ticker"] = ticker
                return df
        except Exception:
            pass
        time.sleep(sleep_s)
    try:
        df = yf.download(ticker, period="1y", interval=interval, auto_adjust=False, progress=False)
        df = _normalize_ohlcv(df)
        if not df.empty:
            df = _reset_index_naive_utc(df)
            df["ticker"] = ticker
            return df
    except Exception:
        pass
    return pd.DataFrame()

def fetch_history(tickers, period_days=420, interval="1d"):
    end = dt.date.today()
    start = end - dt.timedelta(days=period_days)
    frames = []
    for t in tickers:
        df = fetch_single(t, start=start, end=end, interval=interval)
        if not df.empty:
            frames.append(df)
    if not frames:
        return pd.DataFrame()
    out = pd.concat(frames, ignore_index=True)
    if "Volume" not in out.columns:
        out["Volume"] = 0.0
    else:
        out["Volume"] = out["Volume"].fillna(0.0)
    out = out.dropna(subset=["Close"])
    out["date"] = pd.to_datetime(out["date"], utc=False, errors="coerce")
    out = out.dropna(subset=["date"])
    return out

def compute_features(df):
    dfs = []
    for t, g in df.groupby("ticker"):
        g = g.sort_values("date").copy()
        g["ret_1d"] = g["Close"].pct_change()
        g["ret_1m"] = g["Close"].pct_change(21)
        g["ret_3m"] = g["Close"].pct_change(63)
        g["ret_6m"] = g["Close"].pct_change(126)
        g["vol_21"] = g["ret_1d"].rolling(21).std().fillna(0.0)
        g["vol_63"] = g["ret_1d"].rolling(63).std().fillna(0.0)
        g["volavg_21"] = g["Volume"].rolling(21).mean().bfill().fillna(0.0)
        g["volavg_63"] = g["Volume"].rolling(63).mean().bfill().fillna(0.0)
        last = g.iloc[-1:][["ticker","ret_1m","ret_3m","ret_6m","vol_21","vol_63","volavg_21","volavg_63","Close"]].copy()
        dfs.append(last)
    if not dfs:
        return pd.DataFrame()
    feat = pd.concat(dfs, ignore_index=True)
    return feat

def fetch_meta(ticker: str):
    name, sector = ticker, "Desconhecido"
    try:
        info = yf.Ticker(ticker).info
        name = info.get("shortName") or info.get("longName") or ticker
        sector = info.get("sector") or "Desconhecido"
    except Exception:
        try:
            fast = yf.Ticker(ticker).fast_info
            name = str(fast.get("shortName") or fast.get("longName") or ticker)
        except Exception:
            pass
    return {"ticker": ticker, "name": name, "setor": sector}

def label_vol(q):
    if q <= 0.33:
        return "baixa"
    if q >= 0.66:
        return "alta"
    return "media"

def label_liq(q):
    if q <= 0.33:
        return "baixa"
    if q >= 0.66:
        return "alta"
    return "media"

def label_trend(x):
    if x > 0.03:
        return "alta"
    if x < -0.03:
        return "baixa"
    return "estavel"

raw = fetch_history(TICKERS, period_days=420, interval="1d")
if raw.empty:
    raise RuntimeError("Sem dados baixados. Teste rede ou reduza TICKERS para ['AAPL','MSFT'] temporariamente.")
raw["date"] = pd.to_datetime(raw["date"], utc=False, errors="coerce")
raw = raw.dropna(subset=["date"])
feat = compute_features(raw)
if feat.empty:
    raise RuntimeError("Sem features calculadas.")
feat["pais"] = feat["ticker"].apply(region_for)
meta_rows = [fetch_meta(t) for t in feat["ticker"]]
meta = pd.DataFrame(meta_rows)
df = feat.merge(meta, on="ticker", how="left")
feature_cols = ["ret_1m","ret_3m","ret_6m","vol_21","vol_63","volavg_21","volavg_63"]
X = df[feature_cols].fillna(0.0).replace([np.inf,-np.inf],0.0)
pipe = Pipeline([("scaler", StandardScaler()), ("kmeans", KMeans(n_clusters=3, n_init=10, random_state=42))])
pipe.fit(X)
df["cluster"] = pipe.named_steps["kmeans"].labels_
cluster_stats = df.groupby("cluster")["vol_63"].mean().sort_values().reset_index()
order = cluster_stats["cluster"].tolist()
cluster_to_profile = {}
if len(order) == 3:
    cluster_to_profile[order[0]] = "Conservador"
    cluster_to_profile[order[1]] = "Equilibrado"
    cluster_to_profile[order[2]] = "Ousado"
else:
    for i, c in enumerate(order):
        cluster_to_profile[c] = f"Grupo {i+1}"
df["perfil_cluster"] = df["cluster"].map(cluster_to_profile)
v_quant = df["vol_63"].rank(pct=True)
l_quant = df["volavg_21"].rank(pct=True)
df["vol_label"] = v_quant.apply(label_vol)
df["liq_label"] = l_quant.apply(label_liq)
df["trend_label"] = df["ret_1m"].apply(label_trend)
final_cols = ["ticker","name","setor","pais","ret_1m","ret_3m","ret_6m","vol_21","vol_63","volavg_21","volavg_63","perfil_cluster","vol_label","liq_label","trend_label"]
universe = df[final_cols].copy()
universe.to_csv(f"{ART}/universe.csv", index=False)
with open(f"{ART}/feature_cols.json", "w", encoding="utf-8") as f:
    json.dump(feature_cols, f, ensure_ascii=False, indent=2)
with open(f"{ART}/cluster_map.json", "w", encoding="utf-8") as f:
    json.dump({str(k): v for k, v in cluster_to_profile.items()}, f, ensure_ascii=False, indent=2)
dump(pipe, f"{ART}/model.joblib")
print(str(ART))


..//artifacts
