In [None]:
import os, random, warnings
from typing import List, Tuple
import numpy as np
import pandas as pd
import FinanceDataReader as fdr
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm

# Suppress every warning category for the whole session.
warnings.filterwarnings("ignore")

# Keyword arguments shared by all tqdm progress bars in this script.
TQDM_KW = {"ncols": 80, "leave": False}

# One seed for both the stdlib and the NumPy global random generators.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of rows used as the "3 month" horizon for returns and targets.
TRADING_DAYS_3M = 63

# Output files are written to the directory the script was started from.
CWD = os.getcwd()
OUTPUT_DIR = CWD

def load_tickers_from_csv(path: str) -> List[str]:
    """Load a de-duplicated list of ticker symbols from a CSV file.

    The ticker column is the first column whose name (ignoring case and
    surrounding whitespace) is ``ticker``, ``symbol`` or ``code``; when no
    column matches, the first column of the file is used.

    Cells are read as raw strings so symbols are never reinterpreted:
    numeric-looking codes keep their leading zeros, and symbols that collide
    with pandas' default NA markers (for example ``NA``) are preserved.
    Missing or blank cells are dropped instead of becoming the literal
    string ``"nan"``.

    Args:
        path: Location of the CSV file.

    Returns:
        Stripped ticker strings in first-appearance order, without duplicates.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        RuntimeError: If the CSV has no data rows.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"티커 CSV가 존재하지 않습니다: {path}")

    # dtype=str + keep_default_na=False: keep every cell exactly as written.
    df = pd.read_csv(path, dtype=str, keep_default_na=False)
    if df.empty:
        raise RuntimeError(f"티커 CSV가 비어 있습니다: {path}")

    named = [
        c for c in df.columns
        if str(c).strip().lower() in ("ticker", "symbol", "code")
    ]
    col = named[0] if named else df.columns[0]

    # Drop missing cells BEFORE converting to str, otherwise NaN would turn
    # into the bogus ticker "nan".
    symbols = df[col].dropna().astype(str).str.strip()
    tickers = symbols[symbols != ""].drop_duplicates().tolist()

    print(f"[tickers] {os.path.basename(path)} → {len(tickers)}개 로드")
    return tickers

def collect_nasdaq_meta() -> Tuple[pd.DataFrame, List[str]]:
    """Fetch the NASDAQ listing table together with its symbol list.

    Returns:
        A ``(meta_df, symbols)`` pair. ``meta_df`` is a copy of the frame
        returned by ``fdr.StockListing("NASDAQ")`` with an extra ``__key__``
        column holding the constant ``"Symbol"``. ``symbols`` contains the
        non-null ``Symbol`` values as whitespace-stripped strings.
    """
    listing = fdr.StockListing("NASDAQ")

    meta_df = listing.copy()
    meta_df["__key__"] = "Symbol"

    non_null_symbols = listing["Symbol"].dropna()
    symbols = list(non_null_symbols.astype(str).str.strip())
    return meta_df, symbols

def download_fdr(tickers: List[str], start: str, end: str) -> dict:
    """Download price history for each ticker via ``fdr.DataReader``.

    The download is best-effort: a ticker whose request or post-processing
    raises is skipped so one bad symbol cannot abort the batch, but every
    such failure is recorded and summarised once the loop finishes instead
    of being silently discarded. Tickers that return no rows are skipped.

    Args:
        tickers: Symbols to download.
        start: Start date passed through to ``fdr.DataReader``.
        end: End date passed through to ``fdr.DataReader``.

    Returns:
        Mapping ``ticker -> DataFrame``. Column names are lower-cased and
        each frame keeps whichever of ``open``, ``high``, ``low``, ``close``
        and ``volume`` exist, plus a ``ticker`` column.
    """
    price_cols = ("open", "high", "low", "close", "volume")
    data = {}
    failures = []

    for t in tqdm(tickers, desc=f"시세 다운로드 {start}~{end}", **TQDM_KW):
        try:
            df = fdr.DataReader(t, start, end)
            if not isinstance(df, pd.DataFrame) or len(df) == 0:
                continue
            df = df.rename(columns=str.lower)
            df["ticker"] = t
            keep = [c for c in price_cols if c in df.columns]
            data[t] = df[keep + ["ticker"]]
        except Exception as exc:
            # Deliberately broad: keep going, but remember what failed.
            failures.append(f"{t}({type(exc).__name__})")

    if failures:
        preview = ", ".join(failures[:5])
        print(f"- 다운로드 실패 종목 수: {len(failures)}개 (예: {preview})")
    return data

def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add trailing-return features and the forward target to ``df`` in place.

    New columns, all derived from ``df["close"]``:
        * ``ret_1d``, ``ret_5d``, ``ret_20d``, ``ret_63d``: percentage change
          over 1, 5, 20 and ``TRADING_DAYS_3M`` rows (no forward filling).
        * ``target_3m``: close ``TRADING_DAYS_3M`` rows ahead divided by the
          current close, minus one.

    Args:
        df: Frame with a ``close`` column; it is modified in place.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    close = df["close"]

    lookbacks = {
        "ret_1d": 1,
        "ret_5d": 5,
        "ret_20d": 20,
        "ret_63d": TRADING_DAYS_3M,
    }
    for column, periods in lookbacks.items():
        df[column] = close.pct_change(periods, fill_method=None)

    future_close = close.shift(-TRADING_DAYS_3M)
    df["target_3m"] = future_close / close - 1
    return df

def make_panel(data_dict: dict) -> pd.DataFrame:
    """Stack per-ticker price frames into one long panel with features.

    For every frame that has a ``close`` column, a copy is passed through
    ``add_features``, infinities are replaced by NaN, missing values in the
    feature/price columns are filled with 0 (``target_3m`` is left as is),
    and ``date`` (taken from the index) and ``ticker`` columns are set.

    Args:
        data_dict: Mapping ``ticker -> DataFrame``.

    Returns:
        The concatenated panel with a fresh integer index, or an empty
        ``DataFrame`` when no input frame has a ``close`` column.
    """
    feature_names = (
        "ret_1d", "ret_5d", "ret_20d", "ret_63d",
        "open", "high", "low", "close", "volume",
    )

    per_ticker = []
    for ticker, raw in data_dict.items():
        if "close" not in raw.columns:
            continue

        enriched = add_features(raw.copy())
        enriched = enriched.replace([np.inf, -np.inf], np.nan)

        present = [name for name in feature_names if name in enriched.columns]
        enriched[present] = enriched[present].fillna(0)

        enriched["date"] = enriched.index
        enriched["ticker"] = ticker
        per_ticker.append(enriched)

    if len(per_ticker) == 0:
        return pd.DataFrame()
    return pd.concat(per_ticker, ignore_index=True)

def find_anomalous_tickers(
    train_df: pd.DataFrame,
    *,
    max_abs_ret_1d: float = 5.0,
    max_price_ratio: float = 1000.0,
) -> List[str]:
    """Return tickers whose training rows look corrupted.

    A ticker is flagged as soon as one of these checks fails:
        1. any ``close`` is non-finite or not strictly positive;
        2. any ``volume`` is non-finite or negative (if the column exists);
        3. the largest absolute ``ret_1d`` exceeds ``max_abs_ret_1d``
           (if the column exists);
        4. ``max(close) / min(close)`` exceeds ``max_price_ratio``.

    Args:
        train_df: Panel with at least ``ticker`` and ``close`` columns.
        max_abs_ret_1d: Threshold for check 3; 5.0 means a 500% daily move.
        max_price_ratio: Threshold for check 4.

    Returns:
        Sorted list of flagged tickers; empty when nothing is flagged.
    """
    has_volume = "volume" in train_df.columns
    has_ret_1d = "ret_1d" in train_df.columns

    bad = []
    for ticker, grp in train_df.groupby("ticker"):
        closes = grp["close"].astype(float).to_numpy()
        if not np.isfinite(closes).all() or (closes <= 0).any():
            bad.append(ticker)
            continue

        if has_volume:
            vols = grp["volume"].astype(float).to_numpy()
            if not np.isfinite(vols).all() or (vols < 0).any():
                bad.append(ticker)
                continue

        # NaN returns are ignored by max(); an all-NaN column never flags.
        if has_ret_1d and grp["ret_1d"].abs().max() > max_abs_ret_1d:
            bad.append(ticker)
            continue

        # All closes are strictly positive here, so the division is safe.
        if closes.max() / closes.min() > max_price_ratio:
            bad.append(ticker)

    bad = sorted(set(bad))
    print(f"- 이상치 의심 종목 수: {len(bad)}개")
    return bad

def run_task_multi_year(
    train_start: str,
    train_end: str,
    years: List[int],
    universe_csv_template: str,
    output_dir: str,
    out_prefix: str = "TS_nasdaq",
    top_k: int = 100,
):
    """Train one Ridge model and write daily top-k picks for each year.

    Pipeline:
        0. Load the prediction universe of every year from
           ``universe_csv_template.format(year=y)``.
        1. Build the training universe from the NASDAQ listing, keeping only
           symbols listed on or before ``train_end`` when a listing-date
           column is available.
        2. Download prices for the union of both universes and build the
           feature panel.
        3. Fit ``StandardScaler`` + ``Ridge`` on rows between ``train_start``
           and ``train_end`` after removing anomalous tickers.
        4. For every year and trading day, rank that year's universe by the
           predicted 3-month return and keep the best ``top_k`` tickers.

    Only rows whose 3-month target is fully realised on or before
    ``train_end`` are used for training, so no price from the prediction
    period leaks into the labels.

    Args:
        train_start: First date (inclusive) of the training window.
        train_end: Last date (inclusive) of the training window.
        years: Calendar years to predict; must not be empty.
        universe_csv_template: Path template containing ``{year}``.
        output_dir: Directory for the CSV files (created if missing).
        out_prefix: Tag inserted into each output file name.
        top_k: Maximum number of tickers kept per day.

    Returns:
        None. One ``Baseline_submission_{out_prefix}_{year}.csv`` with the
        columns ``date``, ``rank`` and ``ticker`` is written per year that
        produced predictions.

    Raises:
        ValueError: If ``years`` is empty.
    """
    if not years:
        # Fail before the expensive download instead of at max(years).
        raise ValueError("years가 비어 있습니다")

    print(f"\n=== Multi-year 실행 시작 (years={years}) ===")
    train_start_ts = pd.to_datetime(train_start)
    train_end_ts = pd.to_datetime(train_end)

    # 0) Per-year prediction universes and their union.
    year_to_tickers = {}
    all_pred_tickers = set()
    for y in years:
        csv_path = universe_csv_template.format(year=y)
        tickers_y = load_tickers_from_csv(csv_path)
        year_to_tickers[y] = tickers_y
        all_pred_tickers.update(tickers_y)
        print(f"- {y} 유니버스: {len(tickers_y)}개 ({os.path.basename(csv_path)})")
    print(f"- 전체 예측 티커 union: {len(all_pred_tickers)}개")

    # 1) Training universe.
    meta_df, _ = collect_nasdaq_meta()
    list_col = next(
        (
            cand
            for cand in ["ListingDate", "ListedDate", "IPODate", "상장일", "상장일자"]
            if cand in meta_df.columns
        ),
        None,
    )

    if list_col is not None:
        # Drop missing symbols first so they never become the string "nan".
        mdf = meta_df[["Symbol", list_col]].dropna(subset=["Symbol"]).copy()
        mdf["__listdate__"] = pd.to_datetime(
            mdf[list_col].astype(str).str.replace(" ", "", regex=False),
            errors="coerce",
        )
        # Unparseable dates are NaT; NaT compares False and is excluded.
        listed = mdf.loc[mdf["__listdate__"] <= train_end_ts, "Symbol"]
    else:
        listed = meta_df["Symbol"].dropna()

    train_universe = sorted(set(listed.astype(str).str.strip()))
    print(f"- 학습용 나스닥 전체 티커 수: {len(train_universe)}개")

    # 2) Download and panel construction.
    total_tickers = sorted(set(train_universe) | all_pred_tickers)
    print(f"- FDR 다운로드 대상 총 티커 수: {len(total_tickers)}개")

    max_year = max(years)
    data_end = (
        pd.to_datetime(f"{max_year}-12-31") + pd.Timedelta(days=7)
    ).strftime("%Y-%m-%d")

    data = download_fdr(total_tickers, train_start, data_end)
    if not data:
        print("⚠️ 다운로드된 종목이 없습니다. 스킵")
        return

    panel = make_panel(data)
    if panel.empty:
        print("⚠️ 패널 생성 실패")
        return
    panel["date"] = pd.to_datetime(panel["date"])
    print(f"- 패널 크기: {panel.shape}")

    # 3) Training.
    FEAT_COLS = ["ret_1d", "ret_5d", "ret_20d", "ret_63d", "open", "high", "low", "close", "volume"]

    # target_3m of a row uses the close TRADING_DAYS_3M rows later within the
    # same ticker. Requiring that later row to fall inside the training
    # window keeps prices from after train_end out of the labels.
    label_date = panel.groupby("ticker")["date"].shift(-TRADING_DAYS_3M)
    train_mask = (
        (panel["date"] >= train_start_ts)
        & (label_date <= train_end_ts)
        & panel["target_3m"].notna()
    )
    train_df = panel.loc[train_mask].copy()
    print(f"- 이상치 제거 전 학습 데이터: {train_df.shape}")

    bad = find_anomalous_tickers(train_df)
    if bad:
        train_df = train_df[~train_df["ticker"].isin(bad)].copy()
        print(f"- 이상치 제거 후 학습 데이터: {train_df.shape} (제거 {len(bad)}개)")

    X_tr = train_df[FEAT_COLS]
    y_tr = train_df["target_3m"]
    valid = X_tr.notna().all(axis=1) & y_tr.notna()
    X_tr = X_tr[valid]
    y_tr = y_tr[valid]

    if X_tr.empty:
        # Fitting on zero rows would raise inside scikit-learn.
        print("⚠️ 학습 데이터가 비어 있습니다. 스킵")
        return

    model = Pipeline([("scaler", StandardScaler()), ("ridge", Ridge(alpha=1.0, random_state=SEED))])
    print("- 모델 학습 중...")
    model.fit(X_tr, y_tr)
    print("- 모델 학습 완료")

    # 4) Per-year prediction.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for y in years:
        in_year = panel["date"].between(
            pd.to_datetime(f"{y}-01-01"), pd.to_datetime(f"{y}-12-31")
        )
        if not in_year.any():
            print(f"⚠️ {y}: pred_days 비어 있음 → 스킵")
            continue

        # Filter once per year, then walk the days via groupby instead of
        # re-scanning the whole panel for every single date.
        year_df = panel[in_year & panel["ticker"].isin(year_to_tickers[y])]
        results = []

        daily = year_df.groupby("date", sort=True)
        for _, day_df in tqdm(daily, desc=f"{y} 일별 예측", **TQDM_KW):
            preds = model.predict(day_df[FEAT_COLS].fillna(0))
            k = min(top_k, len(day_df))
            top_idx = np.argsort(-preds)[:k]  # highest prediction first
            sel = day_df.iloc[top_idx]

            results.append(pd.DataFrame({
                "date": sel["date"].dt.strftime("%Y-%m-%d"),
                "rank": np.arange(1, k + 1, dtype=int),
                "ticker": sel["ticker"].astype(str),
            }))

        if results:
            out = pd.concat(results, ignore_index=True)[["date", "rank", "ticker"]]
            out_path = os.path.join(output_dir, f"Baseline_submission_{out_prefix}_{y}.csv")
            out.to_csv(out_path, index=False)
            print(f"✅ {y} 저장 완료 → {out_path} (rows={len(out)})")
        else:
            print(f"⚠️ {y}: 예측 결과 비어있음")

# Run: train on 2010-2019 and write one submission CSV per year 2020-2024.
UNIVERSE_TEMPLATE = "../main_round/data/universe/final_universe/{year}_final_universe.csv"

run_task_multi_year(
    universe_csv_template=UNIVERSE_TEMPLATE,
    years=list(range(2020, 2025)),
    train_start="2010-01-01",
    train_end="2019-12-31",
    out_prefix="TS_nasdaq",
    output_dir=OUTPUT_DIR,
)
