In [0]:
import json
import math
import os
import sys

import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient
from pyspark.sql import functions as F

# Make the project's src modules importable from this notebook.
# NOTE(review): the appended path ends in ".../src" while the imports below use
# the "src." package prefix — presumably the repository root is already on
# sys.path in this workspace; confirm.
sys.path.append("/Workspace/Users/1dt003@msacademy.msai.kr/team4-CICD/src")
from src.config_loader import load_config
from src.data_preprocessor import preprocess_dataframe
from src.model_serving import predict_with_model, predict_with_fallback

# ==============================
# Load config
# ==============================
config = load_config("/Workspace/Users/1dt003@msacademy.msai.kr/team4-CICD/configs/config_v2.json")

# Environment variables take precedence over config values. The `or` form
# short-circuits, so a config key is only looked up when the variable is unset
# or empty. This keeps JDBC_URL / BATCH_TABLE from raising KeyError on a
# missing config key when the environment already supplies the value, and it
# matches how BASE_MODEL_NAME is resolved.
BASE_MODEL_NAME = os.environ.get("BASE_MODEL_NAME") or config.get("model_name", "team4-pred-model")
JDBC_URL = os.environ.get("JDBC_URL") or config["jdbc_url"]
BATCH_TABLE = os.environ.get("BATCH_TABLE") or config["batch_table"]
# Target column name; taken from config only (no environment override).
TARGET = config.get("target", "realScore_clean")

client = MlflowClient()

# ==============================
# Load the latest MLflow model
# ==============================
# Only versions in the "None" stage are considered.
latest_versions = client.get_latest_versions(
    name=BASE_MODEL_NAME,
    stages=["None"],
)
if not latest_versions:
    raise ValueError(f"No versions found for {BASE_MODEL_NAME}")

# Version identifiers are converted to int so the comparison is numeric.
latest_version_num = max(int(candidate.version) for candidate in latest_versions)
model_uri = f"models:/{BASE_MODEL_NAME}/{latest_version_num}"
model = mlflow.sklearn.load_model(model_uri)

# ==============================
# Load sample data
# ==============================
connection_properties = config["connection_properties"]
df_batch = spark.read.jdbc(
    url=JDBC_URL,
    table=BATCH_TABLE,
    properties=connection_properties,
)

# ==============================
# Column preprocessing
# ==============================
# When a required column is absent but its "<name>_mean" variant exists,
# rename the variant to the required name.
required_cols = ["difficultyLevel", "guessLevel", "discriminationLevel"]
for base_name in required_cols:
    mean_name = f"{base_name}_mean"
    if base_name in df_batch.columns or mean_name not in df_batch.columns:
        continue
    df_batch = df_batch.withColumnRenamed(mean_name, base_name)

# Derive correctRate from the raw counts when it is not provided, then expose
# the same values under the name "accuracy".
if "correctRate" not in df_batch.columns:
    correct_ratio = F.col("correct_cnt") / F.col("items_attempted")
    df_batch = df_batch.withColumn("correctRate", correct_ratio.cast("double"))
df_batch = df_batch.withColumn("accuracy", F.col("correctRate"))

# Fill in constant defaults for required columns that are missing.
default_values = {
    "age": 0,
    "subjectName": "unknown",
    "itemType": "unknown",
    "answerTime_avg": 0.0,
    "theta_clean": 0.0,
}
for field_name, fill_value in default_values.items():
    if field_name in df_batch.columns:
        continue
    df_batch = df_batch.withColumn(field_name, F.lit(fill_value))

# Collect the Spark frame into pandas for the steps that follow.
df_pandas = df_batch.toPandas()

# Run the project's "v2" preprocessing on the pandas frame. This step is
# best-effort: on failure the unprocessed frame is used as-is and the
# categorical column list falls back to the config value (or a built-in list).
try:
    df_processed, categorical_cols = preprocess_dataframe(
        df_pandas,
        categorical_candidates=config.get("categorical_candidates", ["gender", "grade"]),
        version="v2",
    )
except Exception as e:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report the cause
    # instead of silently swallowing it.
    print(f"Preprocessing failed, falling back to raw dataframe: {e}")
    df_processed = df_pandas
    categorical_cols = config.get("categorical_candidates", ["gender", "grade", "subjectName", "itemType"])

# ==============================
# Model prediction + RMSE computation
# ==============================
# Defaults describe the "do not deploy" outcome; they are only overwritten
# when prediction and scoring both succeed.
deploy_decision = False
rmse_estimate = None
try:
    # Every column except the configured exclusions and the target is a
    # feature. The exclusion list is built once rather than once per column.
    excluded_cols = config.get("exclude_cols", []) + [TARGET]
    feature_cols = [c for c in df_processed.columns if c not in excluded_cols]

    # NOTE(review): the same model object is passed as both primary and
    # fallback, so the fallback presumably cannot recover from a failure of
    # the primary — confirm this is intended.
    preds = predict_with_fallback(
        primary_model=model,
        fallback_model=model,
        df=df_processed,
        feature_cols=feature_cols
    )

    # Root-mean-square error of the predictions against the target column.
    rmse_estimate = ((preds - df_processed[TARGET]) ** 2).mean() ** 0.5

    # The environment override wins; an unset or empty variable falls back to
    # the config value instead of failing on float(""). With the 1e12 default
    # the gate is effectively open when nothing is configured.
    threshold_rmse = float(
        os.environ.get("STAGING_RMSE_THRESHOLD") or config.get("staging_rmse_threshold", 1e12)
    )

    deploy_decision = rmse_estimate <= threshold_rmse
    print(f"Staging RMSE: {rmse_estimate:.4f}, deploy_decision: {deploy_decision}")
except Exception as e:
    # Any failure in this section blocks deployment but still lets the
    # notebook reach its exit call and report the outcome.
    print(f"Prediction failed: {e}")
    deploy_decision = False

# ==============================
# Notebook exit -> hand the result to the CI/CD workflow
# ==============================
# json.dumps renders NaN/Infinity as bare tokens that are not valid JSON, so a
# non-finite RMSE is reported as null, the same as "no RMSE available".
rmse_output = None
if rmse_estimate is not None:
    rmse_output = float(rmse_estimate)
    if not math.isfinite(rmse_output):
        rmse_output = None

dbutils.notebook.exit(json.dumps({
    "deploy": bool(deploy_decision),
    "model_version": int(latest_version_num),
    "rmse_estimate": rmse_output,
}))