In [0]:
import os

import numpy as np
import pandas as pd

## 데이터 로드

In [0]:
# JDBC endpoint of the Azure SQL database that is queried below.
jdbc_url = "jdbc:sqlserver://1dt-team4-sqlserver.database.windows.net:1433;database=1dt-team4-sqldb"

# Credentials come from the environment so that no secret is stored in the
# notebook or its version history. SQLSERVER_PASSWORD is required: a missing
# variable raises KeyError before any connection attempt is made.
# NOTE(review): the password that used to be hard-coded here should be rotated.
connection_properties = {
    "user": os.environ.get("SQLSERVER_USER", "azureuser"),
    "password": os.environ["SQLSERVER_PASSWORD"],
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# Additional (recent) data
df_recent = spark.read.jdbc(url=jdbc_url, table="gold.realtime_pred_realscore", properties=connection_properties)

# Expose pred_realScore_clean under the name realScore_clean, which is the
# TARGET column name used by the evaluation cell further down.
df_recent = df_recent.withColumnRenamed('pred_realScore_clean', 'realScore_clean')

# Spark's DataFrame.drop ignores names that are not in the schema, so the
# columns can be dropped in one call without checking for their presence.
cols_to_drop = ['percent_rank', 'grade_percentile_calc']
df_recent = df_recent.drop(*cols_to_drop)

df_recent.head()

In [0]:
# -----------------------
# Preprocessing function
# -----------------------
def preprocess_v2(df_spark):
    """Convert a Spark DataFrame to pandas and prepare it for model scoring.

    Steps, in order:
      1. Materialise ``df_spark`` as a pandas DataFrame via ``toPandas()``.
      2. Add an ``accuracy`` column: ``correct_cnt / items_attempted`` when both
         columns exist (rows with zero attempts get 0), otherwise a constant 0.
      3. Replace every remaining missing value in the frame with 0.
      4. Report which of the candidate categorical columns (``gender``,
         ``grade``) are present.

    Args:
        df_spark: Object exposing ``toPandas()`` that returns a pandas DataFrame.

    Returns:
        Tuple ``(frame, categorical_cols)``: the processed pandas DataFrame and
        the list of categorical column names found in it.
    """
    frame = df_spark.toPandas()

    # Accuracy can only be derived when both count columns are available.
    has_counts = {'correct_cnt', 'items_attempted'}.issubset(frame.columns)
    if not has_counts:
        frame['accuracy'] = 0
    else:
        # Zero attempts become NaN so the division yields NaN instead of
        # inf / ZeroDivision; those rows are then set to 0.
        attempts = frame['items_attempted'].replace(0, np.nan)
        frame['accuracy'] = (frame['correct_cnt'] / attempts).fillna(0)

    # Fill all remaining missing values with 0.
    frame = frame.fillna(0)

    # Keep only the categorical candidates that actually exist in the frame.
    categorical_cols = []
    for candidate in ('gender', 'grade'):
        if candidate in frame.columns:
            categorical_cols.append(candidate)

    return frame, categorical_cols

# -----------------------
# Example usage
# -----------------------
# preprocess_v2 calls .toPandas() on its argument, and this cell rebinds
# df_recent to the pandas result. Without the guard, running the cell a second
# time would fail because a pandas DataFrame has no .toPandas(). When
# df_recent is already pandas, the conversion is skipped and the previously
# computed categorical_cols is kept.
if not isinstance(df_recent, pd.DataFrame):
    df_recent, categorical_cols = preprocess_v2(df_recent)

## 모델 로드

In [0]:
from mlflow.tracking import MlflowClient
import mlflow.sklearn

# Experiment path and registered-model name used by this notebook.
EXPERIMENT_NAME = "/Users/1dt003@msacademy.msai.kr/real_score_experiment"
MODEL_NAME = "real-score-model"

# Client used later for the model-version stage transition.
client = MlflowClient()

# To be changed later: fetch the latest v1 / v2 versions instead of pinning
# them here.
v1_version, v2_version = 6, 5

# Model URIs of the form models:/<name>/<version>.
model_v1_uri = "models:/{}/{}".format(MODEL_NAME, v1_version)
model_v2_uri = "models:/{}/{}".format(MODEL_NAME, v2_version)

# Load both versions (v1 first, then v2) as scikit-learn models.
model_v1, model_v2 = (
    mlflow.sklearn.load_model(uri) for uri in (model_v1_uri, model_v2_uri)
)


## 예측

실제 데이터

In [0]:
# Split the preprocessed frame into the target column and the feature matrix
# (every column except the target).
TARGET = "realScore_clean"
y_eval = df_recent[TARGET]
X_eval = df_recent.drop(columns=[TARGET])

# Score the same feature matrix with both model versions (v1 first, then v2).
pred_v1, pred_v2 = (model.predict(X_eval) for model in (model_v1, model_v2))

## 모델 성능 비교2

In [0]:
import matplotlib.pyplot as plt
# Data: RMSE of each model version, computed from y_eval and the predictions
# produced by the prediction cell, so the chart shows measured values rather
# than hard-coded placeholder numbers.
v_labels = ['v1', 'v2']
metrics = {
    label: {
        'rmse': float(np.sqrt(np.mean(
            (np.asarray(y_eval, dtype=float).ravel()
             - np.asarray(pred, dtype=float).ravel()) ** 2
        )))
    }
    for label, pred in zip(v_labels, [pred_v1, pred_v2])
}
rmse_values = [metrics['v1']['rmse'], metrics['v2']['rmse']]

# Bar colours (blue, orange)
colors = ['blue', 'orange']

plt.bar(v_labels, rmse_values, color=colors)
plt.title("Compare RMSE")
plt.ylabel("RMSE")
plt.show()


In [0]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Evaluate each model version against y_eval and collect RMSE, MAE and R^2
# under the version label; each result is printed as soon as it is computed.
metrics = {}
for name, pred in (('v1', pred_v1), ('v2', pred_v2)):
    metrics[name] = dict(
        rmse=np.sqrt(mean_squared_error(y_eval, pred)),
        mae=mean_absolute_error(y_eval, pred),
        r2=r2_score(y_eval, pred),
    )
    print(name, metrics[name])


## 시각화

In [0]:
import matplotlib.pyplot as plt

# Data: one bar per model version, with the bar height taken from the RMSE
# stored in the metrics dict.
v_labels = ['v1', 'v2']
rmse_values = [metrics[label]['rmse'] for label in v_labels]

# Bar colours (blue, yellow)
colors = ['blue', 'yellow']

plt.bar(x=v_labels, height=rmse_values, color=colors)
plt.title(label="Compare RMSE")
plt.show()


## 성능 비교 후 Production 결정

In [0]:
# Decide which version becomes Production: v2 only when its RMSE is strictly
# lower than v1's. A tie (or a NaN RMSE, for which the comparison is False)
# falls through to the else branch and keeps v1.
if metrics['v2']['rmse'] < metrics['v1']['rmse']:
    print("v2 모델 성능 개선 → Production 승격")
    production_version = v2_version
else:
    print("v2 모델 성능 미흡 → Rollback (v1 유지)")
    production_version = v1_version

# archive_existing_versions=True moves any other version currently in the
# Production stage to "Archived" (not to Staging).
# NOTE(review): model-registry stages are deprecated in recent MLflow releases
# in favour of aliases — confirm the installed version and consider migrating.
client.transition_model_version_stage(
    name=MODEL_NAME,
    version=production_version,
    stage="Production",
    archive_existing_versions=True,
)