In [0]:
import os
import json
import numpy as np
import pandas as pd
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt

## 데이터 처리

In [0]:
# Batch data
# The SQL Server password is read from the environment so that no secret is
# stored in the notebook source.
# NOTE(review): the password that used to be hard-coded here should be treated
# as leaked and rotated. On Databricks a secret scope is another option --
# confirm which mechanism the team uses.
jdbc_url = "jdbc:sqlserver://1dt-team4-sqlserver.database.windows.net:1433;database=1dt-team4-sqldb"

_sql_password = os.environ.get("TEAM4_SQL_PASSWORD")
if not _sql_password:
    # Fail early with an explicit message instead of an opaque JDBC login error.
    raise RuntimeError(
        "TEAM4_SQL_PASSWORD is not set; export the SQL Server password as an "
        "environment variable before running this notebook."
    )

# Shared JDBC properties; the cell that loads the recent data reuses them.
connection_properties = {
    "user": os.environ.get("TEAM4_SQL_USER", "azureuser"),
    "password": _sql_password,
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}

# Read the batch table over JDBC and show its first row as the cell output.
df_batch = spark.read.jdbc(url=jdbc_url, table="gold.gold_realscore_pred", properties=connection_properties)
df_batch.head()

In [0]:
# Additional (recent) data
# Read the real-time table over JDBC and rename its score column so it matches
# the 'realScore_clean' name used as the training target later on.
df_recent = spark.read.jdbc(
    url=jdbc_url,
    table="gold.realtime_pred_realscore",
    properties=connection_properties,
).withColumnRenamed('pred_realScore_clean', 'realScore_clean')

# Remove these columns when the table has them; names that are absent are skipped.
cols_to_drop = ['percent_rank', 'grade_percentile_calc']
present = [name for name in cols_to_drop if name in df_recent.columns]
if present:
    df_recent = df_recent.drop(*present)

# head() is not the last expression of the cell, so only count() is displayed.
df_recent.head()
df_recent.count()

In [0]:
# Stack the batch rows and the recent rows, aligning columns by name.
# allowMissingColumns=True lets the two frames have different column sets.
df_merge = df_batch.unionByName(
    df_recent,
    allowMissingColumns=True,
)
df_merge.head()

## 데이터 전처리

In [0]:
# -----------------------
# Preprocessing functions
# -----------------------
def drop_mean_columns(df):
    """Return a copy of ``df`` without the columns whose name ends in '_mean'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string column labels.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is left untouched.
    """
    mean_like = list(filter(lambda label: label.endswith('_mean'), df.columns))
    return df.drop(columns=mean_like, errors='ignore')

def preprocess_v2(df_spark):
    """Convert a Spark DataFrame to pandas and prepare it for modelling.

    Steps, in order:
      1. Collect the Spark frame into pandas with ``toPandas()``.
      2. Add an 'accuracy' column: ``correct_cnt / items_attempted``, with 0
         where the ratio is undefined (zero attempts) and 0 everywhere when
         either source column is missing.
      3. Replace every remaining missing value with 0.
      4. Drop the columns whose name ends in '_mean'.

    Parameters
    ----------
    df_spark : object exposing ``toPandas()``
        The Spark DataFrame to preprocess.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The preprocessed frame and the names among 'gender' / 'grade' that are
        present in it.
    """
    # Spark -> pandas
    pdf = df_spark.toPandas()

    # Accuracy column
    count_cols = ('correct_cnt', 'items_attempted')
    if not all(name in pdf.columns for name in count_cols):
        pdf['accuracy'] = 0
    else:
        # A zero attempt count becomes NaN so the division cannot yield inf.
        attempts = pdf['items_attempted'].replace(0, np.nan)
        pdf['accuracy'] = (pdf['correct_cnt'] / attempts).fillna(0)

    # Fill missing values with 0, then remove the '_mean' columns.
    pdf = drop_mean_columns(pdf.fillna(0))

    # Categorical columns: the fixed candidates that exist in the frame.
    categorical_cols = [name for name in ('gender', 'grade') if name in pdf.columns]

    return pdf, categorical_cols

# -----------------------
# Usage example
# -----------------------
# Preprocess the merged Spark frame; keeps the resulting pandas frame and the
# list of categorical column names for the cells below.
df_v2, categorical_cols = preprocess_v2(df_merge)

## train-test-split

In [0]:
# Column the models predict.
TARGET = "realScore_clean"
# Columns kept out of the feature matrix.
EXCLUDE_COLS = ['learnerID', 'testID', 'correct_cnt', 'items_attempted']
RANDOM_STATE = 42

# -----------------------
# Feature / target separation
# -----------------------
# Every column that is neither the target nor explicitly excluded is a feature.
non_features = {TARGET, *EXCLUDE_COLS}
feature_cols = [col for col in df_v2.columns if col not in non_features]
X, y = df_v2[feature_cols], df_v2[TARGET]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print("Train:", X_train.shape, "Test:", X_test.shape)
print("Categorical columns:", categorical_cols)
print("Feature columns:", feature_cols[:10], "...")

## 전처리 파이프라인

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt
from sklearn.model_selection import GridSearchCV

# Every feature that is not listed as categorical goes through the numeric branch.
numeric_cols = [col for col in X_train.columns if col not in categorical_cols]

# Numeric branch: fill missing values with the constant 0.
num_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(fill_value=0, strategy='constant'))]
)

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
cat_steps = [
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each column group to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_cols),
        ('cat', cat_pipeline, categorical_cols),
    ]
)

## 모델 학습

In [0]:
# -----------------------
# Candidate models
# -----------------------
# Each candidate is the shared preprocessor followed by one regressor.
estimators = {
    'dt': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'rf': RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1),
}
pipelines = {
    key: Pipeline(steps=[('preproc', preprocessor), ('model', regressor)])
    for key, regressor in estimators.items()
}

# Hyper-parameter grids, keyed like `pipelines`.
param_grids = {
    'dt': {
        'model__max_depth': [3, 5, 7],
    },
    'rf': {
        'model__n_estimators': [50, 100],
        'model__max_depth': [5, 10, 7],
    },
}

# -----------------------
# Training & evaluation
# -----------------------
# For every candidate: 3-fold grid search scored by negative MSE on the training
# split, then RMSE / MAE / R2 of the refitted best estimator on the test split.
results = {}
for name, pipe in pipelines.items():
    print(f"\nTraining {name} ...")
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    gs.fit(X_train, y_train)

    best = gs.best_estimator_
    preds = best.predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    results[name] = dict(
        best_model=best,
        best_params=gs.best_params_,
        rmse=rmse,
        mae=mae,
        r2=r2,
    )
    print(f"{name} best params: {gs.best_params_}")
    print(f"→ RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}")

## 모델 선택

In [0]:
# -----------------------
# Best-model selection
# -----------------------
# The candidate with the lowest test RMSE wins.
best_name = min(results, key=lambda model_key: results[model_key]['rmse'])
best_model = results[best_name]['best_model']
print("\n선택된 모델:", best_name, results[best_name])

## MLflow 실험 기록 & 모델 저장

In [0]:
# -----------------------
# MLflow experiment setup
# -----------------------
EXPERIMENT_NAME = "/Users/1dt003@msacademy.msai.kr/team4_pred_experiment"
MODEL_NAME = "team4-pred-models"
mlflow.set_experiment(EXPERIMENT_NAME)

# -----------------------
# Start the MLflow run
# -----------------------
with mlflow.start_run(run_name=f"real-score-v2-{best_name}") as run:
    run_id = run.info.run_id
    selected = results[best_name]
    metric_keys = ('rmse', 'mae', 'r2')

    # Record which candidate was selected.
    mlflow.log_param("selected_model", best_name)

    # Record the selected candidate's hyper-parameters.
    for param_name, param_value in selected['best_params'].items():
        mlflow.log_param(param_name, param_value)

    # Record every candidate's metrics, prefixed with the candidate key.
    for candidate, scores in results.items():
        for metric_key in metric_keys:
            mlflow.log_metric(f"{candidate}_{metric_key}", scores[metric_key])

    # Record the selected candidate's metrics under the plain metric names.
    for metric_key in metric_keys:
        mlflow.log_metric(metric_key, selected[metric_key])

    # -----------------------
    # Log the model and register it under MODEL_NAME
    # -----------------------
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        registered_model_name=MODEL_NAME,
    )

    print("✅ v2 모델 MLflow에 등록 완료 (자동 버전 관리)")