## 7. Retrain Model

### 7.1 Setup

In [1]:
from typing import Any, Dict, List, Callable
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
import joblib
import warnings
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import catboost as cb
from catboost import CatBoostRegressor
warnings.filterwarnings("ignore")

### 7.2 Build the CatBoost parameters used for retraining

In [2]:
def create_catboost_model():
    """
    Build the CatBoost parameter dict used when retraining.

    Loads the tuning artefact from disk, copies the parameters stored under
    its 'best_params' key and overlays the fixed training settings below
    (MultiRMSE loss, fixed seed, iteration-based overfitting detector,
    best-model selection, logging every 100 iterations).

    Despite the name, this returns the parameter dict, not a fitted or
    unfitted model object.
    """
    # Settings that always take precedence over the tuned values.
    fixed_settings = {
        "loss_function": 'MultiRMSE',
        "random_seed": 42,
        "thread_count": -1,
        "od_type": "Iter",
        "od_wait": 50,
        "use_best_model": True,
        "verbose": 100
    }

    tuning_artifact = joblib.load('../models/daily/BEST_CATBOOST_TUNED_DAILY.joblib')

    # Work on a copy so the loaded artefact is left untouched.
    final_params = tuning_artifact['best_params'].copy()
    final_params.update(fixed_settings)
    return final_params

### 7.3 Create Auto retraining object

In [3]:
class AutoRetraining:
    """Monitor a deployed multi-horizon regressor and retrain it when needed.

    The object keeps a rolling buffer of features/targets, logs one
    performance record per call to ``forecast_daily``, compares recent
    errors against per-horizon baseline scores, checks a few features for
    distribution drift and, on request, refits a CatBoost model on the most
    recent window of buffered data.

    Targets are matched to ``target_labels`` purely by column position, both
    for ``y_actual`` and for the model output.
    """

    # --- decision constants (values taken from the original inline literals) ---
    MIN_HISTORY_DAYS = 7           # history needed before any retrain decision
    COOLDOWN_DAYS = 3              # minimum days between two retrains
    HEALTH_WINDOW_DAYS = 5         # days averaged by assess_model_health
    SUSTAINED_DAYS = 3             # consecutive days for the sustained trigger
    SUSTAINED_RMSE_FACTOR = 1.18   # RMSE must exceed baseline * this factor
    EMERGENCY_RMSE_RISE = 0.20     # mean RMSE rise that forces a retrain
    DRIFT_WINDOW_DAYS = 7          # recent days compared against the buffer
    MIN_DRIFT_FEATURES = 2         # drifting features needed to trigger
    MIN_RETRAIN_SAMPLES = 30       # buffered rows needed to retrain
    DRIFT_FEATURES = ('temp', 'humidity', 'windspeed', 'pressure', 'rain')

    def __init__(self,
                 model_creator: Callable[[], Any],
                 initial_baseline: Dict[str, Dict[str, float]],
                 horizons: List[int] = [1, 2, 3, 4, 5],
                 window_size: int = 90,
                 max_archive: int = 180,
                 max_idle_days: int = 60,
                 baseline_update_freq: int = 30):
        """Create the monitor.

        Args:
            model_creator: zero-argument callable returning the keyword
                arguments passed to ``CatBoostRegressor`` on retrain.
            initial_baseline: ``{label: {'RMSE': ..., 'R2': ...}}`` for every
                label ``T+<h>``.
            horizons: forecast horizons; labels are ``T+<h>`` in this order.
            window_size: number of most recent buffered rows used to retrain.
            max_archive: maximum number of rows kept in the buffers.
            max_idle_days: retrain is triggered once this many forecast days
                have passed without a retrain.
            baseline_update_freq: minimum number of logged days before the
                baseline may be refreshed (on the first day of a month).
        """
        self.model_creator = model_creator
        # Copy so that neither the shared default list nor the caller's list
        # can be mutated through this instance.
        self.horizons = list(horizons)
        self.target_labels = [f"T+{h}" for h in self.horizons]

        self.feature_buffer = pd.DataFrame()
        self.target_buffer = pd.DataFrame()
        self.daily_predictions = []
        self.performance_history = []
        self.retrain_events = []

        self.psi_limit = 0.15
        self.ks_alpha = 0.05
        self.rmse_rise_limit = 0.20
        self.r2_drop_limit = 0.03

        self.window_size = window_size
        self.max_archive = max_archive
        self.max_idle_days = max_idle_days
        self.baseline_update_freq = baseline_update_freq
        self.days_without_retrain = 0

        self.live_model = None
        self.baseline_scores = initial_baseline.copy()

        print("AutoRetraining FULL SYSTEM INITIALIZED")

    # ===================================================================
    # 1. DEPLOY MODEL
    # ===================================================================
    def deploy_model(self, model: Any, X_train: pd.DataFrame, y_train: pd.DataFrame):
        """Install ``model`` as the live model and seed the buffers.

        The buffers are replaced by copies of the training data and the idle
        counter is reset. Logged predictions and history are left untouched.
        """
        self.live_model = model
        self.feature_buffer = X_train.copy()
        self.target_buffer = y_train.copy()
        self.days_without_retrain = 0

        print("MODEL DEPLOYED!")
        print(f"   • Training samples: {len(X_train):,}")
        print(f"   • Horizons: {', '.join(self.target_labels)}")

    # ===================================================================
    # 2. DAILY FORECAST + LOG + REPORT
    # ===================================================================
    def forecast_daily(self, X_input: pd.DataFrame, y_actual: pd.DataFrame, date: datetime) -> Dict:
        """Predict for ``X_input``, score against ``y_actual`` and log it.

        Column ``i`` of both ``y_actual`` and the prediction is scored under
        ``target_labels[i]``. The inputs are appended to the rolling buffers,
        the baseline is refreshed on the first day of a month once enough
        history exists, alerts are printed and the RMSE report is shown.

        NOTE(review): for a one-row batch R² is not well defined;
        scikit-learn is expected to return NaN in that case — confirm.

        Returns:
            The record appended to ``daily_predictions``.

        Raises:
            RuntimeError: if no model has been deployed.
        """
        if self.live_model is None:
            raise RuntimeError("No model deployed! Use deploy_model() first.")

        y_pred = np.asarray(self.live_model.predict(X_input))
        if y_pred.ndim == 1:
            # Normalise to (rows, targets) so column indexing below works.
            y_pred = y_pred.reshape(len(X_input), -1)

        horizon_scores = {}
        for i, label in enumerate(self.target_labels):
            true = y_actual.iloc[:, i].values
            pred = y_pred[:, i]
            horizon_scores[label] = {
                'RMSE': float(np.sqrt(mean_squared_error(true, pred))),
                'MAE': float(mean_absolute_error(true, pred)),
                'R2': float(r2_score(true, pred))
            }

        overall_rmse = np.sqrt(mean_squared_error(y_actual, y_pred, multioutput='uniform_average'))
        overall_r2 = r2_score(y_actual, y_pred, multioutput='uniform_average')

        record = {
            'date': date,
            'input': X_input.copy(),
            'actual': y_actual.copy(),
            'prediction': y_pred,
            'scores': horizon_scores,
            'overall': {'RMSE': overall_rmse, 'R2': overall_r2}
        }
        self.daily_predictions.append(record)
        self.performance_history.append({
            'date': date,
            'scores': horizon_scores,
            'overall_R2': overall_r2
        })

        self.feature_buffer = pd.concat([self.feature_buffer, X_input], ignore_index=True)
        self.target_buffer = pd.concat([self.target_buffer, y_actual], ignore_index=True)
        self._limit_buffer_size()
        self.days_without_retrain += 1

        if date.day == 1 and len(self.performance_history) >= self.baseline_update_freq:
            self.update_smart_baseline()

        self._raise_degradation_alerts(horizon_scores, date)
        self.print_daily_rmse()

        return record

    # ===================================================================
    # 3. DRIFT DETECTION
    # ===================================================================
    def check_distribution_shift(self, features: List[str]) -> Dict:
        """Compare the last 7 logged inputs with the feature buffer.

        A feature counts as drifting when its PSI exceeds ``psi_limit`` or
        the two-sample KS p-value is below ``ks_alpha``. Features missing
        from either side, or with too few non-null values, are skipped.

        Returns:
            ``{'shift': bool, 'details': {feature: {...}}, 'count': int}``.
        """
        if len(self.daily_predictions) < self.DRIFT_WINDOW_DAYS:
            return {'shift': False, 'details': {}, 'count': 0}

        recent = pd.concat(
            [p['input'] for p in self.daily_predictions[-self.DRIFT_WINDOW_DAYS:]],
            ignore_index=True
        )
        report = {}
        drift_count = 0

        for col in features:
            if col not in self.feature_buffer.columns or col not in recent.columns:
                continue
            ref = self.feature_buffer[col].dropna().values
            curr = recent[col].dropna().values
            if len(ref) < 10 or len(curr) < 5:
                continue

            psi = self._calc_psi(ref, curr)
            _, p = ks_2samp(ref, curr)
            drift = bool((psi > self.psi_limit) or (p < self.ks_alpha))
            if drift:
                drift_count += 1

            report[col] = {'PSI': round(psi, 4), 'KS_p': round(p, 4), 'drift': drift}

        return {'shift': drift_count > 0, 'details': report, 'count': drift_count}

    def _calc_psi(self, ref: np.ndarray, curr: np.ndarray, bins: int = 10) -> float:
        """Population Stability Index of ``curr`` relative to ``ref``.

        Bins are equal-width over the range of ``ref``. PSI is computed on
        bin proportions (not densities), so the result does not depend on
        the unit or scale of the feature. Current values outside the
        reference range are clipped into the outermost bins instead of being
        dropped.

        Returns 1.0 (treated as drift by the caller's default limit) when
        the inputs cannot be interpreted as numbers or either side has no
        finite value.

        NOTE: with very few current samples many bins are empty, which
        inflates PSI; interpret small-sample values with care.
        """
        try:
            ref = np.asarray(ref, dtype=float)
            curr = np.asarray(curr, dtype=float)
        except (TypeError, ValueError):
            return 1.0

        ref = ref[np.isfinite(ref)]
        curr = curr[np.isfinite(curr)]
        if ref.size == 0 or curr.size == 0:
            return 1.0

        ref_counts, edges = np.histogram(ref, bins=bins)
        curr_counts, _ = np.histogram(np.clip(curr, edges[0], edges[-1]), bins=edges)

        eps = 1e-8  # keeps the logarithm finite for empty bins
        ref_share = ref_counts / ref_counts.sum() + eps
        curr_share = curr_counts / curr_counts.sum() + eps
        return float(np.sum((curr_share - ref_share) * np.log(curr_share / ref_share)))

    # ===================================================================
    # 4. HEALTH CHECK
    # ===================================================================
    @staticmethod
    def _relative_rise(value: float, base: float) -> float:
        """Return ``(value - base) / base`` without dividing by zero."""
        if base > 0:
            return (value - base) / base
        return float('inf') if value > 0 else 0.0

    @staticmethod
    def _finite_mean(values) -> float:
        """Mean of the finite entries of ``values`` (NaN when there are none)."""
        arr = np.asarray(values, dtype=float)
        arr = arr[np.isfinite(arr)]
        return float(arr.mean()) if arr.size else float('nan')

    def _mean_rmse_rise(self, label: str, logs: List[Dict]) -> float:
        """Relative rise of the mean RMSE over ``logs`` versus the baseline."""
        base_rmse = self.baseline_scores[label]['RMSE']
        mean_rmse = float(np.mean([log['scores'][label]['RMSE'] for log in logs]))
        return self._relative_rise(mean_rmse, base_rmse)

    def assess_model_health(self) -> Dict:
        """Compare the last 5 logged days with the baseline.

        An alert is raised per label when the mean RMSE rose by more than
        ``rmse_rise_limit`` or the mean R² dropped by more than
        ``r2_drop_limit``. Non-finite R² values are ignored in the mean.

        Returns:
            ``{'status': 'healthy' | 'degraded', 'alerts': [str, ...]}``.
        """
        if len(self.performance_history) < self.HEALTH_WINDOW_DAYS:
            return {'status': 'healthy', 'alerts': []}

        recent = self.performance_history[-self.HEALTH_WINDOW_DAYS:]
        alerts = []

        for label in self.target_labels:
            rise = self._mean_rmse_rise(label, recent)
            if rise > self.rmse_rise_limit:
                alerts.append(f"{label}: RMSE up {rise:.1%}")

            base_r2 = self.baseline_scores[label]['R2']
            recent_r2 = self._finite_mean([r['scores'][label]['R2'] for r in recent])
            # A NaN mean makes the comparison False, i.e. no R² alert.
            if (base_r2 - recent_r2) > self.r2_drop_limit:
                alerts.append(f"{label}: R² down {base_r2 - recent_r2:.3f}")

        return {'status': 'degraded' if alerts else 'healthy', 'alerts': alerts}

    # ===================================================================
    # 5. RETRAIN DECISION
    # ===================================================================
    def should_retrain(self) -> tuple[bool, str]:
        """Decide whether the live model should be retrained now.

        Order of checks:
          1. at least 7 logged days, and at least 3 days since last retrain;
          2. emergency: 5-day mean RMSE of any label above both
             ``rmse_rise_limit`` and 20 % over baseline (returns at once);
          3. sustained: RMSE of a label above baseline * 1.18 on each of the
             last 3 days;
          4. drift: at least 2 of the monitored features drifting;
          5. idle: ``max_idle_days`` forecast days without a retrain.

        Returns:
            ``(decision, reason)``; the reason joins all triggers with "; ".
        """
        triggers = []

        if len(self.performance_history) < self.MIN_HISTORY_DAYS:
            return False, f"chưa đủ {self.MIN_HISTORY_DAYS} ngày dữ liệu"

        if self.days_without_retrain < self.COOLDOWN_DAYS:
            return False, f"chờ thêm {self.COOLDOWN_DAYS - self.days_without_retrain} ngày"

        # Emergency: decided on the numeric rise itself rather than by
        # parsing a number back out of a formatted alert string.
        health_window = self.performance_history[-self.HEALTH_WINDOW_DAYS:]
        for label in self.target_labels:
            rise = self._mean_rmse_rise(label, health_window)
            if rise > self.rmse_rise_limit and rise >= self.EMERGENCY_RMSE_RISE:
                return True, f"KHẨN CẤP: {label}: RMSE up {rise:.1%}"

        # Sustained degradation over the last 3 days.
        recent_3 = self.performance_history[-self.SUSTAINED_DAYS:]
        for label in self.target_labels:
            base_rmse = self.baseline_scores[label]['RMSE']
            rmses_3days = [log['scores'][label]['RMSE'] for log in recent_3]
            if all(rmse > base_rmse * self.SUSTAINED_RMSE_FACTOR for rmse in rmses_3days):
                avg_rise = np.mean([self._relative_rise(rmse, base_rmse) for rmse in rmses_3days])
                triggers.append(f"{label} up {avg_rise:.1%} (3 ngày)")

        # Drift on at least 2 monitored features.
        available_features = [f for f in self.DRIFT_FEATURES
                              if f in self.feature_buffer.columns]
        if len(available_features) >= self.MIN_DRIFT_FEATURES:
            drift = self.check_distribution_shift(available_features)
            if drift['shift'] and drift['count'] >= self.MIN_DRIFT_FEATURES:
                triggers.append(f"drift mạnh ({drift['count']} features)")

        # Idle limit: previously configured but never enforced.
        if self.days_without_retrain >= self.max_idle_days:
            triggers.append(
                f"đã {self.days_without_retrain} ngày chưa retrain "
                f"(giới hạn {self.max_idle_days})"
            )

        if triggers:
            return True, "; ".join(triggers)
        return False, "model ổn định"

    # ===================================================================
    # 6. AUTO RETRAIN
    # ===================================================================
    def perform_retrain(self) -> bool:
        """Refit a CatBoost model on the latest window and make it live.

        The last ``window_size`` buffered rows are split chronologically
        90/10 into train/eval. The parameters come from ``model_creator``.
        The fitted model is stored as ``model_v<n>.joblib`` in the working
        directory and an entry is appended to ``retrain_events``.

        Returns:
            True when a new model was deployed, False when fewer than 30
            rows are buffered.
        """
        if len(self.feature_buffer) < self.MIN_RETRAIN_SAMPLES:
            print("Not enough data!")
            return False

        # Capture the trigger BEFORE the idle counter is reset; afterwards
        # should_retrain() would only report the cooldown message.
        _, reason = self.should_retrain()

        X_full = self.feature_buffer.tail(self.window_size)
        y_full = self.target_buffer.tail(self.window_size)

        split_idx = int(len(X_full) * 0.9)
        X_train = X_full.iloc[:split_idx]
        y_train = y_full.iloc[:split_idx]
        X_eval = X_full.iloc[split_idx:]
        y_eval = y_full.iloc[split_idx:]

        print(f"RETRAINING with {len(X_train):,} train + {len(X_eval):,} eval samples...")

        # Use the injected factory instead of a hard-wired global function.
        params = self.model_creator()
        model = cb.CatBoostRegressor(**params)

        model.fit(
            X_train, y_train,
            eval_set=(X_eval, y_eval),
            use_best_model=True
        )

        self.live_model = model
        self.days_without_retrain = 0

        self.retrain_events.append({
            'time': datetime.now(),
            'samples': len(X_full),
            'reason': reason
        })

        v = len(self.retrain_events)
        joblib.dump(model, f"model_v{v}.joblib")
        print(f"MODEL v{v} DEPLOYED! Best iteration: {model.get_best_iteration()}")
        return True

    # ===================================================================
    # 7. SMART BASELINE
    # ===================================================================
    def update_smart_baseline(self, window: int = 30):
        """Replace the baseline by the median scores of the last ``window`` days.

        Non-finite scores are ignored. When a metric has no finite value in
        the window, the previous baseline value is kept so the baseline
        never turns into NaN. Nothing happens with less than ``window`` days
        of history.
        """
        if len(self.performance_history) < window:
            print(f"Not enough data ({len(self.performance_history)} < {window})")
            return

        recent = self.performance_history[-window:]
        new = {}

        for label in self.target_labels:
            previous = self.baseline_scores.get(label, {})
            new[label] = {}
            for metric in ('RMSE', 'R2'):
                values = np.asarray([log['scores'][label][metric] for log in recent], dtype=float)
                values = values[np.isfinite(values)]
                if values.size:
                    new[label][metric] = float(np.median(values))
                else:
                    new[label][metric] = float(previous.get(metric, float('nan')))

        self.baseline_scores = new
        print(f"SMART BASELINE UPDATED (median of last {window} days):")
        for l, s in new.items():
            print(f"   • {l}: RMSE={s['RMSE']:.4f}, R²={s['R2']:.4f}")

    # ===================================================================
    # 8. UTILITIES
    # ===================================================================
    def _limit_buffer_size(self):
        """Trim both buffers to the most recent ``max_archive`` rows."""
        if len(self.feature_buffer) > self.max_archive:
            self.feature_buffer = self.feature_buffer.tail(self.max_archive)
            self.target_buffer = self.target_buffer.tail(self.max_archive)

    def _raise_degradation_alerts(self, scores: Dict, date: datetime):
        """Print up to three labels whose RMSE today exceeds the rise limit."""
        alerts = []
        for label, m in scores.items():
            rise = self._relative_rise(m['RMSE'], self.baseline_scores[label]['RMSE'])
            if rise > self.rmse_rise_limit:
                alerts.append(f"{label}: RMSE up {rise:.1%}")
        if alerts:
            print(f"PERFORMANCE ALERT [{date.date()}]")
            for a in alerts[:3]:
                print(f"   • {a}")

    def print_daily_rmse(self):
        """Print the per-horizon RMSE of the last 10 logged days and the baseline."""
        if not self.performance_history:
            print("No data yet!")
            return

        print("\n" + "="*90)
        print(f"{' '*30} DAILY RMSE REPORT")
        print("="*90)
        # Header follows the configured horizons instead of a fixed T+1..T+5.
        header_cols = " ".join(f"{label:>8}" for label in self.target_labels)
        print(f"{'Date':<12} " + header_cols + f" {'MEAN':>8}")
        print("-" * 90)

        for log in self.performance_history[-10:]:
            d = log['date'].strftime("%Y-%m-%d")
            rmses = [log['scores'][l]['RMSE'] for l in self.target_labels]
            mean = np.mean(rmses)
            print(f"{d:<12} " + " ".join(f"{r:8.4f}" for r in rmses) + f" {mean:8.4f}")

        print("-" * 90)
        base_rmses = [self.baseline_scores[l]['RMSE'] for l in self.target_labels]
        base_mean = np.mean(base_rmses)
        print(f"{'BASELINE':<12} " + " ".join(f"{r:8.4f}" for r in base_rmses) + f" {base_mean:8.4f}")
        print("="*90 + "\n")

### 7.4 Testing

In [4]:
# ===================================================================
# 1. LOAD MODEL & PREPROCESSOR
# ===================================================================
print("LOADING BEST MODEL & PREPROCESSOR...\n")
result = joblib.load('../models/daily/BEST_CATBOOST_TUNED_DAILY.joblib')
best_model = result['model']
preprocessor = result['preprocessor']
top_features = result['feature_names']

print(f"Model loaded: {type(best_model).__name__}")
print(f"Features used: {len(top_features)} → {top_features[:5]}...\n")

# ===================================================================
# 2. LOAD & PREPROCESS DATA 
# ===================================================================
print("LOADING & PREPROCESSING DATA...\n")
df = pd.read_csv('../data/processed/feature_engineering_daily_data2.csv')
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)

print(f"Raw data: {df.shape}")
print(f"Index đầu tiên: {df.index[0]}")

# Split into features X and targets y.
X = df.drop(columns=['target5+', 'target4+', 'target3+', 'target2+', 'target1+'])
y = df[['target5+', 'target4+', 'target3+', 'target2+', 'target1+']]

# Normalise the target column names.
# NOTE(review): the columns are ordered T+5 … T+1 here, while AutoRetraining
# scores column i under its i-th label (T+1 … T+5 by default), so per-horizon
# scores may be reported under the wrong label. The order the model was
# trained with is not visible here — confirm before reordering.
y.columns = ['T+5', 'T+4', 'T+3', 'T+2', 'T+1']
print(f"X size: {X.shape} | y size: {y.shape}")

# Apply the fitted preprocessor, then keep only the selected top features.
X_selected = pd.DataFrame(
    preprocessor.transform(X),
    columns=preprocessor.get_feature_names_out(),
    index=X.index
)[top_features]

print(f"X_selected size: {X_selected.shape}")
print(f"Index đầu tiên: {X_selected.index[0]}\n")

# ===================================================================
# 3. TRAIN/TEST SPLIT
# ===================================================================
# Chronological split: the last 20 % of rows form the test set.
test_size = 0.2
split_idx = int(len(X) * (1 - test_size))
X_train = X_selected.iloc[:split_idx].copy()
X_test = X_selected.iloc[split_idx:].copy()
y_train = y.iloc[:split_idx].copy()
y_test = y.iloc[split_idx:].copy()

print(f"Train: {X_train.shape[0]:,} ngày | Test: {X_test.shape[0]:,} ngày")
print(f"Train period: {X_train.index[0]} → {X_train.index[-1]}")
print(f"Test period : {X_test.index[0]} → {X_test.index[-1]}\n")

# ===================================================================
# 4. INITIALISE THE SYSTEM
# ===================================================================
system = AutoRetraining(
    model_creator=create_catboost_model,
    initial_baseline={
        'T+1': {'RMSE': 1.4887, 'R2': 0.914455},
        'T+2': {'RMSE': 2.0034, 'R2': 0.844838},
        'T+3': {'RMSE': 2.2155, 'R2': 0.809956},
        'T+4': {'RMSE': 2.3363, 'R2': 0.788630},
        'T+5': {'RMSE': 2.4058, 'R2': 0.775827}
    },
    window_size=90,
    max_idle_days=60
)

system.deploy_model(best_model, X_train, y_train)

# ===================================================================
# 5. RUN DAY BY DAY OVER THE START OF THE TEST SET (UP TO 30 DAYS)
# ===================================================================
# Take the dates from the test index itself so every row is labelled with
# its real timestamp (even if the index has gaps) and the loop can never run
# past the end of a short test set.
n_days = min(30, len(X_test))
dates = list(X_test.index[:n_days])

print(f"\nBẮT ĐẦU CHẠY {n_days} NGÀY TRONG TEST SET")
print(f"Từ: {dates[0].strftime('%d/%m/%Y %H:%M')} → {dates[-1].strftime('%d/%m/%Y')}\n" + "="*95 + "\n")

for i, pred_date in enumerate(dates):
    print(f"NGÀY {i+1:03d} | {pred_date.strftime('%d/%m/%Y')} | {pred_date.strftime('%A')[:3].upper()}")

    # Exactly one row (kept as a DataFrame) from X_test / y_test.
    X_today = X_test.iloc[[i]]
    y_today = y_test.iloc[[i]]

    # Forecast + monitoring.
    record = system.forecast_daily(X_today, y_today, pred_date)

    # Automatic retraining.
    need_retrain, reason = system.should_retrain()
    if need_retrain:
        print(f"RETRAIN KÍCH HOẠT: {reason}")
        # Only announce a new model when the retrain actually happened.
        if system.perform_retrain():
            print(f"MODEL MỚI ĐÃ TRIỂN KHAI (v{len(system.retrain_events)})")
    else:
        print(f"Model ổn định: {reason}")

    print("-" * 95)

# ===================================================================
# 6. SUMMARY AFTER THE RUN
# ===================================================================
print("\nTỔNG KẾT HOẠT ĐỘNG MLOps PRODUCTION\n" + "="*95)
print(f"Số ngày dự báo: {len(system.daily_predictions)}")
print(f"Số lần retrain: {len(system.retrain_events)}")

if system.retrain_events:
    print("\nLỊCH SỬ RETRAIN:")
    for idx, e in enumerate(system.retrain_events, 1):
        print(f" • v{idx} | {e['time'].strftime('%d/%m/%Y %H:%M')} | {e['reason']}")

print(f"\nBASELINE CUỐI CÙNG (sau cập nhật tự động):")
for label, scores in system.baseline_scores.items():
    print(f"   • {label}: RMSE={scores['RMSE']:.4f}, R²={scores['R2']:.4f}")

print(f"\nHOÀN TẤT!")
print("="*95)

LOADING BEST MODEL & PREPROCESSOR...

Model loaded: CatBoostRegressor
Features used: 95 → ['day_length_hours_lag_21', 'day_length_hours_lag_30', 'temp_sealevelpressure_interaction', 'feelslike', 'temp']...

LOADING & PREPROCESSING DATA...

Raw data: (3619, 947)
Index đầu tiên: 2015-10-31 00:00:00
X size: (3619, 942) | y size: (3619, 5)
X_selected size: (3619, 95)
Index đầu tiên: 2015-10-31 00:00:00

Train: 2,895 ngày | Test: 724 ngày
Train period: 2015-10-31 00:00:00 → 2023-10-03 00:00:00
Test period : 2023-10-04 00:00:00 → 2025-09-26 00:00:00

AutoRetraining FULL SYSTEM INITIALIZED
MODEL DEPLOYED!
   • Training samples: 2,895
   • Horizons: T+1, T+2, T+3, T+4, T+5

BẮT ĐẦU CHẠY 30 NGÀY TRONG TEST SET
Từ: 04/10/2023 00:00 → 02/11/2023

NGÀY 001 | 04/10/2023 | WED
PERFORMANCE ALERT [2023-10-04]
   • T+4: RMSE up 32.9%
   • T+5: RMSE up 46.2%

                               DAILY RMSE REPORT
Date              T+1      T+2      T+3      T+4      T+5     MEAN
------------------------------