In [1]:
#!/usr/bin/env python3
"""
Script 3: Training Pipeline v·ªõi XGBoost v√† c√°c m√¥ h√¨nh h·ªìi quy kh√°c
Train v√† so s√°nh nhi·ªÅu models: XGBoost, Random Forest, LightGBM, Linear Regression
"""

import pandas as pd
import numpy as np
import warnings
from pathlib import Path
import os
os.chdir('..')
import json
import pickle
from datetime import datetime

# Machine Learning
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import xgboost as xgb
import lightgbm as lgb

warnings.filterwarnings('ignore')

print("=" * 80)
print("TRAINING PIPELINE - XGBOOST V√Ä C√ÅC M√î H√åN H·ªíI QUY")
print("=" * 80)

# ============================================================================
# 1. LOAD D·ªÆ LI·ªÜU ƒê√É X·ª¨ L√ù
# ============================================================================

print("\n" + "=" * 80)
print("B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU")
print("=" * 80)

print("\nüìÇ ƒêang load d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω...")
df = pd.read_parquet("./output/processed_data.parquet")

print(f"‚úÖ Dataset shape: {df.shape}")
print(f"   - S·ªë records: {len(df)}")
print(f"   - S·ªë buildings: {df['building_id'].nunique()}")

# Load features info
with open('output/features_info.json', 'r') as f:
    features_info = json.load(f)

print(f"\nüìä Features:")
print(f"   - Continuous: {len(features_info['continuous_features'])}")
print(f"   - Categorical: {len(features_info['categorical_features'])}")
print(f"   - Time features: {len(features_info['time_features'])}")
print(f"   - Lag features: {len(features_info['lag_features'])}")

TRAINING PIPELINE - XGBOOST V√Ä C√ÅC M√î H√åN H·ªíI QUY

B∆Ø·ªöC 1: LOAD D·ªÆ LI·ªÜU

üìÇ ƒêang load d·ªØ li·ªáu ƒë√£ x·ª≠ l√Ω...
‚úÖ Dataset shape: (25187366, 61)
   - S·ªë records: 25187366
   - S·ªë buildings: 1572

üìä Features:
   - Continuous: 10
   - Categorical: 5
   - Time features: 12
   - Lag features: 6


In [2]:
sample_size = min(2000, df['building_id'].nunique())


In [3]:
sample_buildings = np.random.choice(
    df['building_id'].unique(), 
    size=sample_size, 
    replace=False
)
sample_buildings

array(['Bull_education_Barry', 'Bull_assembly_Brandon',
       'Cockatoo_education_Mayra', ..., 'Gator_public_Jolene',
       'Robin_office_Gayle', 'Rat_education_Nellie'],
      shape=(1572,), dtype=object)

In [4]:
df_train = df[df['building_id'].isin(sample_buildings)].copy()
df_train.head(5)

Unnamed: 0,timestamp,building_id,electricity_consumption,site_id,building_id_kaggle,site_id_kaggle,primaryspaceusage,sub_primaryspaceusage,sqm,sqft,...,day_of_week_sin,day_of_week_cos,month_sin,month_cos,electricity_lag1,electricity_lag24,electricity_lag168,electricity_rolling_mean_24h,electricity_rolling_std_24h,electricity_rolling_mean_7d
0,2016-01-01 00:00:00,Bear_assembly_Angel,237.85,Bear,602.0,4.0,Entertainment/public assembly,Entertainment/public assembly,22117.0,238065.0,...,-0.433884,-0.900969,0.5,0.866025,0.0,0.0,0.0,237.85,237.85,237.85
1,2016-01-01 01:00:00,Bear_assembly_Angel,221.25,Bear,602.0,4.0,Entertainment/public assembly,Entertainment/public assembly,22117.0,238065.0,...,-0.433884,-0.900969,0.5,0.866025,237.85,0.0,0.0,229.55,11.737973,229.55
2,2016-01-01 02:00:00,Bear_assembly_Angel,222.5,Bear,602.0,4.0,Entertainment/public assembly,Entertainment/public assembly,22117.0,238065.0,...,-0.433884,-0.900969,0.5,0.866025,221.25,0.0,0.0,227.2,9.244323,227.2
3,2016-01-01 03:00:00,Bear_assembly_Angel,221.4,Bear,602.0,4.0,Entertainment/public assembly,Entertainment/public assembly,22117.0,238065.0,...,-0.433884,-0.900969,0.5,0.866025,222.5,0.0,0.0,225.75,8.085893,225.75
4,2016-01-01 04:00:00,Bear_assembly_Angel,224.25,Bear,602.0,4.0,Entertainment/public assembly,Entertainment/public assembly,22117.0,238065.0,...,-0.433884,-0.900969,0.5,0.866025,221.4,0.0,0.0,225.45,7.034646,225.45


In [5]:
df_train = df_train.sort_values(['building_id', 'timestamp']).reset_index(drop=True)

In [6]:
all_features = (
    features_info['continuous_features'] + 
    features_info['time_features'] + 
    features_info['lag_features']
)

# Lo·∫°i b·ªè c√°c features kh√¥ng c√≥ trong dataset
all_features = [f for f in all_features if f in df_train.columns]
categorical_features = [f for f in features_info['categorical_features'] if f in df_train.columns]

print(f"\nüìä Features ƒë∆∞·ª£c s·ª≠ d·ª•ng:")
print(f"   - Continuous/Time/Lag: {len(all_features)}")
print(f"   - Categorical: {len(categorical_features)}")


üìä Features ƒë∆∞·ª£c s·ª≠ d·ª•ng:
   - Continuous/Time/Lag: 28
   - Categorical: 5


In [7]:
categorical_features

['primaryspaceusage', 'sub_primaryspaceusage', 'site_id', 'timezone', 'season']

In [8]:
X = df_train[all_features + categorical_features].copy()
X.head(5)

Unnamed: 0,sqm,yearbuilt,numberoffloors,occupants,airTemperature,cloudCoverage,dewTemperature,windSpeed,seaLvlPressure,precipDepth1HR,...,electricity_lag24,electricity_lag168,electricity_rolling_mean_24h,electricity_rolling_std_24h,electricity_rolling_mean_7d,primaryspaceusage,sub_primaryspaceusage,site_id,timezone,season
0,22117.0,1933.0,6.0,0.0,4.4,0.0,-2.2,0.0,1020.9,0.0,...,0.0,0.0,237.85,237.85,237.85,Entertainment/public assembly,Entertainment/public assembly,Bear,US/Pacific,Winter
1,22117.0,1933.0,6.0,0.0,4.4,0.0,-4.4,2.1,1020.5,0.0,...,0.0,0.0,229.55,11.737973,229.55,Entertainment/public assembly,Entertainment/public assembly,Bear,US/Pacific,Winter
2,22117.0,1933.0,6.0,0.0,4.4,0.0,-6.7,2.1,1020.8,0.0,...,0.0,0.0,227.2,9.244323,227.2,Entertainment/public assembly,Entertainment/public assembly,Bear,US/Pacific,Winter
3,22117.0,1933.0,6.0,0.0,4.4,0.0,-7.8,2.6,1020.7,0.0,...,0.0,0.0,225.75,8.085893,225.75,Entertainment/public assembly,Entertainment/public assembly,Bear,US/Pacific,Winter
4,22117.0,1933.0,6.0,0.0,5.0,0.0,-9.4,0.0,1020.6,0.0,...,0.0,0.0,225.45,7.034646,225.45,Entertainment/public assembly,Entertainment/public assembly,Bear,US/Pacific,Winter


In [9]:
y = df_train[features_info['target']].copy()
y.head(5)

0    237.85
1    221.25
2    222.50
3    221.40
4    224.25
Name: electricity_consumption, dtype: float64

In [10]:
# Ki·ªÉm tra duplicate columns
print("üìä Ki·ªÉm tra duplicate columns...")
duplicate_cols = X.columns[X.columns.duplicated()].tolist()
if duplicate_cols:
    print(f"‚ö†Ô∏è  C√≥ duplicate columns: {duplicate_cols}")
    print(f"   T·ªïng s·ªë columns: {len(X.columns)}")
    print(f"   Unique columns: {len(X.columns.unique())}")

# Ki·ªÉm tra t·ª´ng categorical feature
print(f"\nüìä Ki·ªÉm tra categorical features:")
for col in categorical_features:
    if col not in X.columns:
        print(f"   ‚ùå '{col}' kh√¥ng t·ªìn t·∫°i trong X")
    else:
        col_data = X[col]
        if isinstance(col_data, pd.DataFrame):
            print(f"   ‚ö†Ô∏è  '{col}' l√† DataFrame v·ªõi shape {col_data.shape}")
        elif isinstance(col_data, pd.Series):
            print(f"   ‚úÖ '{col}' l√† Series v·ªõi shape {col_data.shape}")
        else:
            print(f"   ‚ö†Ô∏è  '{col}' c√≥ type: {type(col_data)}")

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    if col not in X.columns:
        print(f"‚ö†Ô∏è  Warning: Column '{col}' not found in X, skipping...")
        continue
    
    # ƒê·∫£m b·∫£o l·∫•y Series 1D, kh√¥ng ph·∫£i DataFrame
    col_data = X[col]
    if isinstance(col_data, pd.DataFrame):
        # N·∫øu l√† DataFrame (c√≥ duplicate column names), l·∫•y c·ªôt ƒë·∫ßu ti√™n
        col_data = col_data.iloc[:, 0]
        print(f"‚ö†Ô∏è  Warning: Column '{col}' is a DataFrame, using first column")
    
    # Convert to Series n·∫øu ch∆∞a ph·∫£i
    if not isinstance(col_data, pd.Series):
        col_data = pd.Series(col_data)
    
    le = LabelEncoder()
    X[col] = le.fit_transform(col_data.astype(str))
    label_encoders[col] = le

print(f"\n‚úÖ ƒê√£ encode {len(label_encoders)} categorical features")

# Lo·∫°i b·ªè duplicate columns (n·∫øu c√≥)
print("\nüìä Ki·ªÉm tra v√† lo·∫°i b·ªè duplicate columns...")
if X.columns.duplicated().any():
    duplicate_cols = X.columns[X.columns.duplicated()].tolist()
    print(f"‚ö†Ô∏è  Ph√°t hi·ªán duplicate columns: {duplicate_cols}")
    # Gi·ªØ l·∫°i c·ªôt ƒë·∫ßu ti√™n, lo·∫°i b·ªè c√°c c·ªôt duplicate
    X = X.loc[:, ~X.columns.duplicated()]
    print(f"‚úÖ ƒê√£ lo·∫°i b·ªè duplicate columns. Shape m·ªõi: {X.shape}")

# ƒê·∫£m b·∫£o t·∫•t c·∫£ c√°c c·ªôt ƒë·ªÅu l√† Series 1D
print("\nüìä ƒê·∫£m b·∫£o t·∫•t c·∫£ c·ªôt ƒë·ªÅu l√† Series 1D...")
for col in X.columns:
    col_data = X[col]
    if isinstance(col_data, pd.DataFrame):
        # N·∫øu l√† DataFrame, l·∫•y c·ªôt ƒë·∫ßu ti√™n
        X[col] = col_data.iloc[:, 0]
        print(f"‚ö†Ô∏è  ƒê√£ s·ª≠a c·ªôt '{col}' t·ª´ DataFrame th√†nh Series")
    elif not isinstance(col_data, pd.Series):
        # N·∫øu kh√¥ng ph·∫£i Series, convert
        X[col] = pd.Series(col_data, index=X.index)
        print(f"‚ö†Ô∏è  ƒê√£ convert c·ªôt '{col}' th√†nh Series")

print(f"‚úÖ X shape cu·ªëi c√πng: {X.shape}")
print(f"‚úÖ T·∫•t c·∫£ c·ªôt ƒë·ªÅu l√† Series 1D")

üìä Ki·ªÉm tra duplicate columns...
‚ö†Ô∏è  C√≥ duplicate columns: ['season']
   T·ªïng s·ªë columns: 33
   Unique columns: 32

üìä Ki·ªÉm tra categorical features:
   ‚úÖ 'primaryspaceusage' l√† Series v·ªõi shape (25187366,)
   ‚úÖ 'sub_primaryspaceusage' l√† Series v·ªõi shape (25187366,)
   ‚úÖ 'site_id' l√† Series v·ªõi shape (25187366,)
   ‚úÖ 'timezone' l√† Series v·ªõi shape (25187366,)
   ‚ö†Ô∏è  'season' l√† DataFrame v·ªõi shape (25187366, 2)

‚úÖ ƒê√£ encode 5 categorical features

üìä Ki·ªÉm tra v√† lo·∫°i b·ªè duplicate columns...
‚ö†Ô∏è  Ph√°t hi·ªán duplicate columns: ['season']
‚úÖ ƒê√£ lo·∫°i b·ªè duplicate columns. Shape m·ªõi: (25187366, 32)

üìä ƒê·∫£m b·∫£o t·∫•t c·∫£ c·ªôt ƒë·ªÅu l√† Series 1D...
‚úÖ X shape cu·ªëi c√πng: (25187366, 32)
‚úÖ T·∫•t c·∫£ c·ªôt ƒë·ªÅu l√† Series 1D


In [11]:

print("\n" + "=" * 80)
print("B∆Ø·ªöC 3: CHIA TRAIN/TEST SET")
print("=" * 80)

# Chia theo th·ªùi gian (80% train, 20% test)
split_idx = int(len(df_train) * 0.8)

X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]

print(f"‚úÖ Train set: {X_train.shape[0]} samples")
print(f"‚úÖ Test set: {X_test.shape[0]} samples")
print(f"\n   Train period: {df_train.iloc[0]['timestamp']} ƒë·∫øn {df_train.iloc[split_idx-1]['timestamp']}")
print(f"   Test period: {df_train.iloc[split_idx]['timestamp']} ƒë·∫øn {df_train.iloc[-1]['timestamp']}")


B∆Ø·ªöC 3: CHIA TRAIN/TEST SET
‚úÖ Train set: 20149892 samples
‚úÖ Test set: 5037474 samples

   Train period: 2016-01-01 00:00:00 ƒë·∫øn 2016-09-05 15:00:00
   Test period: 2016-09-05 16:00:00 ƒë·∫øn 2017-12-31 23:00:00


In [12]:
print("\n" + "=" * 80)
print("B∆Ø·ªöC 4: TRAIN C√ÅC M√î H√åN")
print("=" * 80)

models = {}
results = {}

# ============================================================================
# 4.1. XGBoost
# ============================================================================

print("\n" + "-" * 80)
print("4.1. Training XGBoost...")
print("-" * 80)

xgb_model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    eval_metric='rmse'
)

print("ƒêang training...")
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=50
)

# Predictions
y_pred_train_xgb = xgb_model.predict(X_train)
y_pred_test_xgb = xgb_model.predict(X_test)

# Metrics
train_rmse_xgb = np.sqrt(mean_squared_error(y_train, y_pred_train_xgb))
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
train_mae_xgb = mean_absolute_error(y_train, y_pred_train_xgb)
test_mae_xgb = mean_absolute_error(y_test, y_pred_test_xgb)
train_r2_xgb = r2_score(y_train, y_pred_train_xgb)
test_r2_xgb = r2_score(y_test, y_pred_test_xgb)
test_mape_xgb = mean_absolute_percentage_error(y_test, y_pred_test_xgb)

models['XGBoost'] = xgb_model
results['XGBoost'] = {
    'train_rmse': train_rmse_xgb,
    'test_rmse': test_rmse_xgb,
    'train_mae': train_mae_xgb,
    'test_mae': test_mae_xgb,
    'train_r2': train_r2_xgb,
    'test_r2': test_r2_xgb,
    'test_mape': test_mape_xgb
}

print(f"‚úÖ XGBoost - Test RMSE: {test_rmse_xgb:.2f}, Test R¬≤: {test_r2_xgb:.4f}")


B∆Ø·ªöC 4: TRAIN C√ÅC M√î H√åN

--------------------------------------------------------------------------------
4.1. Training XGBoost...
--------------------------------------------------------------------------------
ƒêang training...
[0]	validation_0-rmse:223.28391	validation_1-rmse:230.24568
[50]	validation_0-rmse:27.04630	validation_1-rmse:35.60330
[100]	validation_0-rmse:19.56809	validation_1-rmse:29.64339
[150]	validation_0-rmse:18.80954	validation_1-rmse:29.48399
[199]	validation_0-rmse:18.31831	validation_1-rmse:30.29663
‚úÖ XGBoost - Test RMSE: 30.30, Test R¬≤: 0.9843


In [22]:
# ============================================================================
# 4.2. LightGBM
# ============================================================================

print("\n" + "-" * 80)
print("4.2. Training LightGBM...")
print("-" * 80)

lgb_model = lgb.LGBMRegressor(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

print("ƒêang training...")
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_names=['train', 'test'],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(stopping_rounds=20), lgb.log_evaluation(50)]
)

# Predictions
y_pred_train_lgb = lgb_model.predict(X_train)
y_pred_test_lgb = lgb_model.predict(X_test)

# Metrics
train_rmse_lgb = np.sqrt(mean_squared_error(y_train, y_pred_train_lgb))
test_rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_test_lgb))
train_mae_lgb = mean_absolute_error(y_train, y_pred_train_lgb)
test_mae_lgb = mean_absolute_error(y_test, y_pred_test_lgb)
train_r2_lgb = r2_score(y_train, y_pred_train_lgb)
test_r2_lgb = r2_score(y_test, y_pred_test_lgb)
test_mape_lgb = mean_absolute_percentage_error(y_test, y_pred_test_lgb)

models['LightGBM'] = lgb_model
results['LightGBM'] = {
    'train_rmse': train_rmse_lgb,
    'test_rmse': test_rmse_lgb,
    'train_mae': train_mae_lgb,
    'test_mae': test_mae_lgb,
    'train_r2': train_r2_lgb,
    'test_r2': test_r2_lgb,
    'test_mape': test_mape_lgb
}

print(f"‚úÖ LightGBM - Test RMSE: {test_rmse_lgb:.2f}, Test R¬≤: {test_r2_lgb:.4f}")


--------------------------------------------------------------------------------
4.2. Training LightGBM...
--------------------------------------------------------------------------------
ƒêang training...
Training until validation scores don't improve for 20 rounds
[50]	train's rmse: 8.25511	train's l2: 68.1469	test's rmse: 5.31647	test's l2: 28.2648
[100]	train's rmse: 6.62558	train's l2: 43.8983	test's rmse: 3.56382	test's l2: 12.7008
[150]	train's rmse: 6.3609	train's l2: 40.4611	test's rmse: 3.46857	test's l2: 12.031
Early stopping, best iteration is:
[178]	train's rmse: 6.27884	train's l2: 39.4239	test's rmse: 3.44945	test's l2: 11.8987
‚úÖ LightGBM - Test RMSE: 3.45, Test R¬≤: 0.9683


In [23]:
# ============================================================================
# 4.3. Random Forest
# ============================================================================

print("\n" + "-" * 80)
print("4.3. Training Random Forest...")
print("-" * 80)

rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print("ƒêang training...")
rf_model.fit(X_train, y_train)

# Predictions
y_pred_train_rf = rf_model.predict(X_train)
y_pred_test_rf = rf_model.predict(X_test)

# Metrics
train_rmse_rf = np.sqrt(mean_squared_error(y_train, y_pred_train_rf))
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_test_rf))
train_mae_rf = mean_absolute_error(y_train, y_pred_train_rf)
test_mae_rf = mean_absolute_error(y_test, y_pred_test_rf)
train_r2_rf = r2_score(y_train, y_pred_train_rf)
test_r2_rf = r2_score(y_test, y_pred_test_rf)
test_mape_rf = mean_absolute_percentage_error(y_test, y_pred_test_rf)

models['RandomForest'] = rf_model
results['RandomForest'] = {
    'train_rmse': train_rmse_rf,
    'test_rmse': test_rmse_rf,
    'train_mae': train_mae_rf,
    'test_mae': test_mae_rf,
    'train_r2': train_r2_rf,
    'test_r2': test_r2_rf,
    'test_mape': test_mape_rf
}

print(f"‚úÖ Random Forest - Test RMSE: {test_rmse_rf:.2f}, Test R¬≤: {test_r2_rf:.4f}")


--------------------------------------------------------------------------------
4.3. Training Random Forest...
--------------------------------------------------------------------------------
ƒêang training...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    2.8s


‚úÖ Random Forest - Test RMSE: 3.35, Test R¬≤: 0.9702


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    8.8s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.0s finished


In [13]:
# ============================================================================
# 4.4. Linear Regression (Baseline)
# ============================================================================

print("\n" + "-" * 80)
print("4.4. Training Linear Regression (Baseline)...")
print("-" * 80)

# Chu·∫©n h√≥a d·ªØ li·ªáu cho Linear Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr_model = LinearRegression()
print("ƒêang training...")
lr_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_train_lr = lr_model.predict(X_train_scaled)
y_pred_test_lr = lr_model.predict(X_test_scaled)

# Metrics
train_rmse_lr = np.sqrt(mean_squared_error(y_train, y_pred_train_lr))
test_rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_test_lr))
train_mae_lr = mean_absolute_error(y_train, y_pred_train_lr)
test_mae_lr = mean_absolute_error(y_test, y_pred_test_lr)
train_r2_lr = r2_score(y_train, y_pred_train_lr)
test_r2_lr = r2_score(y_test, y_pred_test_lr)
test_mape_lr = mean_absolute_percentage_error(y_test, y_pred_test_lr)

models['LinearRegression'] = lr_model
models['Scaler'] = scaler  # L∆∞u scaler ƒë·ªÉ d√πng sau
results['LinearRegression'] = {
    'train_rmse': train_rmse_lr,
    'test_rmse': test_rmse_lr,
    'train_mae': train_mae_lr,
    'test_mae': test_mae_lr,
    'train_r2': train_r2_lr,
    'test_r2': test_r2_lr,
    'test_mape': test_mape_lr
}

print(f"‚úÖ Linear Regression - Test RMSE: {test_rmse_lr:.2f}, Test R¬≤: {test_r2_lr:.4f}")


--------------------------------------------------------------------------------
4.4. Training Linear Regression (Baseline)...
--------------------------------------------------------------------------------
ƒêang training...
‚úÖ Linear Regression - Test RMSE: 35.38, Test R¬≤: 0.9786


In [14]:
# ============================================================================
# 5. SO S√ÅNH K·∫æT QU·∫¢
# ============================================================================

print("\n" + "=" * 80)
print("B∆Ø·ªöC 5: SO S√ÅNH K·∫æT QU·∫¢")
print("=" * 80)

results_df = pd.DataFrame(results).T
results_df = results_df.round(4)

print("\nüìä K·∫øt qu·∫£ c√°c m√¥ h√¨nh:")
print("=" * 80)
print(results_df.to_string())

# T√¨m model t·ªët nh·∫•t
best_model_name = results_df['test_rmse'].idxmin()
print(f"\nüèÜ Model t·ªët nh·∫•t (RMSE th·∫•p nh·∫•t): {best_model_name}")
print(f"   - Test RMSE: {results_df.loc[best_model_name, 'test_rmse']:.2f}")
print(f"   - Test R¬≤: {results_df.loc[best_model_name, 'test_r2']:.4f}")
print(f"   - Test MAE: {results_df.loc[best_model_name, 'test_mae']:.2f}")


B∆Ø·ªöC 5: SO S√ÅNH K·∫æT QU·∫¢

üìä K·∫øt qu·∫£ c√°c m√¥ h√¨nh:
                  train_rmse  test_rmse  train_mae  test_mae  train_r2  test_r2  test_mape
XGBoost              18.3183    30.2966     6.2711    7.6082    0.9939   0.9843     0.1688
LinearRegression     38.7552    35.3788    14.7708   15.1824    0.9728   0.9786     1.9500

üèÜ Model t·ªët nh·∫•t (RMSE th·∫•p nh·∫•t): XGBoost
   - Test RMSE: 30.30
   - Test R¬≤: 0.9843
   - Test MAE: 7.61


In [15]:
# ============================================================================
# 6. L∆ØU MODELS V√Ä K·∫æT QU·∫¢
# ============================================================================

print("\n" + "=" * 80)
print("B∆Ø·ªöC 6: L∆ØU MODELS V√Ä K·∫æT QU·∫¢")
print("=" * 80)

os.makedirs('output/models', exist_ok=True)

# L∆∞u t·ª´ng model
for model_name, model in models.items():
    if model_name != 'Scaler':  # Scaler s·∫Ω l∆∞u ri√™ng
        model_path = f"output/models/{model_name.lower().replace(' ', '_')}.pkl"
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
        print(f"‚úÖ ƒê√£ l∆∞u {model_name} v√†o: {model_path}")

# L∆∞u scaler
scaler_path = "output/models/scaler.pkl"
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"‚úÖ ƒê√£ l∆∞u Scaler v√†o: {scaler_path}")

# L∆∞u label encoders
encoders_path = "output/models/label_encoders.pkl"
with open(encoders_path, 'wb') as f:
    pickle.dump(label_encoders, f)
print(f"‚úÖ ƒê√£ l∆∞u Label Encoders v√†o: {encoders_path}")

# L∆∞u k·∫øt qu·∫£
results_df.to_csv('output/models/results_comparison.csv')
print(f"‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ so s√°nh v√†o: output/models/results_comparison.csv")

# L∆∞u th√¥ng tin v·ªÅ features v√† best model
model_info = {
    'best_model': best_model_name,
    'features_used': all_features + categorical_features,
    'categorical_features': categorical_features,
    'training_date': datetime.now().isoformat(),
    'train_size': len(X_train),
    'test_size': len(X_test),
    'results': results
}

with open('output/models/model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2, default=str)

print(f"‚úÖ ƒê√£ l∆∞u th√¥ng tin model v√†o: output/models/model_info.json")

print("\n" + "=" * 80)
print("HO√ÄN TH√ÄNH TRAINING!")
print("=" * 80)
print(f"‚úÖ ƒê√£ train {len(models)} m√¥ h√¨nh")
print(f"üèÜ Model t·ªët nh·∫•t: {best_model_name}")
print(f"üìÅ Models ƒë√£ ƒë∆∞·ª£c l∆∞u trong: output/models/")



B∆Ø·ªöC 6: L∆ØU MODELS V√Ä K·∫æT QU·∫¢
‚úÖ ƒê√£ l∆∞u XGBoost v√†o: output/models/xgboost.pkl
‚úÖ ƒê√£ l∆∞u LinearRegression v√†o: output/models/linearregression.pkl
‚úÖ ƒê√£ l∆∞u Scaler v√†o: output/models/scaler.pkl
‚úÖ ƒê√£ l∆∞u Label Encoders v√†o: output/models/label_encoders.pkl
‚úÖ ƒê√£ l∆∞u k·∫øt qu·∫£ so s√°nh v√†o: output/models/results_comparison.csv
‚úÖ ƒê√£ l∆∞u th√¥ng tin model v√†o: output/models/model_info.json

HO√ÄN TH√ÄNH TRAINING!
‚úÖ ƒê√£ train 3 m√¥ h√¨nh
üèÜ Model t·ªët nh·∫•t: XGBoost
üìÅ Models ƒë√£ ƒë∆∞·ª£c l∆∞u trong: output/models/
