# XGBoost Model Notebook with SQL integration

This notebook tests the multi-station model training functionality.

In [6]:
import sys
import os
import importlib

sys.path.append(os.path.abspath('../src/'))
sys.path.append(os.path.join(os.getcwd(), '..'))

# import utils.model_utils as model_utils
import pandas as pd
import numpy as np

# importlib.reload(model_utils)
from data.make_dataset import get_engine

engine, _ = get_engine()
print('Tables:\n')
print(pd.read_sql("SHOW TABLES", engine))
print('Stations:\n')
print(pd.read_sql("SELECT DISTINCT station FROM gw_table LIMIT 10", engine))

print(pd.read_sql("SELECT * FROM pred_table LIMIT 100", engine))


('✅ Connected to MySQL, test query result:', 1)
Tables:

  Tables_in_mlops_database
0                 gw_table
1             precip_table
2               pred_table
3            stations_meta
Stations:

   station
0        1
1      100
2    10035
3    10038
4      101
5      102
6      103
7    10414
8    10417
9    10420
Empty DataFrame
Columns: [date, station, day_1, day_2, day_3, day_4, day_5, day_6, day_7]
Index: []


## Test Multi-Station Model Training

Let's test the updated functions that can handle multiple stations simultaneously.

In [9]:
# Define test parameters
station_list = ['100']
#, ['100', '106', '115']  # Replace with actual station IDs
start_date = '2022-01-01'
end_date = '2025-04-30'
test_size = 0.2

print(f"Testing multi-station training with stations: {station_list}")
print(f"Date range: {start_date} to {end_date}")
print(f"Test size: {test_size}")

Testing multi-station training with stations: ['100']
Date range: 2022-01-01 to 2025-04-30
Test size: 0.2


## Step 1: Load and Examine Data

First, let's load the data for multiple stations and see what we get.

In [3]:
# Load training data for multiple stations
from models.utils import load_training_data

print("Loading training data for multiple stations...")
gw_df, precip_df = load_training_data(station_list, start_date, end_date)

print(f"\nGroundwater data shape: {gw_df.shape}")
print(f"Precipitation data shape: {precip_df.shape}")

print(f"\nStations in groundwater data: {gw_df['station'].unique()}")
print(f"Stations in precipitation data: {precip_df['station'].unique()}")

print(f"\nGroundwater data sample:")
print(gw_df.head())

print(f"\nPrecipitation data sample:")
print(precip_df.head())

Loading training data for multiple stations...
('✅ Connected to MySQL, test query result:', 1)
('✅ Connected to MySQL, test query result:', 1)

Groundwater data shape: (1212, 2)
Precipitation data shape: (1212, 2)

Stations in groundwater data: [100]
Stations in precipitation data: [100]

Groundwater data sample:
            value  station
date                      
2022-01-01  30.64      100
2022-01-02  30.66      100
2022-01-03  30.67      100
2022-01-04  30.69      100
2022-01-05  30.67      100

Precipitation data sample:
            precipitation  station
date                              
2022-01-01            6.5      100
2022-01-02            1.6      100
2022-01-03            5.2      100
2022-01-04            1.9      100
2022-01-05            0.8      100


## Step 2: Build Features

Now let's build the features from the multi-station data.

In [4]:
# Build features from the loaded data
from features.build_features import build_features

print("Building features from multi-station data...")
X, y = build_features(gw_df, precip_df)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target matrix shape: {y.shape}")

print(f"\nFeature columns:")
print(X.columns.tolist())

print(f"\nFeature matrix sample:")
print(X.head(30))

print(f"\nTarget matrix sample (first 5 rows, all prediction days):")
print(y[:5])

Building features from multi-station data...

Feature matrix shape: (1201, 7)
Target matrix shape: (1201, 7)

Feature columns:
['gw_lag_4', 'gw_lag_3', 'gw_lag_2', 'gw_lag_1', 'prcp_sum_14', 'prcp_sum_30', 'month']

Feature matrix sample:
    gw_lag_4  gw_lag_3  gw_lag_2  gw_lag_1  prcp_sum_14  prcp_sum_30  month
0      30.64     30.66     30.67     30.69          NaN          NaN      1
1      30.66     30.67     30.69     30.67          NaN          NaN      1
2      30.67     30.69     30.67     30.61          NaN          NaN      1
3      30.69     30.67     30.61     30.64          NaN          NaN      1
4      30.67     30.61     30.64     30.64          NaN          NaN      1
5      30.61     30.64     30.64     30.68          NaN          NaN      1
6      30.64     30.64     30.68     30.60          NaN          NaN      1
7      30.64     30.68     30.60     30.57          NaN          NaN      1
8      30.68     30.60     30.57     30.59          NaN          NaN      1
9

## Step 3: Train Multi-Station Model

Train a single model on data from all stations.

In [11]:
import models.train_model

importlib.reload(models.train_model)
from models.train_model import train

model, metrics, y_pred, y_test, results = train(100,
                                                start_date,
                                                end_date,
                                                test_size=test_size)

('✅ Connected to MySQL, test query result:', 1)
('✅ Connected to MySQL, test query result:', 1)


TypeError: 'int' object is not iterable

## Step 4: Test Prediction on Individual Station

Test using the trained multi-station model to predict for a single station.

In [None]:
# Test prediction on a single station using the multi-station trained model
import models.utils

importlib.reload(models.utils)
import models.predict_model

importlib.reload(models.predict_model)

from models.predict_model import predict_station
from models.utils import load_model_from_registry

from models.utils import load_model_from_registry, load_model_from_local_path

# model = load_model_from_registry("my_model", "Production")

# model = load_model_from_local_path("./mlruns/0/models/m-57fec30aa1ad46aeaec956654f205a7f/artifacts/model")
# model = load_model_from_registry("my_model", "Production")
# predict_station(100, model, start_date, end_date)

test_station = station_list[0]  # Use first station for testing
pred_start_date = '2023-01-01'
pred_end_date = '2023-01-31'

print(f"Testing prediction for station {test_station}")
print(f"Prediction date range: {pred_start_date} to {pred_end_date}")

predictions = predict_station(test_station, model, pred_start_date,
                              pred_end_date)

# print(predictions)
# # for day, preds in predictions.items():
# #     print(
# #         f"{day}: {len(preds)} predictions, mean={np.mean(preds):.3f}, std={np.std(preds):.3f}"
# #     )
# #     print(f"  Sample predictions: {preds[:5]}")

Testing prediction for station 100
Prediction date range: 2023-01-01 to 2023-01-31
('✅ Connected to MySQL, test query result:', 1)

MULTI-OUTPUT PREDICTION RESULTS
   day_1: RMSE=0.032, MAE=0.026, R2=0.187
   day_2: RMSE=0.043, MAE=0.035, R2=-0.504
   day_3: RMSE=0.043, MAE=0.036, R2=-0.515
   day_4: RMSE=0.039, MAE=0.032, R2=-0.190
   day_5: RMSE=0.038, MAE=0.028, R2=-0.134
   day_6: RMSE=0.039, MAE=0.031, R2=-0.196
   day_7: RMSE=0.039, MAE=0.033, R2=-0.203


# Prediction from loaded model

In [None]:
from models.utils import load_model_from_registry
from models.predict_model import predict_station
import mlflow
import models.predict_model

importlib.reload(models.predict_model)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

station_id = 100
model = load_model_from_registry(f"model_station_{station_id}")
results = predict_station(100, model, start_date, end_date)


OSError: No such file or directory: '\mlflow\artifacts\1\models\m-db51aeb9478a4788a684ba9ffc4eba58\artifacts\.'