In [1]:
# Data Retrieval and Handling
import os
import pandas as pd
import psycopg

# Utility Functions and Miscellaneous
from scipy import stats
from functools import reduce

# Machine Learning
import xgboost as xgb

# Data Preprocessing
from sklearn.impute import KNNImputer

# Hyperparameter Optimization
import optuna

# Experiment Tracking and Model Management
import mlflow
import mlflow.pyfunc
from mlflow import MlflowClient

# Saving and Loading Models
import pickle

In [2]:
conn_params = {
    'host': 'localhost',
    'port': '5433',
    'dbname': 'aq_data',
    'user': 'postgres',
    'password': 'magedb_password',
}

with psycopg.connect(**conn_params) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT * FROM public.original_data")
        
        results = cur.fetchall()
        
        colnames = [desc.name for desc in cur.description]
        
        dataset = pd.DataFrame(results, columns=colnames)

dataset.head()

Unnamed: 0,datetime,sid_20466,sid_34845,sid_34841,sid_35394,sid_35577,sid_35843,sid_36047,sid_36066,sid_36064,sid_36092,sid_35606
0,2022-01-01 00:00:00+00:00,9.9,0.016,20.0,0.01,15.0,41.0,14.0,41.0,19.0,21.0,26.0
1,2022-01-01 01:00:00+00:00,14.0,0.014,20.0,0.011,18.0,35.0,12.0,36.0,30.0,36.0,55.0
2,2022-01-01 02:00:00+00:00,25.0,0.013,23.0,0.011,24.0,34.0,16.0,46.0,28.0,32.0,39.0
3,2022-01-01 03:00:00+00:00,27.0,0.011,27.0,0.011,39.0,39.0,20.0,48.0,32.0,54.0,33.0
4,2022-01-01 04:00:00+00:00,26.0,0.012,27.0,0.012,33.0,37.0,29.0,40.0,33.0,84.0,31.0


In [3]:
dataset = dataset[~dataset["sid_35606"].isna()]

In [4]:
dataset.drop(columns="datetime", inplace=True)
z_scores = dataset.apply(stats.zscore)
threshold = 3

masks = []
for col in dataset.columns:
    masks.append(abs(z_scores[col]) > threshold)

mask = reduce(lambda x, y: x | y, masks)

dataset = dataset[~mask]

In [5]:
X = dataset[
    [
        "sid_20466",
        "sid_34845",
        "sid_34841",
        "sid_35394",
        "sid_35577",
        "sid_35843",
        "sid_36047",
        "sid_36066",
        "sid_36064",
        "sid_36092",
    ]
]

y = dataset["sid_35606"]

imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)

imputer_path = "./artifacts/imputer.pkl"
with open(imputer_path, "wb") as f:
    pickle.dump(imputer, f)

In [6]:
os.environ['AWS_ACCESS_KEY_ID'] = 'test'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'test'
os.environ['AWS_DEFAULT_REGION'] = 'us-east-1'
os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:4566'

In [7]:
class ImputerAndXGBoost(mlflow.pyfunc.PythonModel):
    def load_context(self, context):
        # Load the KNN imputer
        with open(context.artifacts["imputer"], "rb") as f:
            self.imputer = pickle.load(f)
        # Load the XGBoost model
        self.xgboost_model = xgb.Booster()
        self.xgboost_model.load_model(context.artifacts["xgboost_model"])
        
    def predict(self, context, model_input):
        # Impute missing values
        imputed = self.imputer.transform(model_input)
        dmatrix = xgb.DMatrix(imputed)
        # Make predictions
        return self.xgboost_model.predict(dmatrix)

In [8]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("local_testing")

dtrain = xgb.DMatrix(X_imputed, label=y)

def objective(trial):
    param = {
        "verbosity": 0,
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "validate_parameters": True,
        "objective": "reg:squarederror",
        "tree_method": "auto",
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
        "n_estimators": trial.suggest_int("n_estimators", 100, 300),
    }

    if param["booster"] in ["gbtree", "dart"]:
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    
    with mlflow.start_run():
        mlflow.log_params(param)
        
        # Cross-validation
        cv_results = xgb.cv(
            dtrain=dtrain,
            params=param,
            nfold=5,  # Number of folds
            num_boost_round=200,  # Maximum number of boosting rounds
            early_stopping_rounds=10,  # Stop if no improvement after these rounds
            metrics='rmse',  # Metric to evaluate
            seed=1020
        )
        
        # Extract the best RMSE and corresponding number of rounds
        best_rmse = cv_results['test-rmse-mean'].min()
        best_rounds = cv_results['test-rmse-mean'].idxmin()
        
        mlflow.log_metric("best_rmse", best_rmse)
        mlflow.log_metric("best_rounds", best_rounds)

        # Train the model with the best number of rounds on the full training data
        final_model = xgb.train(param, dtrain, num_boost_round=best_rounds)

        xgboost_model_path = "xgboost_model.json"
        final_model.save_model(xgboost_model_path)

        # Log the XGBoost model and the imputer as artifacts
        mlflow.pyfunc.log_model(
            artifact_path="model",
            python_model=ImputerAndXGBoost(),
            artifacts={
                "imputer": imputer_path,
                "xgboost_model": xgboost_model_path,
            }
        )

    return best_rmse


2024/08/01 15:12:24 INFO mlflow.tracking.fluent: Experiment with name 'local_testing' does not exist. Creating a new experiment.


In [9]:
optuna.logging.set_verbosity(optuna.logging.WARNING)

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=1, timeout=600)

print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value:.4f}")
print("  Parameters: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Best trial:
  RMSE: 16.4755
  Parameters: 
    booster: gbtree
    lambda: 0.016276554931958193
    alpha: 4.991135228007095e-08
    subsample: 0.850066910470654
    colsample_bytree: 0.9711565765798214
    max_depth: 5
    n_estimators: 204
    eta: 9.999267214174526e-06
    min_child_weight: 3
    gamma: 3.779986942083264e-07
    grow_policy: depthwise


In [10]:
client = MlflowClient()

experiment = client.get_experiment_by_name("local_testing")

experiment_id = experiment.experiment_id

runs = client.search_runs(experiment_ids=experiment_id, order_by=["metrics.rmse"], max_results=1)

best_run = runs[0].to_dictionary()
best_hyperparameters = best_run['data']['params']

print("Best hyperparameters:")
for param_name, param_value in best_hyperparameters.items():
    print(f"{param_name}: {param_value}")

Best hyperparameters:
verbosity: 0
booster: gbtree
validate_parameters: True
objective: reg:squarederror
tree_method: auto
lambda: 0.016276554931958193
alpha: 4.991135228007095e-08
subsample: 0.850066910470654
colsample_bytree: 0.9711565765798214
max_depth: 5
n_estimators: 204
eta: 9.999267214174526e-06
min_child_weight: 3
gamma: 3.779986942083264e-07
grow_policy: depthwise


In [11]:
model_name = "openaq-medellin-35606-xgboost-imputer"
    
client.create_registered_model(model_name)


<RegisteredModel: aliases={}, creation_timestamp=1722543151016, description='', last_updated_timestamp=1722543151016, latest_versions=[], name='openaq-medellin-35606-xgboost-imputer', tags={}>

In [12]:
client.set_registered_model_tag(model_name, "task", "regression")

In [13]:
s3_bucket_name = "mlflow"

result = client.create_model_version(
    name=model_name,
    # on log_model we set artifact_path="model"
    source=f"s3://{s3_bucket_name}/{best_run['info']['experiment_id']}/{best_run['info']['run_id']}/artifacts/model", 
    run_id=best_run['info']['run_id'],
)

2024/08/01 15:12:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: openaq-medellin-35606-xgboost-imputer, version 1


In [14]:
client.set_registered_model_alias(model_name, "champion", result.version)
client.set_model_version_tag(model_name, result.version, "validation_status", "approved")

In [15]:
client.get_model_version_by_alias(model_name, "champion")

<ModelVersion: aliases=['champion'], creation_timestamp=1722543151055, current_stage='None', description='', last_updated_timestamp=1722543151055, name='openaq-medellin-35606-xgboost-imputer', run_id='3e60c7fad8a7435c91836e63efac28c3', run_link='', source='s3://mlflow/1/3e60c7fad8a7435c91836e63efac28c3/artifacts/model', status='READY', status_message='', tags={'validation_status': 'approved'}, user_id='', version='1'>

In [16]:
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{result.version}")

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [18]:
model_uri = f"models:/{model_name}@champion"
model = mlflow.pyfunc.load_model(model_uri)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [23]:
model.predict(X.iloc[[1,2]])

array([28.26352, 34.16879], dtype=float32)

In [8]:
X.columns

Index(['sid_20466', 'sid_34845', 'sid_34841', 'sid_35394', 'sid_35577',
       'sid_35843', 'sid_36047', 'sid_36066', 'sid_36064', 'sid_36092'],
      dtype='object')

In [9]:
y

0        26.0
1        55.0
2        39.0
3        33.0
4        31.0
         ... 
16671    24.0
16672    23.0
16673    19.0
16674     9.0
16675     8.0
Name: sid_35606, Length: 15330, dtype: float64