In [3]:
# Data Retrieval and Handling
import requests
import os
import pandas as pd
from dotenv import load_dotenv

# Time and Date Manipulation
import time
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta

# Progress Tracking
from tqdm import tqdm

# Utility Functions and Miscellaneous
import urllib.parse

from sklearn.metrics import root_mean_squared_error

import mlflow
import mlflow.pyfunc

# Load settings from the local ./.env file before any cell reads os.environ
# (OPENAQ_API_KEY and MLFLOW_HOST are read from the environment further down).
load_dotenv(dotenv_path='./.env', override=True)

True

In [2]:
# --- OpenAQ API access -------------------------------------------------------
base_url = "https://api.openaq.org/v3/"
headers = {
    "accept": "application/json",
    # The key comes from the environment so it never appears in the notebook.
    "X-API-Key": os.environ['OPENAQ_API_KEY'],
}

# --- Sensors -----------------------------------------------------------------
# x_sensors supply the model inputs; y_sensor is the sensor whose readings are
# compared against the model's predictions further down.
x_sensors = [
    20466, 34845, 34841, 35394, 35577,
    35843, 36047, 36066, 36064, 36092,
]
y_sensor = [35606]
sensor_ids = x_sensors + y_sensor

# --- Time window (UTC) -------------------------------------------------------
start_date = datetime(2024, 7, 1, tzinfo=timezone.utc)
end_date = datetime(2024, 7, 31, tzinfo=timezone.utc)


def generate_url(sensor_id, start, end, limit=1000, page=1):
    """Build the OpenAQ v3 hourly-measurements URL for one sensor.

    Args:
        sensor_id: Sensor identifier interpolated into the URL path.
        start: Object exposing ``isoformat()`` (e.g. a ``datetime``); sent as
            the ``date_from`` query parameter.
        end: Object exposing ``isoformat()``; sent as the ``date_to`` query
            parameter.
        limit: Maximum number of results requested per page.
        page: Page number to request. Defaults to 1, the value that used to be
            hard-coded, so existing callers get the same URL as before.

    Returns:
        str: The endpoint URL followed by the URL-encoded query string.
    """
    # Named `endpoint` so it no longer shadows the module-level `base_url`.
    endpoint = f"https://api.openaq.org/v3/sensors/{sensor_id}/measurements"

    params = {
        "period_name": "hour",
        "date_from": start.isoformat(),
        "date_to": end.isoformat(),
        "limit": limit,
        "page": page,
    }

    # urlencode percent-escapes the ':' and '+' characters found in ISO timestamps.
    return f"{endpoint}?{urllib.parse.urlencode(params)}"


# Flat list of measurement records; the per-sensor fetch loop below extends it.
all_data = []


def fetch_sensor_data(sensor_id, max_retries=10, retry_delay=30, timeout=60):
    """Download hourly measurements for one sensor, one month per request.

    Walks every month start between the module-level ``start_date`` and
    ``end_date`` and requests the window from that month start to one month
    later, sending the module-level ``headers`` with each request.

    Responses with status 429, 403 or 408, and requests that time out, are
    retried after sleeping ``retry_delay`` seconds. Retries are capped at
    ``max_retries`` per month so a failure that never clears (for example a
    request that stays forbidden) can no longer block the notebook forever.
    Any other non-200 status is printed together with the URL and that month
    is skipped.

    Only the first page is requested for each month.

    Args:
        sensor_id: Sensor identifier; also used as the progress-bar label.
        max_retries: Maximum number of retries per month after the first try.
        retry_delay: Seconds to sleep before each retry.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        list[dict]: One ``{"sensor_id", "datetime", "value"}`` record per
        returned measurement, where ``datetime`` is the UTC start of the
        measurement period as delivered by the API.
    """
    retryable_statuses = {429, 403, 408}
    sensor_data = []

    date_range = pd.date_range(start_date, end_date, freq="MS")

    for current_date in tqdm(date_range, desc=f"{sensor_id:>10}"):
        month_end = current_date + relativedelta(months=1)
        # The URL does not change between retries, so build it once per month.
        url = generate_url(sensor_id, current_date, month_end)

        for attempt in range(max_retries + 1):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
            except requests.exceptions.Timeout:
                # Treat a client-side timeout like a 408 from the server.
                response = None

            if response is None or response.status_code in retryable_statuses:
                # No point sleeping after the final attempt.
                if attempt < max_retries:
                    time.sleep(retry_delay)
                continue

            if response.status_code != 200:
                print(f"Error: {response.status_code}")
                print(url)
                break

            data = response.json()
            # An empty or missing result list simply contributes no records.
            for item in data["results"] or []:
                value = item["value"]
                utc_datetime = item["period"]["datetimeFrom"]["utc"]
                sensor_data.append(
                    {"sensor_id": sensor_id, "datetime": utc_datetime, "value": value}
                )
            break
        else:
            # Every attempt was retryable and the budget is exhausted.
            print(f"Error: giving up after {max_retries} retries")
            print(url)

    return sensor_data


# Pull the measurements of every sensor into the shared record list
for sensor_id in sensor_ids:
    all_data.extend(fetch_sensor_data(sensor_id))

# Long format: one row per (sensor, hour); keep a copy on disk
df = pd.DataFrame(all_data)
df.to_csv("inference_data.csv", index=False)

# Wide format: one row per timestamp and one "sid_<sensor id>" column per sensor,
# with the timestamp restored as an ordinary "datetime" column.
pivot_df = (
    df.pivot(index='datetime', columns='sensor_id', values='value')
    .add_prefix('sid_')
    .rename_axis(None, axis=1)
    .reset_index()
)
pivot_df.tail()


     20466: 100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
     34845: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
     34841: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
     35394: 100%|██████████| 1/1 [00:00<00:00,  1.43it/s]
     35577: 100%|██████████| 1/1 [00:01<00:00,  1.28s/it]
     35843: 100%|██████████| 1/1 [00:00<00:00,  1.45it/s]
     36047: 100%|██████████| 1/1 [00:00<00:00,  1.63it/s]
     36066: 100%|██████████| 1/1 [00:00<00:00,  2.05it/s]
     36064: 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
     36092: 100%|██████████| 1/1 [00:00<00:00,  1.60it/s]
     35606: 100%|██████████| 1/1 [00:00<00:00,  1.56it/s]


Unnamed: 0,datetime,sid_20466,sid_34841,sid_34845,sid_35394,sid_35577,sid_35606,sid_35843,sid_36047,sid_36064,sid_36066,sid_36092
739,2024-07-31T19:00:00+00:00,8.7,11.0,0.00054,0.019,7.4,22.0,32.0,7.4,17.0,38.0,9.8
740,2024-07-31T20:00:00+00:00,1.2,12.0,0.0021,,7.4,13.0,28.0,6.1,26.0,36.0,8.6
741,2024-07-31T21:00:00+00:00,,17.0,0.0022,0.019,7.4,11.0,20.0,15.0,6.0,33.0,11.0
742,2024-07-31T22:00:00+00:00,,11.0,0.0039,0.017,1.2,17.0,1.0,8.5,9.0,37.0,
743,2024-07-31T23:00:00+00:00,,18.0,0.0052,0.012,0.5,21.0,33.0,8.4,26.0,34.0,6.0


In [5]:
# Registered model to evaluate and the alias that selects which version to load
model_name = "openaq-medellin-35606-xgboost-imputer"
model_version_alias = "champion"

# Point the MLflow client at the tracking server named by MLFLOW_HOST, port 5000
mlflow.set_tracking_uri(f"http://{os.environ['MLFLOW_HOST']}:5000")

In [6]:
# Address the registered model as "models:/<name>@<alias>" and load it through
# MLflow's generic pyfunc interface.
# NOTE(review): the output saved with this cell shows a dependency-mismatch
# warning (lz4, psutil) followed by a TypeError — confirm the model loads on a
# fresh kernel before trusting the prediction cells that use `model`.
model_uri = f"models:/{model_name}@{model_version_alias}"
model = mlflow.pyfunc.load_model(model_uri)

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

 - lz4 (current: uninstalled, required: lz4==4.3.3)
 - psutil (current: 6.0.0, required: psutil==5.9.8)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


TypeError: code() argument 13 must be str, not int

In [16]:
# Feed the model the feature-sensor columns, selected by name in x_sensors order
feature_columns = [f"sid_{sensor}" for sensor in x_sensors]
preds = model.predict(pivot_df[feature_columns])

In [17]:
# Measured values of the target sensor (may contain NaN where it did not report)
truth = pivot_df[f"sid_{y_sensor[0]}"]

In [22]:
# Score only the hours where the target sensor actually reported a value.
# Replacing missing truth with 0 compared real predictions against fabricated
# zeros and biased the error. Arguments follow the (y_true, y_pred) order.
# NOTE(review): assumes preds is a NumPy array or pandas object aligned
# row-for-row with pivot_df — confirm against the model's output type.
observed = truth.notna().to_numpy()
root_mean_squared_error(truth[observed], preds[observed])

16.450133134925167

In [23]:
# Show the feature-sensor columns that were passed to the model
pivot_df.loc[:, [f"sid_{sensor}" for sensor in x_sensors]]

Unnamed: 0,sid_20466,sid_34845,sid_34841,sid_35394,sid_35577,sid_35843,sid_36047,sid_36066,sid_36064,sid_36092
0,11.0,0.01400,8.0,0.0049,8.2,16.0,8.2,,13.0,12.0
1,13.0,0.02400,9.0,0.0050,13.0,33.0,11.0,,27.0,16.0
2,9.5,0.02700,11.0,0.0020,9.4,21.0,14.0,,14.0,20.0
3,13.0,0.02300,17.0,0.0015,11.0,22.0,15.0,,18.0,23.0
4,7.1,0.02000,21.0,0.0012,9.4,32.0,19.0,,29.0,25.0
...,...,...,...,...,...,...,...,...,...,...
739,8.7,0.00054,11.0,0.0190,7.4,32.0,7.4,38.0,17.0,9.8
740,1.2,0.00210,12.0,,7.4,28.0,6.1,36.0,26.0,8.6
741,,0.00220,17.0,0.0190,7.4,20.0,15.0,33.0,6.0,11.0
742,,0.00390,11.0,0.0170,1.2,1.0,8.5,37.0,9.0,
