# Data Generation
This notebook fetches air-quality measurements from OpenAQ and generates the CSV files needed to build the training pipeline and to simulate production data.

In [1]:
import pandas as pd

from datetime import timedelta, datetime
from openaq import OpenAQ
from sklearn.preprocessing import StandardScaler

In [2]:
# Fractions of the filtered measurement rows assigned to each dataset split.
# The train, validation and test fractions are summed to size the training
# CSV; every remaining row goes to the production CSV.
TRAIN_SPLIT = 0.4
VAL_SPLIT = 0.1
TEST_SPLIT = 0.1
PROD_SPLIT = 0.4  # not referenced in this notebook section; production receives the remainder

# NOTE(review): presumably the model's input window length and forecast
# horizon, consumed by a later pipeline stage -- not used in this section, confirm.
SEQ_LEN = 20
PRED_LEN = 1

In [3]:
# Parameter name to keep; rows are later filtered on the "parameter" column,
# which is filled from measurement.parameter.name.
TARGET_PARAM = "pm25"
# Maps a location's display name to the numeric ID used for the OpenAQ
# location lookup.
LOCATIONS_DICT = {
    "Canyon ES (2795)": 947312,
    "Pacific Palisades ES (5959)": 947232,
    "Revere MS (8356)": 947280,
    "Brentwood Sci Mag ES (2507)": 947305,
}

In [4]:
# Put your OpenAQ API key in the text file "api_keys/openaq_api_key.txt"
with open("api_keys/openaq_api_key.txt", "r", encoding="utf-8") as file:
    # Editors usually append a trailing newline; strip it (and any other
    # surrounding whitespace) so the key is sent to the API exactly as issued.
    API_KEY = file.read().strip()

# Initialize the OpenAQ client
client = OpenAQ(api_key=API_KEY)

In [5]:
def fetch_sensors_by_list(sensor_list):
    """Fetch the measurements of every sensor at the given OpenAQ locations.

    Uses the module-level ``client`` to look up each location, then requests
    the measurements of each of its sensors and flattens them into rows.

    Args:
        sensor_list: Iterable of OpenAQ location IDs. Despite the name, each
            entry is passed to ``client.locations.get``, not used as a
            sensor ID.

    Returns:
        A ``pandas.DataFrame`` with one row per measurement, indexed by a
        running ``measurement_id`` that starts at 1. The columns listed in
        ``columns`` below are always present, even when nothing was fetched.
    """
    format_string = "%Y-%m-%dT%H:%M:%SZ"
    columns = [
        "measurement_id",
        "sensor_id",
        "location_id",
        "location",
        "latitude",
        "longitude",
        "epoch",
        "duration",
        "parameter",
        "value",
        "units",
    ]
    data = {}
    m_id = 0
    # For each requested location, fetch its sensors
    for requested_id in sensor_list:
        location = client.locations.get(requested_id).results[0]
        print(f"""Fetching {location.name} data""")

        # Location-level fields are identical for every sensor of the
        # location, so read them once per location.
        lat = location.coordinates.latitude
        long = location.coordinates.longitude
        loc_name = location.name
        location_id = location.id

        for sensor in location.sensors:
            sensor_id = sensor.id

            # Fetch the measurements of this sensor (a single request with
            # the client's default arguments).
            measurements = client.measurements.list(sensor_id)

            # For each measurement, record the relevant data
            for measurement in measurements.results:
                m_id += 1
                epoch = datetime.strptime(measurement.period.datetime_from.utc, format_string)
                # total_seconds() keeps the whole interval; Timedelta.seconds
                # would discard the days component, so a 24-hour interval
                # would be recorded as a zero duration.
                duration = timedelta(
                    seconds=pd.to_timedelta(measurement.period.interval).total_seconds()
                )
                parameter = measurement.parameter.name
                value = measurement.value
                units = measurement.parameter.units

                data[m_id] = {
                    "measurement_id": m_id,
                    "sensor_id": sensor_id,
                    "location_id": location_id,
                    "location": loc_name,
                    "latitude": lat,
                    "longitude": long,
                    "epoch": epoch,
                    "duration": duration,
                    "parameter": parameter,
                    "value": value,
                    "units": units,
                }

    if not data:
        # Keep the schema stable so callers can still filter on columns such
        # as "location_id" when no measurement was returned.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame.from_dict(data, orient="index")

In [6]:
# Get only the first location for time series analysis
location_index = LOCATIONS_DICT["Canyon ES (2795)"]  # numeric location ID, not a positional index
# fetch_sensors_by_list iterates over its argument, so the single ID is wrapped in a list
df = fetch_sensors_by_list([location_index])

Fetching Canyon ES (2795) data


In [7]:
# Keep only the rows recorded at the selected location
location_mask = df["location_id"] == location_index
df_location = df.loc[location_mask]

# Of those rows, keep only the ones measuring the target parameter
param_mask = df_location["parameter"] == TARGET_PARAM
df_param = df_location.loc[param_mask]

In [8]:
# Row position where the training share (train + validation + test) ends
split_at = int(len(df_param) * (TRAIN_SPLIT + VAL_SPLIT + TEST_SPLIT))

# Rows before the cut-off make up the training file
train_data = df_param.iloc[:split_at].reset_index(drop=True)

# Rows from the cut-off onward are used to simulate production
prod_data = df_param.iloc[split_at:].reset_index(drop=True)

In [9]:
# Standardize the "value" column: learn mean and variance from the training
# rows, then apply the same scaling to both the training and production rows.
# NOTE(review): the scaler is fitted on all of train_data, which also holds
# the validation and test share -- confirm this is intended.
scaler = StandardScaler()

train_values = train_data["value"].to_numpy().reshape(-1, 1)
prod_values = prod_data["value"].to_numpy().reshape(-1, 1)

scaler.fit(train_values)
train_data.loc[:, "value"] = scaler.transform(train_values)
prod_data.loc[:, "value"] = scaler.transform(prod_values)

In [10]:
# Save the data to csv
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
output_dir = Path("data")
output_dir.mkdir(parents=True, exist_ok=True)

train_data.to_csv(output_dir / "sensor_data.csv", index=False)
prod_data.to_csv(output_dir / "production_data.csv", index=False)