In [1]:
import pandas as pd
import datetime as dt
from datetime import timedelta

import geopandas as gpd
from pyathena import connect
import requests

```
s3://openaq-data-archive/records/csv.gz/
├─ year=2025/
│  ├─ month=10/
│  │  ├─ locationid=2178/
│  │  ├─ locationid=827/
│  │  └─ ...
│  └─ month=11/...
└─ year=2024/...
```

Example file path:

` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

In [2]:
pm25_gdf = gpd.read_file(r'../Data/Outputs/pm25.geojson')

In [3]:
pm25_gdf.head()

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
0,847,South Long Beach,1502,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.792221,-118.175278,2016-03-06 20:00:00+00:00,2022-05-05 22:00:00+00:00,pm25 µg/m³,POINT (-118.17528 33.79222)
1,1042,Piru - Pacific,22301,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.4044,-118.81,2016-03-06 20:00:00+00:00,2020-06-10 21:00:00+00:00,pm25 µg/m³,POINT (-118.81 34.4044)
2,1200,Glendora - Laurel,2150,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1439,-117.8508,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-117.8508 34.1439)
3,1310,Lancaster-Division,2362,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.669589,-118.130689,2016-03-30 06:00:00+00:00,2022-10-04 18:00:00+00:00,pm25 µg/m³,POINT (-118.13069 34.66959)
4,1575,Los Angeles - N. Mai,2775,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.0669,-118.2417,2016-03-06 20:00:00+00:00,2017-06-07 21:00:00+00:00,pm25 µg/m³,POINT (-118.2417 34.0669)


In [4]:
pm25_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   station_id             415 non-null    int32              
 1   name                   415 non-null    object             
 2   sensor_id              415 non-null    int32              
 3   sensor_display_name    415 non-null    object             
 4   timezone               415 non-null    object             
 5   is_mobile              415 non-null    bool               
 6   is_monitor             415 non-null    bool               
 7   owner.id               415 non-null    int32              
 8   owner.name             415 non-null    object             
 9   provider.id            415 non-null    int32              
 10  provider.name          415 non-null    object             
 11  coordinates.latitude   415 non-null    float64    

In [5]:
station_IDs = list(pm25_gdf['station_id'])
station_IDs[:10]

[847, 1042, 1200, 1310, 1575, 1585, 1681, 1902, 1948, 1989]

## Pulling in a few entries manually

In [6]:
def download_file(url, save_path):
    response = requests.get(url, stream=True)
    
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f'{url} downloaded')
    else:
        print(f'{url} failed')

### Downloading the example URL

In [7]:
url = "https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz"

In [8]:
download_file(url, r'../Data/Outputs/test.csv.gz')

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz downloaded


In [9]:
test_download = pd.read_csv(r'../Data/Outputs/test.csv.gz')

In [10]:
test_download.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,2178,3919,Del Norte-2178,2022-05-03T01:00:00-06:00,35.1353,-106.584702,pm10,µg/m³,31.0
1,2178,3919,Del Norte-2178,2022-05-03T02:00:00-06:00,35.1353,-106.584702,pm10,µg/m³,22.0
2,2178,3919,Del Norte-2178,2022-05-03T03:00:00-06:00,35.1353,-106.584702,pm10,µg/m³,29.0
3,2178,3919,Del Norte-2178,2022-05-03T04:00:00-06:00,35.1353,-106.584702,pm10,µg/m³,30.0
4,2178,3919,Del Norte-2178,2022-05-03T05:00:00-06:00,35.1353,-106.584702,pm10,µg/m³,31.0


#### Downloading sample data from a station in `station_IDs`
I'll try this exact URL again, but using the first entry from my stations dataset

In [11]:
station_IDs[0]

847

In [12]:
stations_url = "https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2022/month=05/location-847-20220503.csv.gz"

In [13]:
download_file(stations_url, r'../Data/Outputs/stations_test.csv.gz')

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2022/month=05/location-847-20220503.csv.gz downloaded


In [14]:
stations_test = pd.read_csv(r'../Data/Outputs/stations_test.csv.gz')

In [15]:
stations_test.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,847,1502,South Long Beach-847,2022-05-03T01:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,13.1
1,847,1502,South Long Beach-847,2022-05-03T02:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,12.9
2,847,1502,South Long Beach-847,2022-05-03T03:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,8.8
3,847,1502,South Long Beach-847,2022-05-03T04:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,10.0
4,847,1502,South Long Beach-847,2022-05-03T05:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,11.7


This looks good.

In this case, we only see PM2.5 readings - however, `location_id` is a station-level identifier, and each station can have a variety of sensors. This means it could be possible for other pollutant data to be pulled.

However, the partition schema:

`/records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

Does not allow us to query based on sensor type. Our list of stations should gurarantee that we pull stations which have PM2.5 readings, but will not gurarantee we *only* pull these readings.

#### Recent data
I'll try pulling data from same station as above, but from 2025, rather than 2022:

In [16]:
recent_stations_url = "https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2025/month=05/location-847-20250503.csv.gz"
download_file(recent_stations_url, r'../Data/Outputs/recent_stations_test.csv.gz')

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2025/month=05/location-847-20250503.csv.gz failed


In [17]:
recent_stations_url = "https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2023/month=05/location-847-20230503.csv.gz"
download_file(recent_stations_url, r'../Data/Outputs/recent_stations_test.csv.gz')

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=847/year=2023/month=05/location-847-20230503.csv.gz failed


In [18]:
pm25_gdf.iloc[0]

station_id                                             847
name                                      South Long Beach
sensor_id                                             1502
sensor_display_name                                  PM2.5
timezone                               America/Los_Angeles
is_mobile                                            False
is_monitor                                            True
owner.id                                                 4
owner.name               Unknown Governmental Organization
provider.id                                            119
provider.name                                       AirNow
coordinates.latitude                             33.792221
coordinates.longitude                          -118.175278
datetime_first.utc               2016-03-06 20:00:00+00:00
datetime_last.utc                2022-05-05 22:00:00+00:00
sensor_type                                     pm25 µg/m³
geometry                     POINT (-118.175278 33.79222

It actually makes sense that this download failed: the station has not had entries since 2022:

In [19]:
pm25_gdf.iloc[0]['datetime_last.utc']

Timestamp('2022-05-05 22:00:00+0000', tz='UTC')

I will need to create a subset of PM2.5 stations with modern entries so we don't have to attempt to pull in non-exitent readings - for my purposes, it isn't going to be useful to have data older than one week.

In [20]:
pm25_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   station_id             415 non-null    int32              
 1   name                   415 non-null    object             
 2   sensor_id              415 non-null    int32              
 3   sensor_display_name    415 non-null    object             
 4   timezone               415 non-null    object             
 5   is_mobile              415 non-null    bool               
 6   is_monitor             415 non-null    bool               
 7   owner.id               415 non-null    int32              
 8   owner.name             415 non-null    object             
 9   provider.id            415 non-null    int32              
 10  provider.name          415 non-null    object             
 11  coordinates.latitude   415 non-null    float64    

In [21]:
pm25_gdf.head()

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
0,847,South Long Beach,1502,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.792221,-118.175278,2016-03-06 20:00:00+00:00,2022-05-05 22:00:00+00:00,pm25 µg/m³,POINT (-118.17528 33.79222)
1,1042,Piru - Pacific,22301,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.4044,-118.81,2016-03-06 20:00:00+00:00,2020-06-10 21:00:00+00:00,pm25 µg/m³,POINT (-118.81 34.4044)
2,1200,Glendora - Laurel,2150,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1439,-117.8508,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-117.8508 34.1439)
3,1310,Lancaster-Division,2362,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.669589,-118.130689,2016-03-30 06:00:00+00:00,2022-10-04 18:00:00+00:00,pm25 µg/m³,POINT (-118.13069 34.66959)
4,1575,Los Angeles - N. Mai,2775,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.0669,-118.2417,2016-03-06 20:00:00+00:00,2017-06-07 21:00:00+00:00,pm25 µg/m³,POINT (-118.2417 34.0669)


In [22]:
recent_stations_test = pd.read_csv(r'../Data/Outputs/recent_stations_test.csv.gz')

In [23]:
recent_stations_test.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,847,1502,South Long Beach-847,2022-05-03T01:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,13.1
1,847,1502,South Long Beach-847,2022-05-03T02:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,12.9
2,847,1502,South Long Beach-847,2022-05-03T03:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,8.8
3,847,1502,South Long Beach-847,2022-05-03T04:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,10.0
4,847,1502,South Long Beach-847,2022-05-03T05:00:00-07:00,33.792221,-118.175278,pm25,µg/m³,11.7


### Creating recent dataset

In [24]:
dt.datetime.now()

datetime.datetime(2025, 11, 9, 11, 16, 49, 186467)

I can find stations which have posted records in the past week:

In [25]:
delta = timedelta(days=7)

In order to compare my `datetime_last.utc` and the current time, I can't just use dt.datetime.now() on its own - I need to specify the timezone via pd.Timestamp():

In [26]:
week_ago = pd.Timestamp(dt.datetime.now(), tz='UTC') - delta
week_ago

Timestamp('2025-11-02 11:16:49.222001+0000', tz='UTC')

In [27]:
pm25_gdf['datetime_last.utc'][:5]

0   2022-05-05 22:00:00+00:00
1   2020-06-10 21:00:00+00:00
2   2025-11-09 18:00:00+00:00
3   2022-10-04 18:00:00+00:00
4   2017-06-07 21:00:00+00:00
Name: datetime_last.utc, dtype: datetime64[ms, UTC]

This will find the five stations with the oldest entries out of those which have published in the past week

In [28]:
pm25_gdf[pm25_gdf['datetime_last.utc'] > (week_ago)].sort_values(by='datetime_last.utc',ascending=True)[:5]

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
365,3930632,Father's Office,12661032,PM2.5,America/Los_Angeles,False,False,11509,Unknown Person,66,AirGradient,34.047306,-118.23506,2025-04-02 16:00:00+00:00,2025-11-03 02:00:00+00:00,pm25 µg/m³,POINT (-118.23506 34.04731)
380,5276665,Lago Vista Drive,14153323,PM2.5,America/Los_Angeles,False,False,12,AirGradient,66,AirGradient,34.097234,-118.409102,2025-09-19 00:00:00+00:00,2025-11-04 19:00:00+00:00,pm25 µg/m³,POINT (-118.4091 34.09723)
181,947273,Ambler ES (2089),2000911,PM2.5,America/Los_Angeles,False,False,6,Unknown Community Organization,166,Clarity,33.87823,-118.27078,2022-02-18 01:53:32+00:00,2025-11-09 17:30:19+00:00,pm25 µg/m³,POINT (-118.27078 33.87823)
408,5831927,Villa Intermediate,14149764,PM2.5,America/Los_Angeles,False,False,9,Clarity,166,Clarity,33.74343,-117.84841,2025-09-18 17:12:41+00:00,2025-11-09 17:51:36+00:00,pm25 µg/m³,POINT (-117.84841 33.74343)
292,2812577,Sherman Oaks,8966929,PM2.5,America/Los_Angeles,False,False,9,Clarity,166,Clarity,34.13053,-118.45947,2024-04-24 16:00:00+00:00,2025-11-09 17:53:27+00:00,pm25 µg/m³,POINT (-118.45947 34.13053)


In [29]:
pm25_gdf[pm25_gdf['datetime_last.utc'] > (week_ago)].sort_values(by='datetime_last.utc',ascending=True).iloc[0]

station_id                                                    3930632
name                                                  Father's Office
sensor_id                                                    12661032
sensor_display_name                                             PM2.5
timezone                                          America/Los_Angeles
is_mobile                                                       False
is_monitor                                                      False
owner.id                                                        11509
owner.name                                             Unknown Person
provider.id                                                        66
provider.name                                             AirGradient
coordinates.latitude                                        34.047306
coordinates.longitude                                      -118.23506
datetime_first.utc                          2025-04-02 16:00:00+00:00
datetime_last.utc   

In [30]:
pm25_gdf[pm25_gdf['datetime_last.utc'] > (week_ago)].sort_values(by='datetime_last.utc',ascending=True).iloc[0]['datetime_last.utc']

Timestamp('2025-11-03 02:00:00+0000', tz='UTC')

The most out-of-date station at the time of checking (seen above) published two days ago:

In [31]:
pd.Timestamp(dt.datetime.now(), tz='UTC')

Timestamp('2025-11-09 11:16:49.359680+0000', tz='UTC')

### Creating subset of dataset with modern readings

In the script version, I will implement all of this dynamically so that we can have an up-to-date list of stations which are actively posting. For the sake of EDA, I'll create this list and then practice with Athena

Creating a dataset with only entries from the past week:

In [32]:
recent_pm25 = pm25_gdf[pm25_gdf['datetime_last.utc'] > (week_ago)]

In [33]:
recent_pm25.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 310 entries, 2 to 414
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   station_id             310 non-null    int32              
 1   name                   310 non-null    object             
 2   sensor_id              310 non-null    int32              
 3   sensor_display_name    310 non-null    object             
 4   timezone               310 non-null    object             
 5   is_mobile              310 non-null    bool               
 6   is_monitor             310 non-null    bool               
 7   owner.id               310 non-null    int32              
 8   owner.name             310 non-null    object             
 9   provider.id            310 non-null    int32              
 10  provider.name          310 non-null    object             
 11  coordinates.latitude   310 non-null    float64         

In [34]:
recent_pm25.head()

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
2,1200,Glendora - Laurel,2150,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1439,-117.8508,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-117.8508 34.1439)
8,1948,EBAM 11,25551,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.90144,-118.20502,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.20502 33.90144)
9,1989,Santa Clarita,3523,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.3833,-118.5283,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.5283 34.3833)
10,2138,Reseda,3842,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1992,-118.5331,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.5331 34.1992)
11,5791,EBAM-4,15731,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.85966,-118.2007,2016-11-15 21:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.2007 33.85966)


In [35]:
recent_pm25.to_file('../Data/Outputs/recents_pm25.geojson', driver='GeoJSON')

In [36]:
1/0

ZeroDivisionError: division by zero

https://stackoverflow.com/questions/339007/how-do-i-pad-a-string-with-zeros#339013

In [None]:
def pull_records(location_id, year, month, day):
    day = float(str(day).zfill(2))
    url = f"https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid={location_id}/year={year}/month={month}/location-{location_id}-{year}{month}{day}.csv.gz"
    download_file(url,f"../Data/Outputs/station_{location_id}.csv.gz")
    
    return pd.read_csv(f"../Data/Outputs/station_{location_id}.csv.gz")

In [None]:
station_IDs[1]

In [None]:
pull_records(station_IDs[3],2024,10,3)

In [None]:
station_IDs[0]

In [None]:
location_id = station_IDs[1]
year = 2025
month = 10

In [None]:
url = f"https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid={location_id}/year={year}/month={month}/location-{location_id}-{year}{month}01.csv.gz"

In [None]:
download_file(url,f"../Data/Outputs/station_{location_id}.csv.gz")

In [None]:
station_847 = pd.read_csv("../Data/Outputs/station_847.csv.gz")

In [None]:
station_1502.head()

In [None]:
pm2_5_df[pm2_5_df['station_id'] == 1502]

In [None]:
cursor = connect(
    s3_staging_dir="s3://la-openaq-athena-results-us-east-1/temp/",
    region_name="us-east-1"
).cursor()

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [None]:
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")