In [2]:
import pandas as pd

import geopandas as gpd

import requests

import datetime as dt
from pyathena import connect

Expected units:

    location_id INT,
    sensors_id INT,
    location STRING,
    datetime STRING,
    lat DOUBLE,
    lon DOUBLE,
    parameter STRING,
    units STRING,
    value DOUBLE

```
s3://openaq-data-archive/records/csv.gz/
├─ year=2025/
│  ├─ month=10/
│  │  ├─ locationid=2178/
│  │  ├─ locationid=827/
│  │  └─ ...
│  └─ month=11/...
└─ year=2024/...
```

Example file path:

` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

In [4]:
recents = gpd.read_file(r'../../Data/Outputs/recents_pm25.geojson')
recents.head()

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
0,1200,Glendora - Laurel,2150,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1439,-117.8508,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-117.8508 34.1439)
1,1948,EBAM 11,25551,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.90144,-118.20502,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.20502 33.90144)
2,1989,Santa Clarita,3523,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.3833,-118.5283,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.5283 34.3833)
3,2138,Reseda,3842,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1992,-118.5331,2016-03-06 20:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.5331 34.1992)
4,5791,EBAM-4,15731,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.85966,-118.2007,2016-11-15 21:00:00+00:00,2025-11-09 18:00:00+00:00,pm25 µg/m³,POINT (-118.2007 33.85966)


In [4]:
recents.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   station_id             310 non-null    int32              
 1   name                   310 non-null    object             
 2   sensor_id              310 non-null    int32              
 3   sensor_display_name    310 non-null    object             
 4   timezone               310 non-null    object             
 5   is_mobile              310 non-null    bool               
 6   is_monitor             310 non-null    bool               
 7   owner.id               310 non-null    int32              
 8   owner.name             310 non-null    object             
 9   provider.id            310 non-null    int32              
 10  provider.name          310 non-null    object             
 11  coordinates.latitude   310 non-null    float64    

In [5]:
recents.iloc[0]

station_id                                            1200
name                                     Glendora - Laurel
sensor_id                                             2150
sensor_display_name                                  PM2.5
timezone                               America/Los_Angeles
is_mobile                                            False
is_monitor                                            True
owner.id                                                 4
owner.name               Unknown Governmental Organization
provider.id                                            119
provider.name                                       AirNow
coordinates.latitude                               34.1439
coordinates.longitude                            -117.8508
datetime_first.utc               2016-03-06 20:00:00+00:00
datetime_last.utc                2025-11-09 18:00:00+00:00
sensor_type                                     pm25 µg/m³
geometry                         POINT (-117.8508 34.143

In [6]:
recents.iloc[0]['datetime_last.utc']

Timestamp('2025-11-09 18:00:00+0000', tz='UTC')

### Manually pulling in data from S3:

In [7]:
def download_file(url, save_path):
    response = requests.get(url, stream=True)
    
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f'{url} downloaded')
    else:
        print(f'{url} failed')

In [8]:
url = "https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=1200/year=2025/month=10/location-1200-20251010.csv.gz"

In [9]:
download_file(url,"../Data/Outputs/station_1200.csv.gz")

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=1200/year=2025/month=10/location-1200-20251010.csv.gz downloaded


In [10]:
station_1200 = pd.read_csv(r'../Data/Outputs/station_1200.csv.gz')

In [11]:
station_1200.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,1200,2152,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,17.0
1,1200,2152,Glendora - Laurel-1200,2025-10-10T02:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,26.0
2,1200,2152,Glendora - Laurel-1200,2025-10-10T03:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,19.0
3,1200,2152,Glendora - Laurel-1200,2025-10-10T04:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,24.0
4,1200,2152,Glendora - Laurel-1200,2025-10-10T05:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,23.0


In [12]:
station_1200.iloc[0]

location_id                         1200
sensors_id                          2152
location          Glendora - Laurel-1200
datetime       2025-10-10T01:00:00-07:00
lat                              34.1439
lon                            -117.8508
parameter                           pm10
units                              µg/m³
value                               17.0
Name: 0, dtype: object

### Finding the same data via Athena

In [7]:
cursor = connect(
    s3_staging_dir="s3://la-openaq-athena-results-us-east-1/temp/",
    region_name="us-east-1"
).cursor()

In [14]:
cursor.execute("DROP TABLE IF EXISTS openaq_db.openaqMeasurements;")

<pyathena.cursor.Cursor at 0x7f478bf52270>

In [15]:
create_table = """
CREATE EXTERNAL TABLE openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""
cursor.execute(create_table)

<pyathena.cursor.Cursor at 0x7f478bf52270>

In [16]:
cursor.execute("""
ALTER TABLE openaq_db.openaqMeasurements ADD
PARTITION (year='2025', month='10', locationid='1200')
LOCATION 's3://openaq-data-archive/records/csv.gz/locationid=1200/year=2025/month=09/';
""")

<pyathena.cursor.Cursor at 0x7f478bf52270>

In [17]:
cursor.execute("SHOW PARTITIONS openaq_db.openaqMeasurements;")
print(cursor.fetchall())

[('locationid=1200/year=2025/month=10',)]


In [18]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE location_id = 1200
ORDER BY datetime DESC
LIMIT 4;
""")
print(cursor.fetchall())

[(1200, 2150, 'Glendora - Laurel-1200', '2025-10-01T00:00:00-07:00', 34.1439, -117.85080000000002, 'pm25', 'µg/m³', 8.2, '1200', '2025', '10'), (1200, 24901, 'Glendora - Laurel-1200', '2025-10-01T00:00:00-07:00', 34.1439, -117.85080000000002, 'no2', 'ppm', 0.0066, '1200', '2025', '10'), (1200, 24900, 'Glendora - Laurel-1200', '2025-10-01T00:00:00-07:00', 34.1439, -117.85080000000002, 'co', 'ppm', 0.2, '1200', '2025', '10'), (1200, 2154, 'Glendora - Laurel-1200', '2025-10-01T00:00:00-07:00', 34.1439, -117.85080000000002, 'o3', 'ppm', 0.024, '1200', '2025', '10')]


In [19]:
today = 2
today.zfill(2)

AttributeError: 'int' object has no attribute 'zfill'

In [55]:
station_1200.sort_values(by = 'datetime', ascending = True)

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,1200,2152,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,pm10,µg/m³,17.0000
120,1200,4272387,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,no,ppm,0.0006
48,1200,24901,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,no2,ppm,0.0168
24,1200,2150,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,pm25,µg/m³,9.9000
144,1200,4272396,Glendora - Laurel-1200,2025-10-10T01:00:00-07:00,34.1439,-117.8508,nox,ppm,0.0175
...,...,...,...,...,...,...,...,...,...
71,1200,24901,Glendora - Laurel-1200,2025-10-11T00:00:00-07:00,34.1439,-117.8508,no2,ppm,0.0159
119,1200,2154,Glendora - Laurel-1200,2025-10-11T00:00:00-07:00,34.1439,-117.8508,o3,ppm,0.0160
47,1200,2150,Glendora - Laurel-1200,2025-10-11T00:00:00-07:00,34.1439,-117.8508,pm25,µg/m³,11.1000
95,1200,24900,Glendora - Laurel-1200,2025-10-11T00:00:00-07:00,34.1439,-117.8508,co,ppm,0.3000
