In [51]:
import pandas as pd

import geopandas as gpd

import requests

import datetime as dt
from pyathena import connect

In [4]:
recents_pm25 = gpd.read_file(r'../Data/Outputs/recents_pm25.geojson')

In [16]:
recents_pm25.head(3)

Unnamed: 0,station_id,name,sensor_id,sensor_display_name,timezone,is_mobile,is_monitor,owner.id,owner.name,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetime_first.utc,datetime_last.utc,sensor_type,geometry
0,1200,Glendora - Laurel,2150,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.1439,-117.8508,2016-03-06 20:00:00+00:00,2025-10-24 19:00:00+00:00,pm25 µg/m³,POINT (34.1439 -117.8508)
1,1948,EBAM 11,25551,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,33.90144,-118.20502,2016-03-06 20:00:00+00:00,2025-10-24 19:00:00+00:00,pm25 µg/m³,POINT (33.90144 -118.20502)
2,1989,Santa Clarita,3523,PM2.5,America/Los_Angeles,False,True,4,Unknown Governmental Organization,119,AirNow,34.3833,-118.5283,2016-03-06 20:00:00+00:00,2025-10-24 19:00:00+00:00,pm25 µg/m³,POINT (34.3833 -118.5283)


In [17]:
recents_pm25.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 310 entries, 0 to 309
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   station_id             310 non-null    int32              
 1   name                   310 non-null    object             
 2   sensor_id              310 non-null    int32              
 3   sensor_display_name    310 non-null    object             
 4   timezone               310 non-null    object             
 5   is_mobile              310 non-null    bool               
 6   is_monitor             310 non-null    bool               
 7   owner.id               310 non-null    int32              
 8   owner.name             310 non-null    object             
 9   provider.id            310 non-null    int32              
 10  provider.name          310 non-null    object             
 11  coordinates.latitude   310 non-null    float64    

```
s3://openaq-data-archive/records/csv.gz/
├─ year=2025/
│  ├─ month=10/
│  │  ├─ locationid=2178/
│  │  ├─ locationid=827/
│  │  └─ ...
│  └─ month=11/...
└─ year=2024/...
```

Example file path:

` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

In [9]:
cursor = connect(
    s3_staging_dir="s3://la-openaq-athena-results-us-east-1/temp/",
    region_name="us-east-1"
).cursor()

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [135]:
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")

<pyathena.cursor.Cursor at 0x7f75c4586270>

In [136]:
create_table_query = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

In [137]:
cursor.execute(create_table_query)

<pyathena.cursor.Cursor at 0x7f75c4586270>

In [146]:
cursor.execute("""
SELECT lat, lon, value, typeof(lat), typeof(lon), typeof(value)
FROM openaq_db.openaqMeasurements
WHERE locationid = '1989'
LIMIT 3;
"""
)
print(cursor.fetchall())

[(None, None, None, 'double', 'double', 'double'), (None, None, None, 'double', 'double', 'double'), (None, None, None, 'double', 'double', 'double')]


In [15]:
cursor.execute("SHOW PARTITIONS openaq_db.openaqMeasurements;")
print(cursor.fetchall())

[]


In [149]:
cursor.execute("""
SELECT *
FROM openaq_db.openaqMeasurements
WHERE locationid = '1989'
LIMIT 1;
"""
)
print(cursor.fetchall())

[(1989, 3523, '"Santa Clarita-1989"', '"2025-10-02T01:00:00-07:00"', None, None, '"pm25"', '"µg/m³"', None, '1989', '2025', '10')]


In [20]:
test_IDs = list(recents_pm25['station_id'][:3])
test_IDs

[1200, 1948, 1989]

In [26]:
now = pd.Timestamp(dt.datetime.now(), tz='UTC')

In [142]:
year = 2025
month = 10

In [143]:
for location in test_IDs:
    cursor.execute(f"""
    ALTER TABLE openaq_db.openaqMeasurements ADD
    PARTITION (year='{year}', month='{month}', locationid='{location}')
    LOCATION 's3://openaq-data-archive/records/csv.gz/locationid={location}/year={year}/month={month}/';
    """)

In [144]:
cursor.execute("SHOW PARTITIONS openaq_db.openaqMeasurements;")
print(cursor.fetchall())

[('locationid=1989/year=2025/month=10',), ('locationid=1948/year=2025/month=10',), ('locationid=1200/year=2025/month=10',)]


As expected, we see 3 partitions, each in October, 2025, corresponding to the three stations given

In [33]:
cursor.execute("SELECT * FROM openaq_db.openaqMeasurements LIMIT 1;")
print(cursor.fetchall())

[(1948, 13574377, '"EBAM 11-1948"', '"2025-10-07T01:00:00-07:00"', None, None, '"pm10"', '"µg/m³"', None, '1948', '2025', '10')]


This conforms to the expected units:

    location_id INT,
    sensors_id INT,
    location STRING,
    datetime STRING,
    lat DOUBLE,
    lon DOUBLE,
    parameter STRING,
    units STRING,
    value DOUBLE

And also appends the `locationid` (redundant), `year` and `month`

The data is `None`

Finding non-null (`None`) data:

In [34]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE value IS NOT NULL
LIMIT 5;
""")
print(cursor.fetchall())

[]


Out of the three locations, all readings are null within the month of October

### Trying again with 50 stations:

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [98]:
cursor.execute(create_table_query)

<pyathena.cursor.Cursor at 0x7f75c4586270>

In [36]:
test_IDs_50 = list(recents_pm25['station_id'][:50])
test_IDs_50[:5]

[1200, 1948, 1989, 2138, 5791]

In [39]:
for location in test_IDs_50:
    cursor.execute(f"""
    ALTER TABLE openaq_db.openaqMeasurements ADD
    PARTITION (year='{year}', month='{month}', locationid='{location}')
    LOCATION 's3://openaq-data-archive/records/csv.gz/locationid={location}/year={year}/month={month}/';
    """)

In [40]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE value IS NOT NULL
LIMIT 5;
""")
print(cursor.fetchall())

[]


Again, all data is null for all 50 stations. I will try to pull the same data

In [44]:
recents_pm25['sensor_id'][0]

np.int32(2150)

In [41]:
def download_file(url, save_path):
    response = requests.get(url, stream=True)
    
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
        print(f'{url} downloaded')
    else:
        print(f'{url} failed')

In [42]:
def pull_records(location_id, year, month, day):
    day = float(str(day).zfill(2))
    url = f"https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid={location_id}/year={year}/month={month}/location-{location_id}-{year}{month}{day}.csv.gz"
    download_file(url,f"../Data/Outputs/station_{location_id}.csv.gz")
    
    return pd.read_csv(f"../Data/Outputs/station_{location_id}.csv.gz")

In [46]:
cursor.execute("SHOW PARTITIONS openaq_db.openaqMeasurements;")
print(cursor.fetchall()[0])

('locationid=947177/year=2025/month=10',)


In [53]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE locationid = '947177'
LIMIT 1;
""")
print(cursor.fetchall())

[(947177, 2000565, '"Oxnard ES (5918)-958227"', '"2025-10-19T00:00:51-07:00"', None, None, '"pm25"', '"µg/m³"', None, '947177', '2025', '10')]


In [54]:
url = f"https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=947177/year=2025/month=10/location-947177-20251019.csv.gz"

In [55]:
download_file(url,f"../Data/Outputs/station_947177.csv.gz")

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=947177/year=2025/month=10/location-947177-20251019.csv.gz downloaded


In [78]:
station_947177 = pd.read_csv(r'../Data/Outputs/station_947177.csv.gz')

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


In [58]:
station_947177.head()

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,947177,2000565,Oxnard ES (5918)-958227,2025-10-19T00:00:51-07:00,34.1785,-118.36865,pm25,µg/m³,14.04
1,947177,2000565,Oxnard ES (5918)-958227,2025-10-19T00:06:24-07:00,34.1785,-118.36865,pm25,µg/m³,15.24
2,947177,2000565,Oxnard ES (5918)-958227,2025-10-19T01:01:49-07:00,34.1785,-118.36865,pm25,µg/m³,11.74
3,947177,2000565,Oxnard ES (5918)-958227,2025-10-19T01:07:22-07:00,34.1785,-118.36865,pm25,µg/m³,12.38
4,947177,2000565,Oxnard ES (5918)-958227,2025-10-19T01:12:55-07:00,34.1785,-118.36865,pm25,µg/m³,11.89


In [61]:
station_947177.iloc[0]

location_id                       947177
sensors_id                       2000565
location         Oxnard ES (5918)-958227
datetime       2025-10-19T00:00:51-07:00
lat                              34.1785
lon                           -118.36865
parameter                           pm25
units                              µg/m³
value                              14.04
Name: 0, dtype: object

In [62]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE locationid = '947177'
LIMIT 1;
""")
print(cursor.fetchall())

[(947177, 2000565, '"Oxnard ES (5918)-958227"', '"2025-10-05T00:05:30-07:00"', None, None, '"pm25"', '"µg/m³"', None, '947177', '2025', '10')]


Attempting to download data from `2025-10-05` to check whether it is also null when pulling directly from S3

In [87]:
url = f"https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=947177/year=2025/month=10/location-947177-20251005.csv.gz"

In [88]:
download_file(url,f"../Data/Outputs/station_947177_05.csv.gz")

https://openaq-data-archive.s3.amazonaws.com/records/csv.gz/locationid=947177/year=2025/month=10/location-947177-20251005.csv.gz downloaded


In [92]:
station_947177_05 = pd.read_csv(r'../Data/Outputs/station_947177_05.csv.gz')

In [94]:
but

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,947177,2000565,Oxnard ES (5918)-958227,2025-10-05T00:05:30-07:00,34.1785,-118.36865,pm25,µg/m³,11.6


Via Athena, we got `[(947177, 2000565, '"Oxnard ES (5918)-958227"', '"2025-10-05T00:05:30-07:00"', None, None, '"pm25"', '"µg/m³"', None, '947177', '2025', '10')]` i.e. a `None`, but via S3 we see the value is `11.6`

The data via Athena doesn't even have `lat` or `long`, which are also supposed to be "double" data types:

```
 location_id INT,
    sensors_id INT,
    location STRING,
    datetime STRING,
    lat DOUBLE,
    lon DOUBLE,
    parameter STRING,
    units STRING,
    value DOUBLE
```

There is likely an issue reading this type

#### I will try again but setting the dtype to FLOAT

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [95]:
create_table_query = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat FLOAT,
  lon FLOAT,
  parameter STRING,
  units STRING,
  value FLOAT
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

In [99]:
cursor.execute(create_table_query)

<pyathena.cursor.Cursor at 0x7f75c4586270>

In [100]:
for location in test_IDs_50:
    cursor.execute(f"""
    ALTER TABLE openaq_db.openaqMeasurements ADD
    PARTITION (year='{year}', month='{month}', locationid='{location}')
    LOCATION 's3://openaq-data-archive/records/csv.gz/locationid={location}/year={year}/month={month}/';
    """)

In [102]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE locationid = '947177'
LIMIT 1;
""")
print(cursor.fetchall())

[(947177, 2000565, '"Oxnard ES (5918)-958227"', '"2025-10-08T00:02:01-07:00"', None, None, '"pm25"', '"µg/m³"', None, '947177', '2025', '10')]


In [103]:
station_947177_05[station_947177_05['datetime'] == '2025-10-05T00:05:30-07:00']

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value
0,947177,2000565,Oxnard ES (5918)-958227,2025-10-05T00:05:30-07:00,34.1785,-118.36865,pm25,µg/m³,11.6


### Another attempt

In [115]:
cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

OperationalError: FAILED: SemanticException [Error 10001]: Table not found openaq_db.openaqMeasurements

In [133]:
template_create_table_query = """
CREATE EXTERNAL TABLE `openaqMeasurements`(
  `location_id` INT,
  `sensors_id` INT,
  `location` STRING,
  `datetime` STRING,
  `lat` float,
  `lon` float,
  `parameter` STRING,
  `units` STRING,
  `value` float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
  LINES TERMINATED BY '\n'
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1')
"""

In [134]:
cursor.execute(template_create_table_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'MATERIALIZED', 'MULTI', 'OR', 'PROTECTED', 'ROLE', 'SCHEMA', 'TABLE', 'VIEW'

create_table_query = """
CREATE TABLE openaq_db.openaqMeasurements(
  "location_id" INT,
  "sensors_id" INT,
  "location" STRING,
  "datetime" STRING,
  "lat" float,
  "lon" float,
  "parameter" STRING,
  "units" STRING,
  "value" float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  'separatorChar' = ',',
  'quoteChar' = '"',
  'escapeChar' = '\\'
)
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

In [130]:
cursor.execute(create_table_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'PARTITIONED'. Expecting: 'COMMENT', 'WITH', <EOF>

In [86]:
station_947177[station_947177['datetime'] == "2025-10-05"]

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value


In [83]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE locationid = '947177' AND datetime = '2025-10-19'
LIMIT 1;
""")
print(cursor.fetchall())

[]


In [84]:
cursor.execute("""
SELECT * 
FROM openaq_db.openaqMeasurements 
WHERE locationid = '947177' AND value IS NOT NULL
LIMIT 1;
""")
print(cursor.fetchall())

[]


In [85]:
station_947177.iloc[0]['value']

np.float64(14.04)

In [None]:
create_table_query = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat FLOAT,
  lon FLOAT,
  parameter STRING,
  units STRING,
  value FLOAT
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

In [179]:
another_create_table_query = """
CREATE TABLE myopencsvtable (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat FLOAT,
  lon FLOAT,
  parameter STRING,
  units STRING,
  value FLOAT
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '"',
   'escapeChar' = '\\'
   )
STORED AS TEXTFILE
LOCATION 's3://openaq-data-archive/records/csv.gz/';
"""

cursor.execute(another_create_table_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'ROW'. Expecting: 'COMMENT', 'WITH', <EOF>

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [172]:
cursor.execute(another_create_table_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'ROW'. Expecting: 'COMMENT', 'WITH', <EOF>

`` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz ``