In [2]:
import pandas as pd

import geopandas as gpd

import requests

import datetime as dt
from pyathena import connect

```
s3://openaq-data-archive/records/csv.gz/
├─ year=2025/
│  ├─ month=10/
│  │  ├─ locationid=2178/
│  │  ├─ locationid=827/
│  │  └─ ...
│  └─ month=11/...
└─ year=2024/...
```

Example file path:

` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

In [3]:
recents_pm25 = gpd.read_file(r'../Data/Outputs/recents_pm25.geojson')

In [4]:
cursor = connect(
    s3_staging_dir="s3://la-openaq-athena-results-us-east-1/temp/",
    region_name="us-east-1"
).cursor()

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

In [6]:
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")

<pyathena.cursor.Cursor at 0x7fee0f844980>

Trying to run the template query for creating a schema table, taken directly from the OpenAQ docs:

https://docs.openaq.org/aws/athena-guide

In [8]:
template_query = """
CREATE EXTERNAL TABLE `openaqMeasurements`(
  `location_id` INT,
  `sensors_id` INT,
  `location` STRING,
  `datetime` STRING,
  `lat` float,
  `lon` float,
  `parameter` STRING,
  `units` STRING,
  `value` float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
  LINES TERMINATED BY '\n'
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1')
"""

In [9]:
cursor.execute(template_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'MATERIALIZED', 'MULTI', 'OR', 'PROTECTED', 'ROLE', 'SCHEMA', 'TABLE', 'VIEW'

In [10]:
template_query2 = """
CREATE TABLE `openaqMeasurements`(
  `location_id` INT,
  `sensors_id` INT,
  `location` STRING,
  `datetime` STRING,
  `lat` float,
  `lon` float,
  `parameter` STRING,
  `units` STRING,
  `value` float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
  LINES TERMINATED BY '\n'
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1')
"""

In [11]:
cursor.execute(template_query2)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:14: backquoted identifiers are not supported; use double quotes to quote identifiers

In [17]:
template_query3 = """
CREATE TABLE "openaqMeasurements"(
  "location_id" INT,
  "sensors_id" INT,
  "location" STRING,
  "datetime" STRING,
  "lat" float,
  "lon" float,
  "parameter" STRING,
  "units" STRING,
  "value" float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  ESCAPED BY '\\'
  LINES TERMINATED BY '\n'
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1')
"""

In [18]:
cursor.execute(template_query3)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'PARTITIONED'. Expecting: 'COMMENT', 'WITH', <EOF>

In [19]:
LazySimpleSerDe_query = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

In [20]:
cursor.execute(LazySimpleSerDe_query)

<pyathena.cursor.Cursor at 0x7fee0f844980>