In [1]:
import pandas as pd

import geopandas as gpd

import requests

import datetime as dt
from pyathena import connect

```
s3://openaq-data-archive/records/csv.gz/
├─ year=2025/
│  ├─ month=10/
│  │  ├─ locationid=2178/
│  │  ├─ locationid=827/
│  │  └─ ...
│  └─ month=11/...
└─ year=2024/...
```

Example file path:

` /records/csv.gz/locationid=2178/year=2022/month=05/location-2178-20220503.csv.gz `

In [2]:
cursor = connect(
    s3_staging_dir="s3://la-openaq-athena-results-us-east-1/temp/",
    region_name="us-east-1"
).cursor()

In [42]:
cursor.execute('SELECT version();')

OperationalError: FUNCTION_NOT_FOUND: line 1:8: Function 'version' not registered

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")

cursor.execute("DROP TABLE openaq_db.openaqMeasurements")
cursor.execute("DROP DATABASE openaq_db;")
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")

In [5]:
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")

<pyathena.cursor.Cursor at 0x7f2ff9f9b8c0>

https://docs.aws.amazon.com/athena/latest/ug/csv-serde.html

In [9]:
create_table_query = """
CREATE TABLE `openaqMeasurements`(
  `location_id` INT,
  `sensors_id` INT,
  `location` STRING,
  `datetime` STRING,
  `lat` float,
  `lon` float,
  `parameter` STRING,
  `units` STRING,
  `value` float
)
PARTITIONED BY (locationid string, year string, month string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
  "separatorChar" = ",",
  "quoteChar"     = "`",
  "escapeChar"    = "\\"
)
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1')
"""

cursor.execute(create_table_query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:14: backquoted identifiers are not supported; use double quotes to quote identifiers

In [None]:
cursor.execute("DROP DATABASE IF EXISTS openaq_db;")
cursor.execute("DROP TABLE IF EXISTS openaq_db.openaqMeasurements")
cursor.execute("CREATE DATABASE IF NOT EXISTS openaq_db;")

In [19]:
create_table_query2 = """
CREATE TABLE openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat float,
  lon float,
  parameter STRING,
  units STRING,
  value float
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '"',
   'escapeChar' = '\\'
   )
STORED AS TEXTFILE
LOCATION 's3://openaq-data-archive/records/csv.gz/';
"""
cursor.execute(create_table_query2)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'PARTITIONED'. Expecting: 'COMMENT', 'WITH', <EOF>

In [None]:
"""
CREATE TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat float,
  lon float,
  parameter STRING,
  units STRING,
  value float
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT DELIMITED
    [ STORED AS file_format ]
    [ LOCATION path ]
    [ TBLPROPERTIES ( key1=val1, key2=val2, ... ) ]
    [ AS select_statement ]

row_format:
    : SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde' [ WITH SERDEPROPERTIES (k1=v1, k2=v2, ... ) ]
    | DELIMITED [ FIELDS TERMINATED BY fields_terminated_char [ ESCAPED BY escaped_char ] ]
        [ COLLECTION ITEMS TERMINATED BY collection_items_terminated_char ]
        [ MAP KEYS TERMINATED BY map_key_terminated_char ]
        [ LINES TERMINATED BY row_terminated_char ]
        [ NULL DEFINED AS null_char ]
"""

In [26]:
create = """
CREATE TABLE openaq_db.openaqMeasurements(
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat float,
  lon float,
  parameter STRING,
  units STRING,
  value float
)
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY ',' 
    ESCAPED BY '\\'
    MAP KEYS TERMINATED BY ':'
    LINES TERMINATED BY '\n'
    NULL DEFINED AS 'NULL'
    STORED AS TEXTFILE
LOCATION
  's3://openaq-data-archive/records/csv.gz/'
  TBLPROPERTIES ('skip.header.line.count'='1');
"""
cursor.execute(create)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:1: mismatched input 'ROW'. Expecting: 'COMMENT', 'WITH', <EOF>

In [40]:
create_functioning = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('serialization.format'=',','field.delim'=',')
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""
cursor.execute(create_functioning)

<pyathena.cursor.Cursor at 0x7f2ff9f9b8c0>

In [32]:
create3 = """
CREATE TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '"',
   'escapeChar' = '\\'
   )
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""
cursor.execute(create3)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 12:6: mismatched input 'SERDEPROPERTIES'. Expecting: '('

In [34]:
create_functioning2 = """
CREATE EXTERNAL TABLE IF NOT EXISTS openaq_db.openaqMeasurements (
  location_id INT,
  sensors_id INT,
  location STRING,
  datetime STRING,
  lat DOUBLE,
  lon DOUBLE,
  parameter STRING,
  units STRING,
  value DOUBLE
)
PARTITIONED BY (locationid STRING, year STRING, month STRING)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '"',
   'escapeChar' = '\\'
   )
LOCATION 's3://openaq-data-archive/records/csv.gz/'
TBLPROPERTIES ('skip.header.line.count'='1');
"""

cursor.execute(create_functioning2)

Failed to execute query.
Traceback (most recent call last):
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/common.py", line 645, in _execute
    query_id = retry_api_call(
               ~~~~~~~~~~~~~~^
        self._connection.client.start_query_execution,
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<2 lines>...
        **request,
        ^^^^^^^^^^
    ).get("QueryExecutionId")
    ^
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/pyathena/util.py", line 196, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 477, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init__.py", line 378, in iter
    result = action(retry_state)
  File "/home/daniel/miniforge3/envs/la_env/lib/python3.14/site-packages/tenacity/__init_

DatabaseError: An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'MATERIALIZED', 'MULTI', 'OR', 'PROTECTED', 'ROLE', 'SCHEMA', 'TABLE', 'VIEW'