In [10]:
import os
import warnings

# The OpenAQ API key is a secret and must not live in the notebook source:
# export OPENAQ_API_KEY in the environment before starting the kernel.
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and revoked/rotated.
OPENAQ_API_KEY = os.environ.get("OPENAQ_API_KEY", "")
if not OPENAQ_API_KEY:
    # Warn instead of raising so the notebook still loads; requests will simply
    # be sent with an empty X-API-Key header.
    warnings.warn(
        "OPENAQ_API_KEY is not set; OpenAQ requests will be sent without a valid API key."
    )

In [11]:
import requests
from datetime import date, timedelta

def fetchMeasurements(coordinates: str, radius: int, limit: int, page: int, timeout: float = 60.0):
    """Fetch one page of OpenAQ v2 measurements for the last day.

    Args:
        coordinates: Value sent as the ``coordinates`` query parameter.
        radius: Value sent as the ``radius`` query parameter.
        limit: Value sent as the ``limit`` query parameter (page size).
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, so that
            an error payload is never mistaken for a page of measurements.
        requests.RequestException: On connection failures or timeouts.
    """
    # Take "today" once so both ends of the window derive from the same date,
    # even if the call happens to run across midnight.
    today = date.today()
    yesterday = today - timedelta(days=1)
    params = {
        "coordinates": coordinates,
        "radius": radius,
        "limit": limit,
        "page": page,
        "date_from": yesterday.strftime("%Y-%m-%d"),
        "date_to": today.strftime("%Y-%m-%d"),
    }
    headers = {
        "X-API-Key": OPENAQ_API_KEY
    }
    req = requests.get(
        'https://api.openaq.org/v2/measurements',
        params=params,
        headers=headers,
        timeout=timeout,
    )
    # Log the final URL (with encoded query string) before validating the
    # status so failing requests are still traceable in the output.
    print(req.url)
    req.raise_for_status()
    return req.json()

In [12]:
from pyspark.sql.types import DoubleType, StructType, StructField, IntegerType, StringType, ArrayType

# Spark schema applied to the JSON document returned by fetchMeasurements.
# Only the fields listed here are kept in the UDF result column.
responseSchema = StructType([
    # Paging metadata of the response.
    StructField("meta", StructType([
        # The original expression `IntegerType() or StringType()` always
        # evaluated to IntegerType(): Python's `or` returns its first truthy
        # operand and cannot express a union of Spark types. The effective
        # type is kept unchanged here.
        # NOTE(review): if the API can return a non-integer value for "found",
        # confirm how Spark handles it and switch to StringType() if the text
        # must be preserved.
        StructField("found", IntegerType()),
        StructField("page", IntegerType())
    ])),
    # One array element per measurement.
    StructField("results", ArrayType(StructType([
        StructField("locationId", IntegerType()),
        StructField("location", StringType()),
        StructField("parameter", StringType()),
        StructField("value", DoubleType()),
        StructField("unit", StringType()),
        StructField("country", StringType()),
        StructField("city", StringType()),
        StructField("entity", StringType()),
        StructField("sensorType", StringType()),
        # Timestamps are kept as the strings delivered by the API.
        StructField("date", StructType([
            StructField("utc", StringType()),
            StructField("local", StringType()),
        ])),
        StructField("coordinates", StructType([
            StructField("latitude", DoubleType()),
            StructField("longitude", DoubleType()),
        ]))
    ])))
])

In [13]:
from pyspark.sql import Row
from pyspark.sql.functions import udf

# Row factory whose fields mirror the positional arguments of fetchMeasurements;
# each instance describes one page request.
FetchMeasurementRequestRow = Row(
    "coordinates",
    "radius",
    "limit",
    "page",
)

# Spark UDF wrapper around fetchMeasurements; its result column is typed with
# responseSchema.
udf_fetchMeasurements = udf(f=fetchMeasurements, returnType=responseSchema)

In [14]:
import pyspark
from pyspark.sql import SparkSession

import os

# Service-account key used by the GCS connector. It can be overridden with the
# GOOGLE_APPLICATION_CREDENTIALS environment variable so the notebook does not
# depend on one user's home directory; the previous hard-coded path is kept as
# the fallback for backward compatibility.
gcp_keyfile = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/ali_marzouk/air-quality-421919-36f116eb9049.json",
)

# Create (or reuse) the Spark session with the GCS connector jar on the classpath.
# NOTE(review): the "latest" connector jar is unpinned, so runs are not
# reproducible across connector releases - consider pinning a version.
spark = (
    SparkSession.builder
    .appName('spark_api_to_gcs')
    .config("spark.jars", "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-latest.jar")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Hadoop-level settings that let Spark resolve and authenticate gs:// paths.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("google.cloud.auth.service.account.json.keyfile", gcp_keyfile)
hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("google.cloud.auth.service.account.enable", "true")

# Destination GCS bucket for the exported measurements
bucket_name = "air-quality-data-0123"

In [16]:
from pyspark.sql.functions import col
from pyspark.sql.functions import size
from datetime import date

# Download the measurements in batches of MAX_PAGES pages and write each batch
# as parquet under gs://<bucket>/<dd-mm-YYYY>/. The first batch overwrites the
# folder, later batches append. The loop ends with the first batch that
# contains a page without results.
today = date.today()
MAX_PAGES = 5   # pages requested per Spark batch
shift = 1       # page number of the first page in the current batch
firstDataFrame = True

# Loop-invariant: every batch of this run goes to the same dated folder.
today_date_str = today.strftime("%d-%m-%Y")
output_path = f"gs://{bucket_name}/{today_date_str}/"

while True:
    pageRequestArray = [
        FetchMeasurementRequestRow('48.86211,2.344615', 20000, 1000, iPages)
        for iPages in range(shift, MAX_PAGES + shift)
    ]

    request_df = spark.createDataFrame(pageRequestArray)
    page_df = request_df \
      .withColumn("result", udf_fetchMeasurements(col("coordinates"), col("radius"), col("limit"), col("page")))
    # Cache so the count and the write below share one evaluation of the UDF
    # instead of each triggering the HTTP calls again.
    page_df.cache()
    try:
        results_col = col("result.results")
        # A page counts as "empty" when its result list is empty OR missing.
        # size() of a NULL array is -1 or NULL depending on Spark settings, so
        # testing only `== 0` would never match and the loop would not end.
        is_empty_page = results_col.isNull() | (size(results_col) <= 0)
        has_invalid = page_df.where(is_empty_page).count() > 0
        result_df = page_df.where(size(results_col) > 0)

        # save the DataFrame to storage
        if firstDataFrame:
            print("saving the file")
            result_df.write.format("parquet").save(output_path, mode="overwrite")
            firstDataFrame = False
        else:
            result_df.write.mode("append").format("parquet").save(output_path)
    finally:
        # Release the cached batch; otherwise every iteration leaves another
        # DataFrame pinned in executor memory.
        page_df.unpersist()

    # test the responses - do we break out of the iteration loop or continue
    if has_invalid:
        break
    shift += MAX_PAGES

https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=1&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=3&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=2&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=4&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=5&date_from=2024-04-11&date_to=2024-05-10
                                                                                

saving the file


https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=8&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=6&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=9&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=7&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=10&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=13&date_from=2024-04-11&date_to=2024-05-10
https://api.openaq.org/v2/measurements?coordinates=48.86211%2C2.344615&radius=20000&limit=1000&page=11&date_from=2024-04-11&date_to=2024-05-10
htt

In [18]:
# Stop the SparkContext that backs the `spark` session.
# NOTE(review): the following cell calls spark.stop(), which presumably stops
# this context as well - confirm and drop one of the two calls if redundant.
spark.sparkContext.stop()

In [19]:
# Stop the `spark` session created earlier in the notebook.
spark.stop()