In [17]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import requests


In [18]:
# Reuse the active Spark session if there is one, otherwise create it.
spark = (
    SparkSession.builder
    .appName("Retrieve Bike Counting Data")
    .getOrCreate()
)

# Date window handed to the counting-data requests.
# NOTE(review): END_DATE is sent as a `start_datetime__lt` bound by
# request_counting_data, so measurements starting on 2022-04-30 appear to be
# excluded -- confirm whether the whole month of April is intended.
START_DATE, END_DATE = '2022-04-01', '2022-04-30'

In [19]:
# Channel list: semicolon-separated CSV whose first line is the header.
channels_data = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .csv("./CSVs/channels.csv")
)

# For testing, only the first channel id is kept (the slice stops at index 1).
channels_ID = [channel_row['channel_id'] for channel_row in channels_data.collect()[:1]]

In [20]:
def request_counting_data(channel_id, start_date, end_date, timeout=30):
    """Fetch the counting measurements of one channel from the Grand Lyon API.

    Args:
        channel_id: Identifier sent as the `channel_id__eq` filter.
        start_date: Lower bound, sent as `start_datetime__gte` (inclusive).
        end_date: Upper bound, sent as `start_datetime__lt` (exclusive).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list stored under the `values` key of the JSON response. An empty
        list is returned when the request fails, the status code is not 200,
        or the key is missing or null, so the result is always iterable and
        safe to use inside `flatMap`.
    """
    url = 'https://data.grandlyon.com/fr/datapusher/ws/timeseries/pvo_patrimoine_voirie.pvocomptagemeasure/all.json'
    # Let requests build the query string so the values are URL-encoded.
    params = {
        'start_datetime__gte': start_date,
        'start_datetime__lt': end_date,
        'channel_id__eq': channel_id,
    }
    try:
        # Without a timeout a stalled connection would block the task forever.
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            # Previously this path fell through and returned None.
            print(f"Failed to retrieve data for channel {channel_id}: HTTP {response.status_code}")
            return []
        # `or []` also covers a JSON null stored under 'values'.
        return response.json().get('values') or []
    except Exception as e:
        # Best effort: one failing channel must not abort the whole job.
        print(f"Failed to retrieve data for channel {channel_id}: {e}")
        return []

In [21]:
# Distribute the channel ids so each one is fetched inside a Spark task.
rdd_ids = spark.sparkContext.parallelize(channels_ID)

# flatMap merges the per-channel record lists into one RDD of records.
# The RDD is cached because it is evaluated more than once: createDataFrame
# reads from it to infer a schema and the CSV write evaluates it again.
# Without the cache every evaluation would repeat the HTTP requests.
all_records_rdd = rdd_ids.flatMap(
    lambda cid: request_counting_data(cid, START_DATE, END_DATE)
).cache()

# No explicit schema is passed, so the columns are inferred from the records.
# NOTE(review): inference presumably fails when no record was retrieved at
# all -- confirm and consider passing an explicit schema.
all_data_spark = spark.createDataFrame(all_records_rdd)

In [22]:
# Collapse to one partition before writing, then save as a
# semicolon-separated CSV with a header, replacing any previous output.
(
    all_data_spark
    .coalesce(1)
    .write
    .mode('overwrite')
    .option("header", "true")
    .option("sep", ";")
    .csv("./CSVs/spark/counting_data_april_2022")
)