In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import requests


In [3]:
# Spark session shared by every cell of this notebook.
spark = (
    SparkSession.builder
    .appName("Retrieve Bike Counting Data")
    .getOrCreate()
)

# Month to export (zero-padded strings: they are also used in the output path).
MONTH = '04'
YEAR = '2022'
START_DATE = f'{YEAR}-{MONTH}-01'

# END_DATE is sent to the API as an upper bound; the counting and temperature
# requests use it in a strict `__lt` filter. A hard-coded day "30" therefore
# dropped the last day(s) of the month and was not a valid date for February.
# Use the first day of the following month instead.
if int(MONTH) == 12:
    _end_year, _end_month = int(YEAR) + 1, 1
else:
    _end_year, _end_month = int(YEAR), int(MONTH) + 1
END_DATE = f'{_end_year:04d}-{_end_month:02d}-01'

# Requête des données

In [4]:
# Load the channel catalogue: a semicolon-separated CSV file with a header row.
channels_data = (
    spark.read
    .options(header="true", sep=";")
    .csv("./CSVs/channels.csv")
)

# Bring the channel identifiers back to the driver as a plain Python list.
channels_ID = [channel_row['channel_id'] for channel_row in channels_data.collect()]

In [5]:
def request_counting_data(channel_id, start_date, end_date, timeout=60):
    """Fetch the counting measurements of one channel from data.grandlyon.com.

    Args:
        channel_id: Value sent in the ``channel_id__eq`` query filter.
        start_date: Value sent in the ``start_datetime__gte`` query filter.
        end_date: Value sent in the ``start_datetime__lt`` query filter.
        timeout: Seconds given to ``requests.get`` before it gives up, so a
            stalled connection cannot hang the calling task forever.

    Returns:
        list: The content of the ``values`` key of the JSON response. An empty
        list is returned (never ``None``) when the request raises, when the
        status code is not 200, or when ``values`` is missing or null, so the
        result can always be iterated by the caller.
    """
    url = f'https://data.grandlyon.com/fr/datapusher/ws/timeseries/pvo_patrimoine_voirie.pvocomptagemeasure/all.json?start_datetime__gte={start_date}&start_datetime__lt={end_date}&channel_id__eq={channel_id}&maxfeatures=-1'
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Previously this path fell through and returned None implicitly.
            print(f"Failed to retrieve data for channel {channel_id}: HTTP {response.status_code}")
            return []
        return response.json().get('values') or []
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Failed to retrieve data for channel {channel_id}: {e}")
        return []

def request_temp_data(start_date, end_date, timeout=60):
    """Fetch the temperature measurements from data.grandlyon.com.

    Args:
        start_date: Value sent in the ``horodate__gte`` query filter.
        end_date: Value sent in the ``horodate__lt`` query filter.
        timeout: Seconds given to ``requests.get`` before it gives up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        list: The content of the ``values`` key of the JSON response. An empty
        list is returned (never ``None``) when the request raises, when the
        status code is not 200, or when ``values`` is missing or null.
    """
    url = f'https://data.grandlyon.com/fr/datapusher/ws/timeseries/biotope.temperature/all.json?horodate__gte={start_date}&horodate__lt={end_date}&maxfeatures=-1'
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Previously this path fell through and returned None implicitly.
            print(f"Failed to retrieve temperature data: HTTP {response.status_code}")
            return []
        return response.json().get('values') or []
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Failed to retrieve temperature data: {e}")
        return []

def request_rain_data(start_date, end_date, timeout=60):
    """Fetch the rainfall measurements from data.grandlyon.com.

    Args:
        start_date: Value sent in the ``horodate__gte`` query filter.
        end_date: Value sent in the ``horodate__lte`` query filter (note the
            ``lte`` suffix used by this request).
        timeout: Seconds given to ``requests.get`` before it gives up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        list: The content of the ``values`` key of the JSON response. An empty
        list is returned (never ``None``) when the request raises, when the
        status code is not 200, or when ``values`` is missing or null.
    """
    url = f'https://data.grandlyon.com/fr/datapusher/ws/timeseries/eau.pluviometrie_mesure/all.json?maxfeatures=-1&horodate__gte={start_date}&horodate__lte={end_date}'
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Previously this path fell through and returned None implicitly.
            print(f"Failed to retrieve rainfall data: HTTP {response.status_code}")
            return []
        return response.json().get('values') or []
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Failed to retrieve rainfall data: {e}")
        return []

# Traitement des données

## Traitement des données de comptage

In [6]:
# Aggregate the counting data of the same hour, all stations combined
def traitement_count_data(data):
    """Sum the counts per hour over every row of ``data``.

    Args:
        data: Spark DataFrame with a ``start_datetime`` column and a ``count``
            column.

    Returns:
        Spark DataFrame with two columns, ordered by hour:
        ``horodate_heure`` (``start_datetime`` truncated to the hour) and
        ``total_count`` (sum of ``count`` for that hour).
    """
    hourly_totals = (
        data
        # Overwrite start_datetime with its value truncated to the hour.
        .withColumn("start_datetime", F.date_trunc("hour", F.col("start_datetime")))
        .groupBy("start_datetime")
        .agg(F.sum("count").alias("total_count"))
        .orderBy("start_datetime")
    )
    # The former `drop("end_datetime")` call was removed: its result was
    # discarded (DataFrames are immutable) and the aggregation above only
    # keeps the grouping key and the aggregate anyway.
    return hourly_totals.withColumnRenamed("start_datetime", "horodate_heure")

## Traitement des données de température

In [7]:
# Aggregate the temperature data per hour (the grouping key is the hour only)
def traitement_temp_data(data):
    """Average the temperature per hour over every row of ``data``.

    Args:
        data: Spark DataFrame with a ``horodate`` column and a
            ``degre_celsius`` column.

    Returns:
        Spark DataFrame with two columns, ordered by hour:
        ``horodate_heure`` (``horodate`` truncated to the hour) and
        ``avg_temp_per_hour`` (mean of ``degre_celsius`` for that hour).
    """
    hourly_avg = (
        data
        .withColumn("horodate_heure", F.date_trunc("hour", F.col("horodate")))
        .groupBy("horodate_heure")
        .agg(F.avg("degre_celsius").alias("avg_temp_per_hour"))
        .orderBy("horodate_heure")
    )
    # The former `drop("horodate")` call was removed: its result was discarded
    # (DataFrames are immutable) and the aggregation above only keeps the
    # grouping key and the aggregate anyway.
    return hourly_avg

## Traitement des données de précipitations

In [8]:
# Aggregate the rainfall data: per station and hour first, then per hour
def traitement_rainfall(data):
    """Compute the hourly rainfall averaged over the measuring stations.

    Args:
        data: Spark DataFrame with a ``horodate`` column, an ``identifiant``
            column and a ``pluie_mm`` column.

    Returns:
        Spark DataFrame with two columns, ordered by hour:
        ``horodate_heure`` (``horodate`` truncated to the hour) and
        ``avg_pluie_per_hour`` (mean, over the ``identifiant`` groups, of the
        per-group hourly sum of ``pluie_mm``).
    """
    per_station = (
        data
        .withColumn("horodate_heure", F.date_trunc("hour", F.col("horodate")))
        .groupBy("horodate_heure", "identifiant")
        # Despite its "avg_" name this intermediate column holds a SUM.
        .agg(F.sum("pluie_mm").alias("avg_pluie_per_station_per_hour"))
    )
    per_hour = (
        per_station
        .groupBy("horodate_heure")
        .agg(F.avg("avg_pluie_per_station_per_hour").alias("avg_pluie_per_hour"))
        .orderBy("horodate_heure")
    )
    # The former `drop("horodate")` call was removed: its result was discarded
    # (DataFrames are immutable) and the aggregations above only keep the
    # grouping keys and the aggregates anyway.
    return per_hour


# Exécution du traitement

In [9]:
# Distribute the channel ids so the HTTP requests run in parallel in Spark tasks.
count_rdd_ids = spark.sparkContext.parallelize(channels_ID)

# The flatMap below performs one HTTP request per channel. Spark re-evaluates
# an uncached RDD for every action, so without `.cache()` the requests would be
# re-issued by the schema inference of createDataFrame, by each `show()` and by
# the final join/write. `or []` keeps flatMap working if the helper returns
# None instead of a list.
count_records_rdd = count_rdd_ids.flatMap(
    lambda cid: request_counting_data(cid, START_DATE, END_DATE) or []
).cache()

count_data_spark = spark.createDataFrame(count_records_rdd)
aggregated_count_data = traitement_count_data(count_data_spark)
aggregated_count_data.show()

+-------------------+-----------+
|     horodate_heure|total_count|
+-------------------+-----------+
|2022-03-31 23:00:00|       3607|
|2022-04-01 00:00:00|       1762|
|2022-04-01 01:00:00|       1207|
|2022-04-01 02:00:00|        878|
|2022-04-01 03:00:00|        652|
|2022-04-01 04:00:00|        885|
|2022-04-01 05:00:00|       2315|
|2022-04-01 06:00:00|       9696|
|2022-04-01 07:00:00|      21176|
|2022-04-01 08:00:00|      11340|
|2022-04-01 09:00:00|       6401|
|2022-04-01 10:00:00|       7415|
|2022-04-01 11:00:00|      11734|
|2022-04-01 12:00:00|      10602|
|2022-04-01 13:00:00|       8518|
|2022-04-01 14:00:00|       8810|
|2022-04-01 15:00:00|      13040|
|2022-04-01 16:00:00|      20039|
|2022-04-01 17:00:00|      18692|
|2022-04-01 18:00:00|      14181|
+-------------------+-----------+
only showing top 20 rows


In [10]:
# Fetch the temperature records on the driver, load them into Spark,
# aggregate them per hour and round the hourly average to 2 decimals.
temp_records = request_temp_data(START_DATE, END_DATE)
temp_data_spark = spark.createDataFrame(temp_records)
aggregated_temp_data = traitement_temp_data(temp_data_spark).withColumn(
    "avg_temp_per_hour",
    F.round(F.col("avg_temp_per_hour"), 2),
)
aggregated_temp_data.show()

+-------------------+-----------------+
|     horodate_heure|avg_temp_per_hour|
+-------------------+-----------------+
|2022-03-31 23:00:00|              7.4|
|2022-04-01 00:00:00|             6.96|
|2022-04-01 01:00:00|             6.49|
|2022-04-01 02:00:00|             4.51|
|2022-04-01 03:00:00|              2.7|
|2022-04-01 04:00:00|             2.11|
|2022-04-01 05:00:00|             1.55|
|2022-04-01 06:00:00|             1.37|
|2022-04-01 07:00:00|             1.51|
|2022-04-01 08:00:00|             1.86|
|2022-04-01 09:00:00|             2.14|
|2022-04-01 10:00:00|             2.29|
|2022-04-01 11:00:00|             2.44|
|2022-04-01 12:00:00|             2.47|
|2022-04-01 13:00:00|              2.5|
|2022-04-01 14:00:00|             2.54|
|2022-04-01 15:00:00|             3.38|
|2022-04-01 16:00:00|             3.88|
|2022-04-01 17:00:00|             4.07|
|2022-04-01 18:00:00|             3.88|
+-------------------+-----------------+
only showing top 20 rows


In [11]:
# Weather part: fetch the rainfall records on the driver, load them into Spark,
# aggregate them per hour and round the hourly average to 2 decimals.
rain_records = request_rain_data(START_DATE, END_DATE)
rain_data_spark = spark.createDataFrame(rain_records)
aggregated_rain_data = traitement_rainfall(rain_data_spark).withColumn(
    "avg_pluie_per_hour",
    F.round(F.col("avg_pluie_per_hour"), 2),
)
aggregated_rain_data.show()

+-------------------+------------------+
|     horodate_heure|avg_pluie_per_hour|
+-------------------+------------------+
|2022-03-31 23:00:00|               0.1|
|2022-04-01 01:00:00|               0.2|
|2022-04-01 02:00:00|              0.75|
|2022-04-01 03:00:00|              1.18|
|2022-04-01 04:00:00|               1.3|
|2022-04-01 05:00:00|              1.53|
|2022-04-01 06:00:00|              1.07|
|2022-04-01 07:00:00|               0.6|
|2022-04-01 08:00:00|              0.32|
|2022-04-01 09:00:00|              1.04|
|2022-04-01 10:00:00|              0.93|
|2022-04-01 11:00:00|              0.53|
|2022-04-01 12:00:00|               0.4|
|2022-04-01 13:00:00|               0.2|
|2022-04-01 14:00:00|              0.45|
|2022-04-01 15:00:00|              1.95|
|2022-04-01 18:00:00|              0.32|
|2022-04-01 19:00:00|               0.3|
|2022-04-01 20:00:00|              0.95|
|2022-04-01 21:00:00|              1.09|
+-------------------+------------------+
only showing top

In [12]:
# Attach the hourly temperature and rainfall to every hourly count row.
# Left joins: hours without a weather record keep their count.
joined_on_hour = (
    aggregated_count_data
    .join(aggregated_temp_data, "horodate_heure", "left")
    .join(aggregated_rain_data, "horodate_heure", "left")
)

# NOTE(review): fillna(0) replaces every missing numeric value with 0, which
# includes a missing avg_temp_per_hour as well as a missing avg_pluie_per_hour
# - confirm that 0 is the intended placeholder for temperature.
final_data = (
    joined_on_hour
    .withColumnRenamed("horodate_heure", "datetime")
    .orderBy("datetime")
    .fillna(0)
)
final_data.show()

+-------------------+-----------+-----------------+------------------+
|           datetime|total_count|avg_temp_per_hour|avg_pluie_per_hour|
+-------------------+-----------+-----------------+------------------+
|2022-03-31 23:00:00|       3607|              7.4|               0.1|
|2022-04-01 00:00:00|       1762|             6.96|               0.0|
|2022-04-01 01:00:00|       1207|             6.49|               0.2|
|2022-04-01 02:00:00|        878|             4.51|              0.75|
|2022-04-01 03:00:00|        652|              2.7|              1.18|
|2022-04-01 04:00:00|        885|             2.11|               1.3|
|2022-04-01 05:00:00|       2315|             1.55|              1.53|
|2022-04-01 06:00:00|       9696|             1.37|              1.07|
|2022-04-01 07:00:00|      21176|             1.51|               0.6|
|2022-04-01 08:00:00|      11340|             1.86|              0.32|
|2022-04-01 09:00:00|       6401|             2.14|              1.04|
|2022-

In [15]:
# Export the merged dataset as semicolon-separated CSV with a header row,
# replacing any previous export for the same month and year.
(
    final_data.write
    .mode("overwrite")
    .options(header="true", sep=";")
    .csv(f"./CSVs/spark/counting_data_{MONTH}_{YEAR}")
)