In [1]:
import unicodedata
from pathlib import Path

import pandas as pd
import plotly.express as px
import pyarrow.feather as feather
from tqdm.notebook import tqdm

In [2]:
import os

# Display the kernel's current working directory; the relative paths used in
# this notebook are resolved against it.
os.getcwd()

'd:\\dev\\predictempo\\notebooks'

Data exploration

In [3]:
# Folder holding the gzipped CSV files to load.
DATA_FOLDER = Path("../data/rtt-vent")
extension = ".csv.gz"
# Directory listings come back in no guaranteed order; sorting makes the load
# order (and therefore the row order of the concatenated frame) reproducible.
all_files = sorted(DATA_FOLDER.glob(f"*{extension}"))
# Lower bound compared against the integer AAAAMMJJ column when loading.
# NOTE(review): despite the name this looks like a full YYYYMMDD date — confirm.
MIN_YEAR = 20170101
# Define the latitude and longitude bounds for Metropolitan France
lat_min, lat_max = 41.0, 51.0
lon_min, lon_max = -5.0, 10.0

In [5]:
# Read every file, keep only the rows dated on/after MIN_YEAR that fall inside
# the Metropolitan-France bounding box, then stack them into a single frame.
all_data = []
with tqdm(total=len(all_files), desc="Loading data") as pbar:
    for file in all_files:
        # set_postfix_str expects a string, so show the bare file name.
        pbar.set_postfix_str(file.name)
        try:
            data_file = pd.read_csv(file, compression="gzip", comment="#", sep=";")
            # Single combined mask; `between` is inclusive on both ends.
            keep_rows = (
                (data_file["AAAAMMJJ"] >= MIN_YEAR)
                & data_file["LAT"].between(lat_min, lat_max)
                & data_file["LON"].between(lon_min, lon_max)
            )
            all_data.append(data_file[keep_rows])
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and carry on.
            print(f"Error processing file {file}: {e}")

        pbar.update(1)

# Fail with an actionable message instead of pd.concat's generic
# "No objects to concatenate" when nothing could be loaded.
if not all_data:
    raise ValueError(
        f"No file could be loaded from {DATA_FOLDER} (pattern *{extension})"
    )

data = pd.concat(all_data, ignore_index=True)
del all_data
data.head()

Loading data:   0%|          | 0/489 [00:00<?, ?it/s]

Unnamed: 0,NUM_POSTE,NOM_USUEL,LAT,LON,ALTI,AAAAMMJJ,RR,QRR,TN,QTN,...,HXI2,QHXI2,FXI3S,QFXI3S,DXI3S,QDXI3S,HXI3S,QHXI3S,DRR,QDRR
0,3005002,ARCHIGNAT-SPC,46.3325,2.385833,557,20201213,0.0,1.0,,,...,,,,,,,,,,
1,3005002,ARCHIGNAT-SPC,46.3325,2.385833,557,20201214,7.2,1.0,,,...,,,,,,,,,,
2,3005002,ARCHIGNAT-SPC,46.3325,2.385833,557,20201215,3.0,1.0,,,...,,,,,,,,,,
3,3005002,ARCHIGNAT-SPC,46.3325,2.385833,557,20201216,1.4,1.0,,,...,,,,,,,,,,
4,3005002,ARCHIGNAT-SPC,46.3325,2.385833,557,20201218,0.0,1.0,,,...,,,,,,,,,,


In [6]:
# (rows, columns) of the concatenated frame.
data.shape

(13569320, 58)

In [None]:
# Persist the frame as a feather file whose name embeds MIN_YEAR.
all_data_feather_path = DATA_FOLDER / f"all_data_rtt_vent_from_{MIN_YEAR}.feather"
data.to_feather(all_data_feather_path)

In [4]:
# Load the feather file with memory mapping, reading only a subset of columns.
# The file name is built from MIN_YEAR, exactly like in the export cell, so the
# reader cannot drift away from the writer if the start date is changed.
data = feather.read_feather(
    DATA_FOLDER / f"all_data_rtt_vent_from_{MIN_YEAR}.feather",
    memory_map=True,
    columns=[
        "NOM_USUEL",
        "LON",
        "LAT",
        "AAAAMMJJ",
        "TN",
        "TX",
        "TM",
        "TNTXM",
        "TAMPLI",
        "DG",
        "DRR",
        "FFM",
        "FF2M",
        "FXY",
        "RR",
    ],
)
data.columns

Index(['NOM_USUEL', 'LON', 'LAT', 'AAAAMMJJ', 'TN', 'TX', 'TM', 'TNTXM',
       'TAMPLI', 'DG', 'DRR', 'FFM', 'FF2M', 'FXY', 'RR'],
      dtype='object')

In [5]:
# Preview the first rows of the reloaded column subset.
data.head()

Unnamed: 0,NOM_USUEL,LON,LAT,AAAAMMJJ,TN,TX,TM,TNTXM,TAMPLI,DG,DRR,FFM,FF2M,FXY,RR
0,ARCHIGNAT-SPC,2.385833,46.3325,20201213,,,,,,,,,,,0.0
1,ARCHIGNAT-SPC,2.385833,46.3325,20201214,,,,,,,,,,,7.2
2,ARCHIGNAT-SPC,2.385833,46.3325,20201215,,,,,,,,,,,3.0
3,ARCHIGNAT-SPC,2.385833,46.3325,20201216,,,,,,,,,,,1.4
4,ARCHIGNAT-SPC,2.385833,46.3325,20201218,,,,,,,,,,,0.0


In [6]:
# One row per distinct (name, longitude, latitude) triple, with the column
# names that the plotting code expects.
station_columns = {"NOM_USUEL": "City", "LON": "Longitude", "LAT": "Latitude"}
cities_data_all = (
    data[list(station_columns)].drop_duplicates().rename(columns=station_columns)
)
cities_data_all.shape

(4059, 3)

In [7]:
def plot_map(df, title="Cities in France", center=None, zoom=5, height=800):
    """Show the rows of ``df`` as labelled points on an interactive map.

    Args:
        df: Frame providing "Latitude", "Longitude" and "City" columns.
        title: Figure title.
        center: Optional ``{"lat": ..., "lon": ...}`` dict giving the map
            centre. Defaults to ``{"lat": 46.6034, "lon": 1.8883}``.
        zoom: Initial zoom level of the map.
        height: Figure height passed to the layout.

    Returns:
        None. The figure is rendered through ``fig.show()``.
    """
    # Built per call so that no mutable default is shared between calls.
    if center is None:
        center = {"lat": 46.6034, "lon": 1.8883}

    # Centre and zoom are given once, here, instead of being repeated in the
    # layout update below.
    fig = px.scatter_mapbox(
        df,
        lat="Latitude",
        lon="Longitude",
        text="City",
        zoom=zoom,
        center=center,
        title=title,
    )

    # Set the mapbox style
    fig.update_layout(
        mapbox_style="carto-darkmatter",  # Choose a map style (e.g., "carto-positron", "stamen-terrain")
        margin={"r": 0, "t": 50, "l": 0, "b": 0},
        height=height,
    )

    fig.show()


# Plot every distinct entry of cities_data_all with the default title.
plot_map(cities_data_all)

In [8]:
# Candidate city names; a later cell matches the first `topk` of them against
# the names in cities_data_all.
# NOTE(review): presumably ordered by decreasing population, since only a
# leading slice of the list is used — confirm.
most_populated_cities = [
    "Paris", "Marseille", "Lyon", "Toulouse", "Nice",
    "Nantes", "Strasbourg", "Montpellier", "Bordeaux", "Lille",
    "Rennes", "Reims", "Le Havre", "Saint-Étienne", "Toulon",
    "Grenoble", "Dijon", "Angers", "Nîmes", "Villeurbanne",
    "Saint-Denis",
]

In [9]:
topk = 15


def _normalize_name(name):
    """Lower-case ``name``, drop accents, and turn hyphens/extra spaces into single spaces."""
    ascii_name = (
        unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")
    )
    return " ".join(ascii_name.lower().replace("-", " ").split())


def _is_city_station(station_key, city_key):
    """Tell whether a normalised station name is the city or starts with it as whole words."""
    return station_key == city_key or station_key.startswith(city_key + " ")


# Comparing whole normalised prefixes (rather than only the first token of the
# station name) lets multi-word and accented city names such as "Le Havre" or
# "Saint-Étienne" match; single-word names match exactly the same stations.
# Each station name is normalised once instead of once per (city, station) pair.
station_keys = {
    station: _normalize_name(station)
    for station in cities_data_all["City"].dropna().unique()
}
cities_from_df = [
    station
    for city_key in map(_normalize_name, most_populated_cities[:topk])
    for station, station_key in station_keys.items()
    if _is_city_station(station_key, city_key)
]
cities_data_most_populated = cities_data_all[
    cities_data_all["City"].isin(cities_from_df)
]
plot_map(cities_data_most_populated, title="Most populated cities in France")

In [10]:
# Alphabetical list of the station names retained in cities_data_most_populated.
sorted(cities_data_most_populated["City"].unique())

['BORDEAUX-MERIGNAC',
 'BORDEAUX-PAULIN',
 'LILLE ISA',
 'LILLE-LESQUIN',
 "LYON TETE D'OR",
 'LYON-BRON',
 'LYON-FOURVIERE',
 'LYON-ST EXUPERY',
 'MARSEILLE',
 'MARSEILLE-OBS',
 'MARSEILLE-ST BARNABE',
 'MARSEILLE-STE MARTHE',
 'MONTPELLIER-AEROPORT',
 'NANTES-BOUGUENAIS',
 'NANTES-VILLE',
 'NICE',
 'NICE-COL',
 'NICE-RIMIEZ',
 'PARIS-MONTSOURIS',
 'PARIS-MONTSOURIS-DOUBLE',
 'REIMS-PRUNAY',
 'RENNES GALLET',
 'RENNES-ST JACQUES',
 'STRASBOURG - BOTANIQUE',
 'STRASBOURG-ENTZHEIM',
 'TOULON',
 'TOULOUSE PERIOLE-SPC',
 'TOULOUSE SAINT-MICHEL-SPC',
 'TOULOUSE-BLAGNAC',
 'TOULOUSE-FRANCAZAL']

In [11]:
# Latest date (AAAAMMJJ) a station must reach to be considered up to date.
LAST_EXPECTED_DATE = 20240331

non_null_cities = []
all_cities = cities_data_most_populated["City"].unique()

# Filter the large frame once and group it, instead of scanning every row of
# `data` again for each candidate station.
candidate_rows = data.loc[
    data["NOM_USUEL"].isin(all_cities), ["NOM_USUEL", "AAAAMMJJ", "TN", "TX"]
]
stations = candidate_rows.groupby("NOM_USUEL", sort=False)

for city, city_data in tqdm(
    stations, desc="Checking null values", total=len(all_cities)
):
    # A station qualifies only if neither TN nor TX has a single missing value.
    has_no_nulls = bool(city_data[["TN", "TX"]].notna().all().all())
    # MIN_YEAR is the lower bound applied when the data was loaded.
    exists_since_start = city_data["AAAAMMJJ"].min() <= MIN_YEAR
    exists_until_2024 = city_data["AAAAMMJJ"].max() >= LAST_EXPECTED_DATE
    if has_no_nulls and exists_since_start and exists_until_2024:
        non_null_cities.append(city)

    if has_no_nulls and not exists_until_2024:
        print("Not up to date data for city:", city, ". Try to add most recent data.")

sorted(non_null_cities)

Checking null values:   0%|          | 0/30 [00:00<?, ?it/s]

['BORDEAUX-MERIGNAC',
 'BORDEAUX-PAULIN',
 'LILLE-LESQUIN',
 'LYON-BRON',
 'LYON-ST EXUPERY',
 'MARSEILLE',
 'MONTPELLIER-AEROPORT',
 'NANTES-BOUGUENAIS',
 'NICE',
 'NICE-RIMIEZ',
 'PARIS-MONTSOURIS',
 'PARIS-MONTSOURIS-DOUBLE',
 'REIMS-PRUNAY',
 'RENNES-ST JACQUES',
 'STRASBOURG - BOTANIQUE',
 'STRASBOURG-ENTZHEIM',
 'TOULON',
 'TOULOUSE-BLAGNAC']

In [12]:
# Stations selected for the final dataset.
cities_to_keep = [
    "TOULOUSE-BLAGNAC",
    "NICE",
    "MARSEILLE",
    "BORDEAUX-MERIGNAC",
    "MONTPELLIER-AEROPORT",
    "NANTES-BOUGUENAIS",
    "LILLE-LESQUIN",
    "STRASBOURG-ENTZHEIM",
    "LYON-ST EXUPERY",
    "PARIS-MONTSOURIS",
    "REIMS-PRUNAY",
    "TOULON",
    "RENNES-ST JACQUES",
]

# non_null_cities only contains stations with complete TN/TX values that also
# cover the required date range, so name exactly which selections fail it.
rejected_cities = [city for city in cities_to_keep if city not in non_null_cities]
assert not rejected_cities, (
    "Cities with null TN/TX values or incomplete date coverage: "
    f"{rejected_cities}"
)

# Informational only: the selection may legitimately be smaller than topk.
if len(cities_to_keep) != topk:
    print(
        "The number of cities to keep is different from the topk:",
        len(cities_to_keep),
        "!=",
        topk,
    )

The number of cities to keep is different from the topk: 13 != 15


In [15]:
# Restrict the location table to the stations listed in cities_to_keep and map them.
kept_mask = cities_data_all["City"].isin(cities_to_keep)
cities_data_most_populated = cities_data_all[kept_mask]
plot_map(cities_data_most_populated, title="Most populated cities in France")

In [14]:
# Keep only the rows of the selected stations. Boolean filtering leaves a
# gappy, non-default index: older pandas releases refuse to write such a frame
# to feather, and newer ones persist the index alongside the data. Resetting it
# makes the export work everywhere and keeps the file free of a stray index.
data_cities_filtered = data[data["NOM_USUEL"].isin(cities_to_keep)].reset_index(
    drop=True
)
# The count in the file name is derived from the list so the two cannot drift
# apart (13 stations still produce the "13_..." name).
data_cities_filtered.to_feather(
    DATA_FOLDER
    / f"{len(cities_to_keep)}_most_populated_cities_data_rtt_vent_from_{MIN_YEAR}.feather"
)