In [1]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq

### Read parquet file

In [2]:
# Load the merged POI snapshot from parquet and convert it to a pandas frame.
path = Path("../data/processed/merged_20260108_174125.parquet")
table = pq.read_table(path)
df = table.to_pandas()

# Sanity check: column names first, then the (rows, columns) shape.
print(df.columns, df.shape, sep="\n")

Index(['adresse', 'categories_de_poi', 'classements_du_poi',
       'code_postal_et_commune', 'contacts_du_poi',
       'covid19_mesures_specifiques', 'createur_de_la_donnee',
       'date_de_mise_a_jour', 'description', 'latitude', 'longitude',
       'nom_du_poi', 'periodes_regroupees', 'sit_diffuseur', 'uri_id_du_poi',
       'region', 'code_postal', 'commune', 'departement', 'types_list',
       'type_principal', 'main_category', 'sub_category', 'itineraire',
       'h3_r6', 'h3_r7', 'h3_r8', 'h3_r9', 'density_commune',
       'density_commune_norm', 'diversity_commune', 'diversity_commune_norm',
       'rating', 'review_count', 'rating_norm', 'reviews_norm',
       'popularity_norm', 'centroid_lat', 'centroid_lon', 'proximity_commune',
       'proximity_commune_log', 'proximity_commune_norm', 'category_weight',
       'category_weight_norm', 'is_open_now', 'open_hours_norm', 'open_late',
       'open_weekend', 'opening_score', 'opening_score_norm',
       'final_score_raw', 'final

In [5]:
# Keep only the rows whose `itineraire` flag is True, i.e. the POIs retained
# for the itinerary computation.
# `.copy()` makes the result an independent frame: later cells assign new
# columns to `df`, which would otherwise raise SettingWithCopyWarning because
# `df` would still be a filtered slice of the frame loaded above.
# `.eq(True)` is the method form of `== True`; missing values compare as False
# and are therefore dropped, exactly as before.
df = df[df["itineraire"].eq(True)].copy()
print(df.shape)

(208280, 52)


In [6]:
# Row counts per main category, per sub-category and per principal type, each
# sorted from the most to the least frequent value. These three tables feed
# the distribution charts in the following cells.
# (The former bare `.unique()` calls were removed: their results were
# discarded and never displayed, so they had no effect.)
df_main_cat = (
    df.groupby("main_category")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

df_sub_cat = (
    df.groupby("sub_category")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

df_type_prin = (
    df.groupby("type_principal")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)

In [7]:
# Pie chart of the share of each main category, built from the pre-aggregated
# counts in `df_main_cat`.
fig = px.pie(
    df_main_cat,
    values="count",
    names="main_category",
    title="Distribution des catégories de POI",
)
fig.show()


In [8]:
# Bar-style view of the sub-category counts. `df_sub_cat` is already
# aggregated; because `y` is supplied, px.histogram sums `count` per category
# (its documented default aggregation when a value column is given).
fig = px.histogram(
    df_sub_cat,
    x="sub_category",
    y="count",
    title="Distribution des sous catégories de POI",
)
fig.show()

In [9]:
# Bar-style view of the principal-type counts. `df_type_prin` is already
# aggregated; because `y` is supplied, px.histogram sums `count` per type
# (its documented default aggregation when a value column is given).
fig = px.histogram(
    df_type_prin,
    x="type_principal",
    y="count",
    title="Distribution des types principaux de POI",
)
fig.show()

In [10]:
# Number of missing values per column, largest first.
missing = df.isna().sum().sort_values(ascending=False)

# Horizontal bar chart of the missing-value counts.
# Both columns are named explicitly instead of relying on the auto-generated
# "index" / 0 names that reset_index() gives an unnamed Series (and on plotly
# coercing the integer column name 0 to the string "0" used as label key).
fig = px.bar(
    missing.rename_axis("column").reset_index(name="missing_count"),
    x="missing_count",
    y="column",
    orientation="h",
    labels={"column": "Colonne", "missing_count": "Valeurs manquantes"},
    title="Valeurs manquantes par colonne",
)
fig.show()

In [11]:
# Parse the update timestamps; values that cannot be parsed become NaT
# because of errors="coerce".
df["date_de_mise_a_jour"] = pd.to_datetime(
    df["date_de_mise_a_jour"],
    errors="coerce",
)

# Monthly bucket of each update, then the number of rows per month.
df["year_month"] = df["date_de_mise_a_jour"].dt.to_period("M")
freq = df.groupby("year_month").size()

# One bar per month; the period index is rendered as text for the x axis and
# the tick labels are tilted so they stay readable.
fig = px.bar(
    x=freq.index.astype(str),
    y=freq.to_numpy(),
    labels={"x": "Année-Mois", "y": "Nombre de POI mis à jour"},
    title="Fréquence des mises à jour par mois",
).update_layout(xaxis_tickangle=-45)
fig.show()

### Visualisation des données des clusters

In [23]:
## Before the POI selector
# Load the clustered POI snapshot taken before the POI selector step.
path_before_poi_selector = Path("../data/processed/merged_20260109_113342.parquet")
table = pq.read_table(path_before_poi_selector)
df_before_poi_selector = table.to_pandas()
print(df_before_poi_selector.columns)
print(df_before_poi_selector.shape)

# Sub-category counts, one bar group per cluster.
# The title names the pipeline stage so this figure cannot be confused with
# the equivalent chart drawn on the post-selector data.
fig = px.histogram(
    df_before_poi_selector,
    x="sub_category",
    color="cluster_id",
    barmode="group",   # or "stack"
    title="Distribution des sub_category par cluster (avant POI selector)"
)

fig.update_layout(
    xaxis_title="Sub-category",
    yaxis_title="Count",
    xaxis_tickangle=45
)

fig.show()


Index(['poi_id', 'main_category', 'sub_category', 'longitude', 'latitude',
       'final_score', 'h3_r8', 'cluster_id', 'diversity_commune_norm',
       'itineraire'],
      dtype='object')
(2434, 10)


In [27]:
## After the POI selector
# Load the snapshot written after the POI selector step.
# NOTE: `df` is rebound here and from now on refers to this dataset, not to
# the merged frame loaded at the top of the notebook.
path_osrm = Path("../data/processed/merged_20260109_113856_after.parquet")
table = pq.read_table(path_osrm)
df = table.to_pandas()

# Sanity check: column names first, then the (rows, columns) shape.
print(df.columns, df.shape, sep="\n")


Index(['cluster_id', 'poi_id', 'latitude', 'longitude', 'main_category',
       'sub_category', 'final_score', 'mixed_score'],
      dtype='object')
(63, 8)


In [28]:
# Preview the first 20 rows of the frame currently bound to `df`.
df.head(n=20)

Unnamed: 0,cluster_id,poi_id,latitude,longitude,main_category,sub_category,final_score,mixed_score
0,0,990,48.85618,2.355726,Culture & Musées,Bibliothèques & médiation,0.604947,0.542401
1,0,1156,48.854904,2.356299,Culture & Musées,Bibliothèques & médiation,0.604112,0.541816
2,0,2379,48.86067,2.35221,Culture & Musées,Bibliothèques & médiation,0.69366,0.6045
3,0,1161,48.862473,2.345099,Culture & Musées,Cinéma & audiovisuel,0.696061,0.51565
4,0,77,48.875347,2.331067,Culture & Musées,Spectacle vivant,0.571424,0.549112
5,0,1137,48.856796,2.352297,Culture & Musées,Spectacle vivant,0.608901,0.575346
6,0,292,48.870564,2.347487,Culture & Musées,Spectacle vivant,0.528424,0.519012
7,0,728,48.861847,2.351169,Gastronomie & Restauration,Restauration rapide,0.678745,0.528219
8,0,75,48.862816,2.34406,Gastronomie & Restauration,Restauration rapide,0.681089,0.529859
9,0,1098,48.862667,2.343973,Sports & Loisirs,Loisirs indoor,0.732364,0.611475


In [26]:
# Sub-category counts, one bar group per cluster, for the frame currently
# bound to `df` (the post-selector dataset when the notebook is run top to
# bottom). The title names the pipeline stage so this figure cannot be
# confused with the equivalent chart drawn on the pre-selector data.
fig = px.histogram(
    df,
    x="sub_category",
    color="cluster_id",
    barmode="group",   # or "stack"
    title="Distribution des sub_category par cluster (après POI selector)"
)

fig.update_layout(
    xaxis_title="Sub-category",
    yaxis_title="Count",
    xaxis_tickangle=45
)

fig.show()