In [1]:
from pathlib import Path

import pandas as pd
import plotly.express as px
import pyarrow.parquet as pq

### Read parquet file

In [3]:
# Load the merged parquet export through pyarrow and convert it to pandas.
path = Path("../output/datatourisme/merged_20251231_173759.parquet")

table = pq.read_table(path)
df = table.to_pandas()

# Quick sanity check: column names first, then the (rows, columns) shape.
print(df.columns, df.shape, sep="\n")

Index(['adresse_postale', 'classements_du_poi', 'contacts_du_poi',
       'date_de_mise_a_jour', 'description', 'latitude', 'longitude',
       'nom_du_poi', 'periodes_regroupees', 'uri_id_du_poi', 'region',
       'code_postal', 'commune', 'département', 'type_principal',
       'main_category', 'sub_category'],
      dtype='object')
(9305, 17)


In [4]:
def count_by_column(frame: pd.DataFrame, column: str) -> pd.DataFrame:
    """Count the rows of ``frame`` for each distinct value of ``column``.

    Args:
        frame: Table to summarise.
        column: Name of the column whose values are counted.

    Returns:
        A two-column frame (``column`` and ``"count"``) ordered from the most
        to the least frequent value. Rows where ``column`` is missing are left
        out, because ``groupby`` drops NaN keys by default.
    """
    return (
        frame.groupby(column)
        .size()
        .reset_index(name="count")
        .sort_values(by="count", ascending=False)
    )


# One frequency table per categorical column; each one feeds a chart below.
df_main_cat = count_by_column(df, "main_category")
df_sub_cat = count_by_column(df, "sub_category")
df_type_prin = count_by_column(df, "type_principal")

In [5]:
# Share of POIs per main category, drawn as a pie chart from the
# pre-computed counts in df_main_cat.
fig = px.pie(
    df_main_cat,
    values="count",
    names="main_category",
    title="Distribution des catégories de POI",
)
fig.show()


In [6]:
# df_sub_cat already holds one aggregated count per sub-category, so the
# values are drawn directly as bars; px.histogram would aggregate them a
# second time and label the axis "sum of count".
fig = px.bar(
    df_sub_cat,
    x="sub_category",
    y="count",
    title="Distribution des sous catégories de POI",
)
fig.show()

In [7]:
# df_type_prin already holds one aggregated count per main type, so the
# values are drawn directly as bars; px.histogram would aggregate them a
# second time and label the axis "sum of count".
fig = px.bar(
    df_type_prin,
    x="type_principal",
    y="count",
    title="Distribution des types principaux de POI",
)
fig.show()

In [8]:
# Number of missing values per column, largest first.
missing = df.isna().sum().sort_values(ascending=False)

# Name both columns explicitly instead of relying on the implicit
# "index" / 0 names that a bare reset_index() produces; the names double
# as the axis labels of the chart.
missing_table = missing.rename_axis("Colonne").reset_index(name="Valeurs manquantes")

fig = px.bar(
    missing_table,
    x="Valeurs manquantes",
    y="Colonne",
    orientation="h",
    title="Valeurs manquantes par colonne",
)
fig.show()

In [9]:
# Parse the update timestamps in place; values that cannot be parsed become
# NaT because of errors="coerce".
df["date_de_mise_a_jour"] = pd.to_datetime(
    df["date_de_mise_a_jour"],
    errors="coerce",
)

# Bucket every POI by the calendar month of its last update and count the
# rows per bucket (groupby leaves out rows whose month is NaT by default).
df["year_month"] = df["date_de_mise_a_jour"].dt.to_period("M")
freq = df.groupby("year_month").size()

# Periods are converted to strings so the x axis shows "YYYY-MM" labels;
# the tick labels are tilted to keep them readable.
fig = px.bar(
    x=freq.index.astype(str),
    y=freq.values,
    labels={"x": "Année-Mois", "y": "Nombre de POI mis à jour"},
    title="Fréquence des mises à jour par mois",
).update_layout(xaxis_tickangle=-45)
fig.show()