In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Load the raw CSV export; the path is relative to the notebook's directory.
df = pd.read_csv(filepath_or_buffer="../data/electricite.csv")

In [3]:
# --- 1. Basic cleaning ---
# Normalise the address text BEFORE de-duplicating: comparing the raw strings
# would keep rows whose (id, Adresse) pair differs only by surrounding
# whitespace or letter case.
df = df.assign(Adresse=df["Adresse"].str.strip().str.title())
df = df.drop_duplicates(subset=["id", "Adresse"])

# Convert the numeric columns; values that cannot be parsed become NaN
# (errors="coerce") instead of raising.
cols_num = [
    "Annee_construction",
    "Superficie",
    "GES",
    "Electricite_GJ_moyenne_4_ans",
    "Gaz_naturel_GJ_moyenne_4_ans",
    "Mazout_GJ_moyenne_4_ans",
]
df[cols_num] = df[cols_num].apply(pd.to_numeric, errors="coerce")

# --- 2. Missing values ---
# Rows without a Superficie or GES value are dropped; missing values in the
# three energy columns are filled with 0.
cols_energie = [
    "Electricite_GJ_moyenne_4_ans",
    "Gaz_naturel_GJ_moyenne_4_ans",
    "Mazout_GJ_moyenne_4_ans",
]
df = df.dropna(subset=["Superficie", "GES"]).fillna(dict.fromkeys(cols_energie, 0))


In [4]:
# --- 3. Feature engineering ---
df = df.assign(
    # Sum of the three 4-year-average energy columns.
    Conso_totale_GJ=(
        df["Electricite_GJ_moyenne_4_ans"]
        + df["Gaz_naturel_GJ_moyenne_4_ans"]
        + df["Mazout_GJ_moyenne_4_ans"]
    ),
    # Non-positive Superficie values are masked to NaN first, so the ratio is
    # NaN for those rows instead of +/-inf from a division by zero.
    GES_m2=df["GES"] / df["Superficie"].where(df["Superficie"] > 0),
)

# --- 4. Optional geometry, built only when coordinate columns already exist ---
if {"latitude", "longitude"}.issubset(df.columns):
    # Vectorised point construction. Unlike a row-wise df.apply(..., axis=1),
    # this also works on an empty frame (where apply returns a DataFrame
    # rather than a Series).
    df["geometry"] = gpd.points_from_xy(df["longitude"], df["latitude"])
    gdf = gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")
else:
    gdf = df  # stays a plain DataFrame for now


In [5]:
# --- 5. Save ---
# A GeoDataFrame is written as a GeoPackage; a plain DataFrame falls back to
# a CSV file without the index column.
if isinstance(gdf, gpd.GeoDataFrame):
    gdf.to_file("dataset_electricite_nettoye.gpkg", driver="GPKG")
else:
    gdf.to_csv("dataset_electricite_nettoye.csv", index=False)