In [48]:
import polars as pl

In [49]:
# Load the raw dataset from a CSV file located next to the notebook.
# NOTE(review): `ignore_errors=True` asks the reader to keep going when values
# fail to parse instead of raising, which can hide malformed rows — confirm this
# is intended, or consider explicit schema overrides.
df = pl.read_csv('tlse_raw_data.csv', ignore_errors=True)

# Analyse statistique des colonnes pour la sélection des features pertinentes

On utilise les outils présents dans PyCharm afin d'avoir une vue d'ensemble du contenu des colonnes du dataset.

<img alt="&#39;stats&#39;" src="stat_features.png"/>


In [50]:
# Restrict the raw frame to sales of flats and houses, keep the columns used as
# features, and derive calendar features from the transaction date.
# NOTE: this cell rebinds `df`. Re-running it without reloading the CSV fails,
# because `nature_mutation` is not part of the selected columns.
df = (
    df
    .filter(
        # Sales only, with a known property type limited to flats and houses.
        # Outbuildings ("Dépendance") are dropped: there is too little data for
        # them and they disturb training.
        (pl.col("nature_mutation") == "Vente")
        & pl.col("type_local").is_not_null()
        & pl.col("type_local").is_in(["Appartement", "Maison"])
    )
    .select(
        "id_mutation",
        # Parse the ISO-formatted string into a Date; strict=False means an
        # unparseable value does not raise.
        pl.col("date_mutation").str.strptime(
            pl.Date,
            format="%Y-%m-%d",
            strict=False,
        ),
        "numero_disposition",
        "valeur_fonciere",
        "adresse_nom_voie",
        "adresse_code_voie",
        "code_postal",
        "type_local",
        "code_type_local",
        "surface_reelle_bati",
        "nombre_pieces_principales",
        "longitude",
        "latitude",
        # Keep only the first three characters of the section prefix.
        pl.col("section_prefixe").str.slice(0, 3),
        "nombre_lots",
    )
    .with_columns(
        # Calendar features extracted from the parsed transaction date.
        annee_mutation=pl.col("date_mutation").dt.year(),
        mois_mutation=pl.col("date_mutation").dt.month(),
        jour_mutation=pl.col("date_mutation").dt.day(),
        jour_sem_mutation=pl.col("date_mutation").dt.weekday(),
    )
)

In [51]:
# Remove outliers on sale price and on built surface.
# Price bounds are the 5th and 95th percentiles of `valeur_fonciere` in `df`.
q_low, q_high = (
    df.select(pl.col("valeur_fonciere").quantile(q)).item()
    for q in (0.05, 0.95)
)

# Keep rows whose price lies within [q_low, q_high] (both bounds included) and
# whose built surface is strictly greater than 9.
df_clean = df.filter(
    pl.col("valeur_fonciere").is_between(q_low, q_high, closed="both")
    & (pl.col("surface_reelle_bati") > 9)
)

In [52]:
# Display the outlier-filtered frame (rich repr of the last expression).
df_clean

id_mutation,date_mutation,numero_disposition,valeur_fonciere,adresse_nom_voie,adresse_code_voie,code_postal,type_local,code_type_local,surface_reelle_bati,nombre_pieces_principales,longitude,latitude,section_prefixe,nombre_lots,annee_mutation,mois_mutation,jour_mutation,jour_sem_mutation
str,date,i64,f64,str,str,i64,str,i64,f64,i64,f64,f64,str,i64,i32,i8,i8,i8
"""2022-469305""",2022-09-02,1,235693.0,"""AV MAIGNAN""","""5512""",31200,"""Appartement""",2,78.0,3,1.43473,43.616526,"""802""",1,2022,9,2,5
"""2022-469776""",2022-08-31,1,90000.0,"""RUE JACQUELINE AURIOL""","""4434""",31400,"""Appartement""",2,24.0,1,,,"""837""",1,2022,8,31,3
"""2022-449760""",2022-01-04,1,490952.0,"""BD LAZARE CARNOT""","""5132""",31000,"""Appartement""",2,90.0,4,1.452051,43.602981,"""821""",2,2022,1,4,2
"""2022-449764""",2022-01-03,1,377650.0,"""RUE DES TROENES""","""8588""",31200,"""Maison""",1,109.0,6,1.414822,43.616255,"""829""",0,2022,1,3,1
"""2022-449771""",2022-01-05,1,164200.0,"""CHEM DU DOCTEUR DIDIER DASQUE""","""2693""",31400,"""Appartement""",2,70.0,3,1.45283,43.554988,"""839""",1,2022,1,5,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""2021-487386""",2021-12-14,1,139400.0,"""RUE DES CEDRES""","""1816""",31400,"""Appartement""",2,57.0,3,1.456801,43.571702,"""838""",2,2021,12,14,2
"""2021-487397""",2021-10-19,1,290000.0,"""RUE DE NEGRENEYS""","""6236""",31200,"""Appartement""",2,56.0,3,1.440515,43.6168,"""803""",1,2021,10,19,2
"""2021-487397""",2021-10-19,1,290000.0,"""RUE DU PROF JAMMES""","""7152""",31200,"""Appartement""",2,58.0,3,1.440515,43.6168,"""803""",1,2021,10,19,2
"""2021-487406""",2021-10-27,1,137312.0,"""IMP DE BEAUCAIRE""","""0954""",31500,"""Appartement""",2,72.0,3,1.464742,43.628267,"""833""",1,2021,10,27,3


In [53]:
# Sanity check: list the distinct property types left in `df` after filtering.
df["type_local"].unique().to_list()

['Maison', 'Appartement']

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation between the numeric columns only.
# `df` also holds string columns (e.g. `id_mutation`, `type_local`) and a date
# column. With pandas >= 2.0, `DataFrame.corr()` defaults to numeric_only=False
# and raises on such columns, so non-numeric columns are dropped first.
correlation_matrix = df.to_pandas().select_dtypes(include="number").corr()

# Draw on an explicit Axes rather than relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Matrice de corrélation")
plt.show()

In [54]:
# Shuffle all rows (fraction=1.0 keeps every row) with a fixed seed so the
# resulting order is reproducible.
# NOTE(review): this shuffles `df`, not the outlier-filtered `df_clean` —
# confirm which frame the following cells are meant to use.
df = df.sample(
    fraction=1.0,
    shuffle=True,
    seed=42,
)