<a href="https://colab.research.google.com/github/AWAafi/Anacode/blob/main/MSPR3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pour Fusionner les données ensemble

---



In [1]:
import pandas as pd

# Lecture des fichiers CSV
df_chomage = pd.read_csv('data_chomage.csv')
df_creat_entreprise = pd.read_csv('data_creations_entreprises.csv')
df_criminalite = pd.read_csv('data_criminalite.csv', sep=';')
df_election = pd.read_csv('data_election.csv')
df_population = pd.read_csv('data_population.csv')

df_final = pd.DataFrame()

# Harmonisation des colonnes
df_chomage['Année'] = df_chomage['Année'].astype(str).str[-2:].astype(int)
df_creat_entreprise['Année'] = df_creat_entreprise['Année'].astype(str).str[-2:].astype(int)
df_criminalite.rename(columns={'annee': 'Année', 'CODGEO_2024': 'Code Commune', 'faits': 'Nb Faits Divers'}, inplace=True)

# Fusion des DataFrames
df_final = df_population.copy()

df_final = pd.merge(df_final, df_criminalite, on=['Code Commune', 'Année'], how='left')
df_final = pd.merge(df_final, df_chomage, on=['Année'], how='left')
df_final = pd.merge(df_final, df_creat_entreprise, on=['Année'], how='left')
df_final = pd.merge(df_final, df_election, on=['Code Commune', 'Année'], how='left')

# Nettoyage des noms de candidats (problèmes d'encodage)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('Jean-Luc Mï¿½LENCHON', 'Jean-Luc MELENCHON', regex=False)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('Franï¿½ois FILLON', 'Francois FILLON', regex=False)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('Benoï¿½t HAMON', 'Benoit HAMON', regex=False)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('Franï¿½ois ASSELINEAU', 'Francois ASSELINEAU', regex=False)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('ï¿½ric ZEMMOUR', 'Eric ZEMMOUR', regex=False)
df_final['Nom_Prenom_Gagnant'] = df_final['Nom_Prenom_Gagnant'].str.replace('Valï¿½rie Pï¿½CRESSE', 'Valerie PECRESSE', regex=False)

# Affichage et export
print(df_final.head())

df_final.to_csv('final_data.csv', index=False)


  Code Commune               Nom Commune  Population Code Departement  \
0        01001  L' Abergement-Clémenciat         776               01   
1        01002    L' Abergement-de-Varey         248               01   
2        01004         Ambérieu-en-Bugey       14035               01   
3        01005       Ambérieux-en-Dombes        1689               01   
4        01006                   Ambléon         111               01   

                 Région  Année  Nb Faits Divers  Taux de chômage  Créations  \
0  Auvergne-Rhône-Alpes     17              0.0            8.125    4024.25   
1  Auvergne-Rhône-Alpes     17              0.0            8.125    4024.25   
2  Auvergne-Rhône-Alpes     17            643.0            8.125    4024.25   
3  Auvergne-Rhône-Alpes     17             15.0            8.125    4024.25   
4  Auvergne-Rhône-Alpes     17              0.0            8.125    4024.25   

   Inscrits  Nb_Votant  Nom_Prenom_Gagnant  
0     598.0      495.0       Marine LE PE

Les prédictions


In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

# Chargement du jeu de données
donnees = pd.read_csv("final_data.csv")

# Filtrage sur l'année de base
ANNEE_REF = 22
ANNEE_PRECE = 17
donnees_2022 = donnees[donnees["Année"] == ANNEE_REF].copy()

# Variables concernées
vars_diff = ['Population', 'Nb Faits Divers', 'Taux de chômage', 'Créations', 'Inscrits', 'Nb_Votant']
vars_locales = ['Population', 'Nb Faits Divers', 'Inscrits', 'Nb_Votant']
vars_macro = ['Taux de chômage', 'Créations']

# Génération des colonnes de différence entre 2022 et 2017
colonnes_diff = {}
for v in vars_diff:
    df_tmp = donnees[['Code Commune', 'Année', v]].groupby(['Code Commune', 'Année']).sum().reset_index()
    pivot = df_tmp.pivot(index='Code Commune', columns='Année', values=v)
    nom_col_diff = f"{v}_Delta_{ANNEE_REF}_{ANNEE_PRECE}"

    if ANNEE_REF in pivot.columns and ANNEE_PRECE in pivot.columns:
        pivot[nom_col_diff] = pivot[ANNEE_REF] - pivot[ANNEE_PRECE]
        donnees_2022 = donnees_2022.merge(pivot[[nom_col_diff]].reset_index(), on='Code Commune', how='left')
        colonnes_diff[v] = nom_col_diff

donnees_2022.fillna({col: 0 for col in colonnes_diff.values()}, inplace=True)

# Encodage des colonnes catégorielles
donnees_2022["Région"] = donnees_2022["Région"].astype(str)
donnees_2022["Code Departement"] = donnees_2022["Code Departement"].astype(str)

# Cible : gagnant
donnees_2022 = donnees_2022.dropna(subset=["Nom_Prenom_Gagnant"])
encodeur_cible = LabelEncoder()
cible = encodeur_cible.fit_transform(donnees_2022["Nom_Prenom_Gagnant"])
mapping_candidats = dict(zip(encodeur_cible.classes_, encodeur_cible.transform(encodeur_cible.classes_)))

# Définition des features
num_features = vars_locales + [colonnes_diff[v] for v in vars_locales if v in colonnes_diff]
macro_features = [colonnes_diff[v] for v in vars_macro if v in colonnes_diff]
cat_features = ["Région", "Code Departement"]

features_X = [col for col in num_features + macro_features + cat_features if col in donnees_2022.columns]
X = donnees_2022[features_X].copy()

# Pipelines
pipeline_num = Pipeline([
    ("imput", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

cat_in_X = [c for c in cat_features if c in X.columns]
num_in_X = [c for c in num_features if c in X.columns]
reste = [c for c in features_X if c not in cat_in_X + num_in_X]

preprocesseur = ColumnTransformer([
    ("numeriques", pipeline_num, num_in_X),
    ("categoriels", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_in_X),
    ("restants", SimpleImputer(strategy="median"), reste)
])

X_train, X_test, y_train, y_test = train_test_split(X, cible, test_size=0.2, random_state=42, stratify=cible)

X_train_enc = preprocesseur.fit_transform(X_train)
X_test_enc = preprocesseur.transform(X_test)

# Modèle 1 : Random Forest
modele_rf = RandomForestClassifier(n_estimators=125, class_weight='balanced', random_state=42, n_jobs=-1)
modele_rf.fit(X_train_enc, y_train)
print("RF Accuracy:", accuracy_score(y_test, modele_rf.predict(X_test_enc)))

# Modèle 2 : Gradient Boosting
modele_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
modele_gb.fit(X_train_enc, y_train)
print("GB Accuracy:", accuracy_score(y_test, modele_gb.predict(X_test_enc)))

# Modèle 3 : KMeans
nb_clusters = 250
km = KMeans(n_clusters=nb_clusters, n_init=10, random_state=42)
km.fit(X_train_enc)
clusters_train = km.labels_

# Associer chaque cluster à un gagnant majoritaire
assoc_cluster = {}
for i in range(nb_clusters):
    indices = np.where(clusters_train == i)[0]
    if len(indices):
        gagnant = Counter(y_train[indices]).most_common(1)[0][0]
    else:
        gagnant = Counter(y_train).most_common(1)[0][0]
    assoc_cluster[i] = gagnant

# Prédictions KMeans
clusters_test = km.predict(X_test_enc)
pred_kmeans = np.array([assoc_cluster.get(c, -1) for c in clusters_test])
valid_kmeans = pred_kmeans != -1
print("KMeans Accuracy:", accuracy_score(y_test[valid_kmeans], pred_kmeans[valid_kmeans]))


  donnees = pd.read_csv("final_data.csv")


RF Accuracy: 0.6521054139215124
GB Accuracy: 0.6631337725580063
KMeans Accuracy: 0.6541105700372386
