In [20]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('../../')
from config.settings import settings
from scipy.stats import pearsonr

In [21]:
# Base location of the dataset files, read from the project settings.
# NOTE(review): this value is concatenated directly with a file name when the
# CSV is loaded, so it presumably ends with a path separator — confirm in settings.
DATASETS_BASE = settings.MODULES_TO_DATASETS_PATH

## Transform Dataframe

On récupère le dataframe original, que l'on transforme afin d'avoir les polluants comme features (colonnes) du dataframe. Ce format correspond à l'input de notre modèle de machine learning.

In [22]:
# Load the original table from the CSV whose file name is configured in settings.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids columns ending up with mixed inferred dtypes.
df1_4 = pd.read_csv(DATASETS_BASE + settings.DATA_EU_AIR_F1_4, low_memory=False)

In [23]:
def pollutant_features(d_frame: pd.DataFrame) -> pd.DataFrame:
    """Reshape long-format emission records into one column per pollutant.

    Rows are grouped by ``FacilityInspireID`` and ``reportingYear``; for each
    group the ``emissions`` values of every ``pollutant`` are summed into that
    pollutant's own column. A facility/year with no record for a given
    pollutant gets 0 rather than NaN.

    Parameters
    ----------
    d_frame : pd.DataFrame
        Frame providing the columns ``FacilityInspireID``, ``reportingYear``,
        ``pollutant`` and ``emissions``.

    Returns
    -------
    pd.DataFrame
        The two key columns followed by one summed-emissions column per
        pollutant, with a fresh integer index.
    """
    key_columns = ['FacilityInspireID', 'reportingYear']

    emissions_wide = d_frame.pivot_table(
        index=key_columns,
        columns='pollutant',
        values='emissions',
        aggfunc='sum',
    )

    # While the keys still live in the index, every column is a pollutant, so
    # the whole table can be filled before the keys are moved back to columns.
    return emissions_wide.fillna(0).reset_index()

In [24]:
def df_with_pollutant_features(df_unique: pd.DataFrame, df_pollutant: pd.DataFrame) -> pd.DataFrame:
    """Attach the per-pollutant columns to the facility/year records.

    Performs a left join on ``FacilityInspireID`` and ``reportingYear``: every
    row of ``df_unique`` is kept, and rows without a match in ``df_pollutant``
    get NaN in the columns coming from it.

    Parameters
    ----------
    df_unique : pd.DataFrame
        Left-hand frame; must contain both key columns.
    df_pollutant : pd.DataFrame
        Right-hand frame; must contain both key columns.

    Returns
    -------
    pd.DataFrame
        The columns of ``df_unique`` followed by the non-key columns of
        ``df_pollutant``.
    """
    join_keys = ['FacilityInspireID', 'reportingYear']
    return df_unique.merge(df_pollutant, how='left', on=join_keys)

In [31]:
def display_corr(df_pollutant: pd.DataFrame, first_feature_col: int = 15, threshold: float = 0.75):
    """Plot the feature correlation heatmap and report strongly correlated pairs.

    The Pearson correlation matrix is computed on the columns located from
    position ``first_feature_col`` onward, drawn as a heatmap, and every pair
    of distinct columns whose absolute correlation is strictly greater than
    ``threshold`` is printed together with its correlation and p-value.

    Parameters
    ----------
    df_pollutant : pd.DataFrame
        Frame holding the feature columns.
        NOTE(review): the columns from ``first_feature_col`` onward are
        assumed to be the numeric pollutant features — confirm against the
        layout of the frame passed in.
    first_feature_col : int, default 15
        Positional index of the first column included in the correlation
        matrix.
    threshold : float, default 0.75
        Absolute-correlation cut-off above which a pair is reported. The
        default is an arbitrary choice.

    Returns
    -------
    The object returned by ``sns.heatmap`` (the matplotlib Axes it drew on).
    """
    correlation_matrix = df_pollutant.iloc[:, first_feature_col:].corr(method='pearson')

    plt.figure(figsize=(16, 12))
    heatmap = sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title('Heatmap of Feature Correlations')
    plt.show()

    row_positions, col_positions = np.where(np.abs(correlation_matrix) > threshold)

    # The matrix is symmetric with a unit diagonal: keeping only row < col
    # discards self-correlations and reports each pair exactly once.
    high_correlations = [(correlation_matrix.index[row], correlation_matrix.columns[col])
                         for row, col in zip(row_positions, col_positions)
                         if row < col]

    for first, second in high_correlations:
        # DataFrame.corr excludes missing values pair by pair, whereas pearsonr
        # does not; restrict to complete rows so the printed value agrees with
        # the matrix that selected the pair.
        complete = df_pollutant[[first, second]].dropna()
        corr_value, p_value = pearsonr(complete[first], complete[second])
        print(f"Correlation between {first} and {second}: {corr_value}, P-value: {p_value}")

    return heatmap

In [40]:
def drop_columns(d_frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``d_frame`` without a fixed set of columns.

    The input frame is left untouched. Every listed column must be present:
    ``DataFrame.drop`` raises ``KeyError`` if one is missing.

    Parameters
    ----------
    d_frame : pd.DataFrame
        Frame containing all of the columns listed below.

    Returns
    -------
    pd.DataFrame
        New frame with the listed columns removed.
    """
    columns_to_remove = [
        "countryName",
        "EPRTRSectorCode",
        "EPRTRAnnexIMainActivityCode",
        "facilityNameConfidentialityReason",
        "Longitude",
        "Latitude",
        "addressConfidentialityReason",
        "City",
        "targetRelease",
        "releasesConfidentialityReason",
    ]
    return d_frame.drop(columns=columns_to_remove)

In [26]:
# One row per facility and reporting year (first occurrence kept), without the
# long-format pollutant/emissions columns.
df_unique = (
    df1_4
    .drop_duplicates(subset=['FacilityInspireID', 'reportingYear'])
    .drop(columns=['pollutant', 'emissions'])
)

# One summed-emissions column per pollutant, keyed by facility and reporting year.
df_pollutant = pollutant_features(df1_4)

# Facility/year records with the pollutant columns joined on.
df_final = df_with_pollutant_features(df_unique, df_pollutant)

display_corr(df_final)

In [41]:
# Remove the columns listed in drop_columns from the merged frame.
# NOTE: df_final is rebound to the reduced frame, so this cell is not
# idempotent — running it a second time raises KeyError because the columns
# are already gone. Re-run the cell that builds df_final first.
df_final = drop_columns(df_final)
# Print the remaining columns with their dtypes and non-null counts.
df_final.info()

## Clustering

On cherche à mettre en évidence des patterns cachés