<a href="https://colab.research.google.com/github/Chu-Yichen/QM2-Group-19/blob/main/Correlation_proximity_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Correlation proximity analysis**



In [None]:
def correlation_proximity_analysis(df_siteyear, pollutant_name, pollutant_unit, distances):
    """
    Correlate each site's mean pollutant level with its distance value.

    Rows of ``df_siteyear`` whose ``Year`` falls in the inclusive range
    ``START_YEAR``..``END_YEAR`` (module-level globals) are averaged per
    ``Site Id``, inner-joined to ``distances``, and tested with both Pearson
    and Spearman correlations. The coefficients and p-values are printed and
    a scatter plot is shown; nothing is returned.

    Args:
        df_siteyear: DataFrame with ``Site Id``, ``Year`` and
            ``annual_pollutant`` columns.
        pollutant_name: Label used in the printed output and the plot text.
        pollutant_unit: Unit string shown in the y-axis label.
        distances: Mapping of ``Site Id`` -> distance. The plot labels this
            axis as distance from the airport in km.

    Raises:
        ValueError: If fewer than two sites have both a distance and a mean
            pollutant value, since no correlation can be computed.
    """
    # The year filter below is inclusive at both ends, so the averaging
    # window spans END_YEAR - START_YEAR + 1 calendar years.
    num_years = END_YEAR - START_YEAR + 1

    # (1) Calculate the average pollutant for each site for the date range.
    #   You can adjust this to other time periods or focus only on one specific year.
    in_range = (df_siteyear['Year'] >= START_YEAR) & (df_siteyear['Year'] <= END_YEAR)
    df_yr = df_siteyear[in_range]
    df_yr_avg = (
        df_yr.groupby('Site Id', as_index=False)['annual_pollutant']
        .mean()
        .rename(columns={'annual_pollutant': 'pollutant_yr_mean'})
    )

    # (2) Merge with distance information. The inner join keeps only sites
    #   present in both the pollutant data and the distance mapping.
    dist_df = pd.DataFrame(list(distances.items()), columns=['Site Id', 'distance_km'])
    df_yr_merged = pd.merge(df_yr_avg, dist_df, on='Site Id', how='inner')

    # A single missing value would turn both coefficients into NaN, so keep
    # only sites where both quantities are known.
    df_yr_merged = df_yr_merged.dropna(subset=['distance_km', 'pollutant_yr_mean'])

    if len(df_yr_merged) < 2:
        raise ValueError(
            f"Need at least 2 sites with both a distance and a {pollutant_name} "
            f"value to compute a correlation; found {len(df_yr_merged)}."
        )

    # (3) Perform correlation analysis
    dist_values = df_yr_merged['distance_km']
    mean_values = df_yr_merged['pollutant_yr_mean']
    pearson_corr, pearson_pval = pearsonr(dist_values, mean_values)
    spearman_corr, spearman_pval = spearmanr(dist_values, mean_values)

    print(f"Pearson correlation coefficient between {num_years}-year average {pollutant_name} and distance ({START_YEAR}-{END_YEAR}): {pearson_corr:.4f}, p-value: {pearson_pval:.4g}")
    print(f"Spearman correlation coefficient between {num_years}-year average {pollutant_name} and distance ({START_YEAR}-{END_YEAR}): {spearman_corr:.4f}, p-value: {spearman_pval:.4g}")

    # Visualize scatter plot
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x='distance_km', y='pollutant_yr_mean', data=df_yr_merged, alpha=0.7)
    plt.xlabel('Distance from Airport (km)')
    plt.ylabel(f'Average {pollutant_name} ({pollutant_unit})')
    plt.title(f'{pollutant_name} concentration vs. Distance ({START_YEAR}-{END_YEAR})')
    plt.show()