<a href="https://colab.research.google.com/github/Chu-Yichen/QM2-Group-19/blob/main/show_daily_line_chart_%26_show_annual_line_chart_%E2%80%8B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Show Daily Line Chart & Regression Analysis**

In [None]:
def show_daily_line_chart(df_plot_daily, pollutant_name, pollutant_unit):
    """Plot the daily pollutant means of both site groups with linear trend lines.

    Rows of ``df_plot_daily`` whose 'Date Local' falls between the
    module-level START_DATE and END_DATE (both ends inclusive) are kept.
    Each date is converted to its proleptic ordinal so it can serve as the
    numeric predictor of a least-squares fit, once for
    'pollutant_mean_nearby' and once for 'pollutant_mean_all'.

    The function prints the result of ``test_homoscedasticity_independence``
    (run on the 'pollutant_mean_all' series only), prints both regression
    summaries, and shows one figure containing the two daily series and
    their fitted lines. Nothing is returned, and the input frame is not
    modified because the filtered rows are copied first.

    Args:
        df_plot_daily: DataFrame with the columns 'Date Local',
            'pollutant_mean_nearby' and 'pollutant_mean_all'.
        pollutant_name: Pollutant label used in the legend, title and y-axis.
        pollutant_unit: Unit label appended to the y-axis caption.
    """
    # NOTE(review): 'pollutant_mean_all' is presented as "Other Sites" in all
    # output below; confirm upstream that this column excludes nearby sites.

    # Restrict to the configured window; copy so the new column stays local.
    date_column = df_plot_daily['Date Local']
    in_window = (date_column >= START_DATE) & (date_column <= END_DATE)
    daily = df_plot_daily[in_window].copy()

    # Ordinal day numbers act as the numeric x-axis for the regressions.
    daily['numeric_date'] = daily['Date Local'].map(pd.Timestamp.toordinal)
    day_number = daily['numeric_date']

    # Diagnostic test on the 'pollutant_mean_all' series before fitting.
    diagnostics = test_homoscedasticity_independence(day_number, daily['pollutant_mean_all'])
    print(diagnostics)

    # (printed heading, value column) for each site group, nearby first.
    site_groups = (
        ("[Nearby Sites]:", 'pollutant_mean_nearby'),
        ("[Other Sites]:", 'pollutant_mean_all'),
    )

    # Fit every group before printing anything about the regressions.
    fits = {}
    for _, value_column in site_groups:
        slope, intercept, r_value, p_value, std_err = linregress(
            day_number,
            daily[value_column],
        )
        fits[value_column] = (slope, intercept, r_value, p_value, std_err)

    print("==== [DAILY] Regression Analysis (Filtered) ====")
    for heading, value_column in site_groups:
        slope, intercept, r_value, p_value, std_err = fits[value_column]
        print(heading)
        print(f"  Slope = {slope:.6f}, Intercept = {intercept:.6f}")
        print(f"  R-value = {r_value:.4f},  P-value = {p_value:.4g},  StdErr = {std_err:.6f}\n")

    plt.figure(figsize=(10, 6))

    # Observed daily series, drawn in the same order as the summaries.
    observed_series = (
        ('pollutant_mean_nearby', f'Nearby Sites Daily {pollutant_name} Mean'),
        ('pollutant_mean_all', f'Other Sites Daily {pollutant_name} Mean'),
    )
    for value_column, legend_text in observed_series:
        sns.lineplot(
            x='Date Local',
            y=value_column,
            data=daily,
            label=legend_text,
        )

    # Evaluate both fitted lines on the ordinal axis, then draw them against
    # the real dates so they share the x-axis with the observed series.
    fitted_values = {
        value_column: fits[value_column][0] * day_number + fits[value_column][1]
        for _, value_column in site_groups
    }
    plot_dates = daily['Date Local']
    trend_lines = (
        ('pollutant_mean_nearby', 'red', 'Regression (Nearby)'),
        ('pollutant_mean_all', 'blue', 'Regression (Other)'),
    )
    for value_column, line_colour, legend_text in trend_lines:
        plt.plot(
            plot_dates,
            fitted_values[value_column],
            color=line_colour,
            linestyle='--',
            label=legend_text,
        )

    plt.title(f'Daily {pollutant_name} Concentration with Regression')
    plt.xlabel('Date')
    plt.ylabel(f'Daily Average {pollutant_name} ({pollutant_unit})')
    plt.legend()
    plt.tight_layout()
    plt.show()

# **Show Annual Line Chart & Regression Analysis**

In [None]:
def show_annual_line_chart(df_nearby_daily, df_all_daily, pollutant_name, pollutant_unit):
    """Plot annual pollutant means of both site groups with linear trend lines.

    Daily values are averaged per calendar year for each input frame, the two
    annual tables are inner-joined on the year, and the result is restricted
    to the module-level START_YEAR..END_YEAR range (both ends inclusive).
    A least-squares line is then fitted to each annual series.

    The function prints both annual tables and both regression summaries and
    shows one figure with the annual series and their fitted lines. Nothing
    is returned. The input frames are left untouched: the year is derived on
    a new frame rather than written into the caller's data.

    Args:
        df_nearby_daily: DataFrame with a datetime-like 'Date Local' column
            (the ``.dt`` accessor is used) and 'pollutant_mean_nearby'.
        df_all_daily: DataFrame with a datetime-like 'Date Local' column and
            'pollutant_mean_all'.
        pollutant_name: Pollutant label used in the legend, title and y-axis.
        pollutant_unit: Unit label appended to the y-axis caption.
    """
    # Attach the year on a *new* frame (DataFrame.assign returns a new object)
    # so the caller's DataFrames do not gain a 'year' column as a side effect
    # and no SettingWithCopyWarning is raised when they are slices.
    nearby_with_year = df_nearby_daily.assign(year=df_nearby_daily['Date Local'].dt.year)
    all_with_year = df_all_daily.assign(year=df_all_daily['Date Local'].dt.year)

    # Group by 'year' and calculate the mean for each year
    df_nearby_annual = nearby_with_year.groupby('year')['pollutant_mean_nearby'].mean().reset_index()
    df_all_annual = all_with_year.groupby('year')['pollutant_mean_all'].mean().reset_index()

    # Dump the raw annual means (unfiltered) so the numbers behind the chart
    # can be inspected directly.
    print("Annual mean for nearby sites:\n", df_nearby_annual)

    print(f"\nAnnual mean for other sites in {CITY_NAME}\n", df_all_annual)

    # Inner join: only years present in both tables are kept for plotting.
    df_plot_annual = pd.merge(df_nearby_annual, df_all_annual, on='year', how='inner')

    # Filter the merged table to the configured year range; copy to detach it
    # from df_plot_annual.
    df_annual_filtered = df_plot_annual[
        (df_plot_annual['year'] >= START_YEAR) &
        (df_plot_annual['year'] <= END_YEAR)
    ].copy()

    # The year itself is the predictor for both regressions.
    x_year = df_annual_filtered['year']

    # Perform linear regression for Nearby Sites
    slope_near_annual, intercept_near_annual, r_value_near_annual, p_value_near_annual, std_err_near_annual = linregress(
        x_year,
        df_annual_filtered['pollutant_mean_nearby']
    )

    # Perform linear regression for Other Sites
    slope_all_annual, intercept_all_annual, r_value_all_annual, p_value_all_annual, std_err_all_annual = linregress(
        x_year,
        df_annual_filtered['pollutant_mean_all']
    )

    # Print regression results
    print("==== [ANNUAL] Regression Analysis (Filtered) ====")
    print("[Nearby Sites]:")
    print(f"  Slope = {slope_near_annual:.6f}, Intercept = {intercept_near_annual:.6f}")
    print(f"  R-value = {r_value_near_annual:.4f},  P-value = {p_value_near_annual:.4g},  StdErr = {std_err_near_annual:.6f}\n")

    print("[Other Sites]:")
    print(f"  Slope = {slope_all_annual:.6f}, Intercept = {intercept_all_annual:.6f}")
    print(f"  R-value = {r_value_all_annual:.4f},  P-value = {p_value_all_annual:.4g},  StdErr = {std_err_all_annual:.6f}\n")

    # Visualization Section
    plt.figure(figsize=(10, 5))

    # Line chart for annual pollutant values (filtered by the specified year range)
    sns.lineplot(
        x='year',
        y='pollutant_mean_nearby',
        data=df_annual_filtered,
        marker='o',
        label=f'Nearby Sites Annual {pollutant_name} Mean'
    )
    sns.lineplot(
        x='year',
        y='pollutant_mean_all',
        data=df_annual_filtered,
        marker='o',
        label=f'Other Sites Annual {pollutant_name} Mean'
    )

    # Evaluate the fitted lines at the observed years and overlay them.
    y_pred_near_annual = slope_near_annual * x_year + intercept_near_annual
    y_pred_all_annual = slope_all_annual * x_year + intercept_all_annual

    plt.plot(
        x_year,
        y_pred_near_annual,
        color='red',
        linestyle='--',
        label='Regression (Nearby)'
    )
    plt.plot(
        x_year,
        y_pred_all_annual,
        color='blue',
        linestyle='--',
        label='Regression (Other)'
    )

    plt.title(f'Annual {pollutant_name} Concentration with Regression')
    plt.xlabel('Year')
    plt.ylabel(f'Annual Average {pollutant_name} ({pollutant_unit})')
    plt.legend()
    plt.tight_layout()
    plt.show()