# Import modules and functions

In [None]:
import os
import subprocess
from owslib.wfs import WebFeatureService
import shapely.wkt
import geopandas as gpd
import json
from pathlib import Path
import urllib
import urllib.request
import gzip
import pandas as pd
import fiona  # Importing fiona to handle geospatial data
import sys
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
import seaborn as sns

# Function to install a package
def install(package):
    """Install *package* with pip, run through the interpreter executing this notebook.

    Raises ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Ensure required packages are installed
# `tqdm` is bound to the progress-bar callable (not the module) because the
# notebook later calls `tqdm(...)` directly.
try:
    from tqdm import tqdm
except ImportError as e:
    # ImportError.name is the bare module name; the last word of str(e) is the
    # same name still wrapped in quotes, which pip cannot resolve.
    package = e.name or "tqdm"
    install(package)
    from tqdm import tqdm

import warnings

# Suppress every FutureWarning raised from here on
warnings.simplefilter('ignore', FutureWarning)

# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Define File Paths

In [None]:
# Step 2: Define File Paths
# NOTE: every path below is an absolute location on one specific Windows
# machine; adjust them before running the notebook anywhere else.
print("Step 2: Define File Paths")
# Shapefile that is loaded as the municipality boundary
municipality_workspace = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\RAW_DATA\AMSTERDAM_MUNICIPALITY\AmsterdamMunicipality.shp"
# Folder holding the air-quality shapefiles
air_quality_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\RAW_DATA\DATA_AIR_QUALITY\Clean_AMS_DataMixed_3Pollutants"
# NO2 shapefile inside the air-quality folder
NO2_file = os.path.join(air_quality_folder, 'Clean_AMS_DataMixed_NO2_5Jul.shp')
# Shapefile that is loaded as the tree-canopy layer
tree_canopy_path = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\Clipped_Tree_Data.shp"
# Folder that receives every file written by this notebook
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"


## Load study area and data

In [None]:
# Step 3: Load Shapefiles
# Read the three input layers; any failure is reported and stops the run.
print("Step 3: Load Shapefiles")
try:
    municipality_boundary = gpd.read_file(municipality_workspace)
    NO2_data = gpd.read_file(NO2_file)
    tree_canopy_data = gpd.read_file(tree_canopy_path)
except Exception as load_error:
    print(f"Error loading shapefiles: {load_error}")
    sys.exit(1)
else:
    print("Shapefiles loaded successfully")


## Verify Coordinate system

In [None]:
# Step 4: Ensure Coordinate Systems Match
# The municipality layer's CRS is the reference; the other two layers are
# reprojected to it only when their CRS differs.
print("Step 4: Ensure Coordinate Systems Match")
try:
    municipality_crs = municipality_boundary.crs

    no2_matches = NO2_data.crs == municipality_crs
    if not no2_matches:
        NO2_data = NO2_data.to_crs(municipality_crs)
        print("NO2 data reprojected to match municipality CRS")

    trees_match = tree_canopy_data.crs == municipality_crs
    if not trees_match:
        tree_canopy_data = tree_canopy_data.to_crs(municipality_crs)
        print("Tree canopy data reprojected to match municipality CRS")
except Exception as crs_error:
    print(f"Error in checking/reprojecting CRS: {crs_error}")
    sys.exit(1)
else:
    print("Coordinate systems verified and matched")

In [None]:
## Clip NO2 Data to the municipality

In [None]:
# Step 5: Clip NO2 Data to the Municipality Boundary
# Keep only the NO2 features inside the boundary and write them to disk.
print("Step 5: Clip NO2 data to the municipality boundary")
try:
    clipped_NO2_data = gpd.clip(NO2_data, municipality_boundary)
    clipped_output_path = os.path.join(output_folder, "clipped_NO2_data.shp")
    clipped_NO2_data.to_file(clipped_output_path)
except Exception as clip_error:
    print(f"Error in clipping NO2 data: {clip_error}")
    sys.exit(1)
else:
    print("NO2 data clipped successfully")

## Buffer

In [None]:
# Step 6: Buffer the NO2 data
# The geometry column of `clipped_NO2_data` is replaced in place by the
# buffers, and each buffer's area is stored in `shp_area_bf`.
print("Step 6: Buffer the NO2 data")
try:
    # Distance is expressed in CRS units — presumably metres; confirm the CRS is projected.
    buffer_distance = 15
    buffered_geometry = clipped_NO2_data.geometry.buffer(buffer_distance, cap_style='flat')
    clipped_NO2_data['geometry'] = buffered_geometry
    clipped_NO2_data['shp_area_bf'] = clipped_NO2_data.geometry.area
    buffered_output_path = os.path.join(output_folder, "buffered_NO2_data.shp")
    clipped_NO2_data.to_file(buffered_output_path)
except Exception as buffer_error:
    print(f"Error in creating buffers: {buffer_error}")
    sys.exit(1)
else:
    print("Buffer creation completed")

In [None]:
from tqdm import tqdm
# Step 7: Process each buffer to identify trees within them and calculate the fraction of tree coverage
print("Step 7: Identify trees within buffers and calculate sum of tree areas")
# Collects the rows returned by process_buffer; rows for which it returns None are left out
processed_buffers = []

def process_buffer(buffer, tree_canopy, original_crs, visualize=False, verbose=True):
    """Measure how much of one buffer polygon is covered by tree canopy.

    Parameters
    ----------
    buffer : row of the buffered NO2 layer (as yielded by ``iterrows``); must
        expose ``geometry`` and ``shp_area_bf``.
    tree_canopy : GeoDataFrame of tree-canopy geometries.
    original_crs : CRS of the buffer layer; ``tree_canopy`` is reprojected to
        it when its own CRS differs.
    visualize : when True, plot the buffer outline and the clipped trees.
    verbose : when True (the default), print the per-buffer debug figures.

    Returns
    -------
    The ``buffer`` row with ``sum_area_tb`` (clipped tree area) and
    ``frac_area_tb`` (that area divided by ``shp_area_bf``) added, or ``None``
    when the buffer has no positive area, the fraction exceeds 1, or an
    exception is raised while processing.
    """
    try:
        # Use the original CRS from the GeoDataFrame to ensure the CRS matches
        if tree_canopy.crs != original_crs:
            tree_canopy = tree_canopy.to_crs(original_crs)
            print("Reprojected tree canopy data to match buffer CRS")

        buffer_geom = buffer.geometry
        buffer_area = buffer['shp_area_bf']

        # A zero, negative or NaN area would turn the fraction into NaN/inf
        if not buffer_area > 0:
            print(f"Warning: buffer at index {buffer.name} has no usable area ({buffer_area}). Skipping.")
            return None

        # Query the layer's spatial index for trees intersecting the buffer
        # instead of scanning every tree; positions are sorted so the rows
        # keep the layer's order.
        match_positions = sorted(tree_canopy.sindex.query(buffer_geom, predicate='intersects'))
        # Copy so the geometry assignment below never writes into `tree_canopy`
        trees_within_buffer = tree_canopy.iloc[match_positions].copy()

        if not trees_within_buffer.empty:
            # Keep only the part of each tree geometry that lies inside the buffer
            trees_within_buffer['geometry'] = trees_within_buffer.geometry.intersection(buffer_geom)
            intersection_area = trees_within_buffer.geometry.area.sum()

            if verbose:
                print(f"Buffer ID: {buffer.name}")
                print(f"Buffer Area: {buffer_area}")
                print(f"Intersection Area (Tree Coverage): {intersection_area}")
                print(f"Calculated Fraction: {intersection_area / buffer_area}")
                print(f"Tree Geometry Types after Intersection: {trees_within_buffer.geometry.type.unique()}")
        else:
            intersection_area = 0

        # Calculate the fraction of the buffer covered by tree canopy
        buffer['sum_area_tb'] = intersection_area
        buffer['frac_area_tb'] = intersection_area / buffer_area

        # A fraction above 1 is reported and the buffer is dropped
        if buffer['frac_area_tb'] > 1:
            print(f"Warning: frac_area_tb exceeded 1 for buffer at index {buffer.name}. Inspect the data.")
            return None

        # Visualization: plot the buffer and intersecting trees
        if visualize:
            fig, ax = plt.subplots()
            buffer_gdf = gpd.GeoDataFrame([buffer], crs=tree_canopy.crs)
            buffer_gdf.plot(ax=ax, facecolor='none', edgecolor='blue', linewidth=2, label='Buffer')
            if not trees_within_buffer.empty:
                trees_within_buffer.plot(ax=ax, color='green', alpha=0.5, label='Intersecting Trees')
            plt.legend()
            plt.title(f"Buffer {buffer.name} and Intersecting Trees")
            plt.show()

        return buffer

    except Exception as e:
        # Best effort: one failing buffer is reported and skipped, not fatal
        print(f"Error processing buffer: {e}")
        return None

# Process all buffered NO2 data
# The row position (not the index label) selects the buffer to plot, so the
# first row is visualized even when the index does not start at 0.
for position, (idx, buffer) in enumerate(
    tqdm(clipped_NO2_data.iterrows(), total=clipped_NO2_data.shape[0])
):
    buffer_result = process_buffer(buffer, tree_canopy_data, clipped_NO2_data.crs, visualize=(position == 0))
    if buffer_result is not None:
        processed_buffers.append(buffer_result)


In [None]:
# Step 8: Combine processed buffers and save results
# Nothing is written when no buffer survived processing.
if not processed_buffers:
    print("No buffers were processed successfully")
else:
    processed_buffers_gdf = gpd.GeoDataFrame(processed_buffers, crs=clipped_NO2_data.crs)
    results_path = os.path.join(output_folder, "processed_buffers.shp")
    processed_buffers_gdf.to_file(results_path)
    print("Processed buffers successfully and saved")

In [None]:
# Position in `processed_buffers` of the buffer to inspect (0-based, so 0 is
# the first processed buffer)
buffer_index = 35050

try:
    buffer_to_visualize = processed_buffers[buffer_index]
except IndexError as err:
    raise IndexError(
        f"buffer_index {buffer_index} is out of range: {len(processed_buffers)} buffers were processed"
    ) from err

# Clip every intersecting tree to the buffer; the copy keeps the assignment
# from writing into a slice of `tree_canopy_data`.
buffer_geometry = buffer_to_visualize['geometry']
intersecting_trees = tree_canopy_data[tree_canopy_data.intersects(buffer_geometry)].copy()
intersecting_trees['geometry'] = intersecting_trees.geometry.intersection(buffer_geometry)

fig, ax = plt.subplots()
gpd.GeoDataFrame([buffer_to_visualize], crs=tree_canopy_data.crs).plot(ax=ax, facecolor='none', edgecolor='blue', linewidth=2, label='Buffer')
intersecting_trees.plot(ax=ax, color='green', alpha=0.5, label='Intersecting Trees')
plt.legend()
# The title shows the 1-based ordinal of the buffer
plt.title(f'Buffer {buffer_index + 1} and Intersecting Trees')
plt.show()


## Visualization and Analysis (including scatter plots)

In [None]:
# Step 9: Visualization and Analysis (including scatter plots)

tree_fraction = processed_buffers_gdf['frac_area_tb']
no2_values = processed_buffers_gdf['NO2_Data']

# Correlation between tree fraction and NO2
correlation = tree_fraction.corr(no2_values)
print(f"Correlation: {correlation}")

# Fit NO2 against tree fraction; the predictions form the trend line
X = tree_fraction.to_numpy().reshape(-1, 1)
y = no2_values.to_numpy()
model = LinearRegression().fit(X, y)
trend_line = model.predict(X)

# Order the trend line by tree fraction before drawing it
sorted_indices = np.argsort(tree_fraction)
sorted_frac_area_tb = tree_fraction.values[sorted_indices]
sorted_trend_line = trend_line[sorted_indices]

# Scatter of the raw points with the fitted line on top
plt.figure(figsize=(10, 6))
plt.scatter(tree_fraction, no2_values, alpha=0.5, label='Data Points')
plt.plot(sorted_frac_area_tb, sorted_trend_line, color='red', label='Trend Line')
plt.title('Scatter Plot of Tree Fraction vs. NO2 Data')
plt.xlabel('Fraction of Trees')
plt.ylabel('NO2 Data')
plt.grid()
plt.legend()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Put frac_area_tb into four fixed-width classes; the lowest edge sits
# just below zero so a fraction of exactly 0 falls in the first class
tree_bins = [-0.001, 0.25, 0.5, 0.75, 1.0]
tree_labels = [
    'Low Tree Cover (<= 25%)',
    'Medium-Low Tree Cover (25-50%)',
    'Medium-High Tree Cover (50-75%)',
    'High Tree Cover (> 75%)',
]
processed_buffers_gdf['Tree_Cover_Binned'] = pd.cut(
    processed_buffers_gdf['frac_area_tb'],
    tree_bins,
    labels=tree_labels,
    include_lowest=True,
)

In [None]:
# Step 2: Split NO2_Data into four quantile classes and store them as NO2_Binned
no2_class_names = ['Low NO2', 'Medium-Low NO2', 'Medium-High NO2', 'High NO2']
no2_bins = pd.qcut(processed_buffers_gdf['NO2_Data'], q=4, labels=no2_class_names)
processed_buffers_gdf['NO2_Binned'] = no2_bins


In [None]:
# Categorical columns are cast to text first so they can be joined with ' / '
no2_text = processed_buffers_gdf['NO2_Binned'].astype(str)
tree_text = processed_buffers_gdf['Tree_Cover_Binned'].astype(str)
processed_buffers_gdf['NO2_Tree_Combination'] = no2_text + ' / ' + tree_text

# Debugging: preview the source values next to the derived classes
preview_columns = ['frac_area_tb', 'NO2_Data', 'Tree_Cover_Binned', 'NO2_Binned', 'NO2_Tree_Combination']
print(processed_buffers_gdf[preview_columns].head())




In [None]:


# Step 4: Tally how many rows fall into each NO2 / tree-cover combination
combination_column = processed_buffers_gdf['NO2_Tree_Combination']
combination_counts = combination_column.value_counts()
print(combination_counts)

## Quantile-based bins for evenly distributed data points

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Quantile classes for NO2_Data, so each class holds roughly the same
# number of rows; the bin edges are kept for the report below
quantile_names = ['Bin 1', 'Bin 2', 'Bin 3', 'Bin 4']
no2_quantile_classes, bin_edges_no2_quantile = pd.qcut(
    processed_buffers_gdf['NO2_Data'], q=4, labels=quantile_names, retbins=True
)
processed_buffers_gdf['NO2_Binned_Quantile'] = no2_quantile_classes

# Report the value range covered by each NO2 quantile class
print("NO2_Data classified into quantile-based bins (evenly distributed data points):")
edge_pairs = zip(bin_edges_no2_quantile[:-1], bin_edges_no2_quantile[1:])
for bin_number, (lower_edge, upper_edge) in enumerate(edge_pairs, start=1):
    print(f"Bin {bin_number}: from {lower_edge:.4f} to {upper_edge:.4f}")

# Step 2: Fixed-width classes for tree coverage (this overwrites the labels of
# any earlier Tree_Cover_Binned column)
tree_bins = [-0.001, 0.25, 0.5, 0.75, 1.0]
tree_labels = ['Bin 1: -0.001 to 0.25', 'Bin 2: 0.25 to 0.50', 'Bin 3: 0.50 to 0.75', 'Bin 4: 0.75 to 1.00']
processed_buffers_gdf['Tree_Cover_Binned'] = pd.cut(
    processed_buffers_gdf['frac_area_tb'], tree_bins, labels=tree_labels, include_lowest=True
)

# Step 3: Preview the classification
print(processed_buffers_gdf[['NO2_Data', 'NO2_Binned_Quantile', 'frac_area_tb', 'Tree_Cover_Binned']].head())

# Step 4: Rows per class
no2_bin_quantile_counts = processed_buffers_gdf['NO2_Binned_Quantile'].value_counts()
tree_bin_counts = processed_buffers_gdf['Tree_Cover_Binned'].value_counts()

# Steps 5 and 6: one bar chart per class distribution (NO2 first, then trees)
bar_charts = (
    (no2_bin_quantile_counts, 'blue',
     "Distribution of NO2 Data (Quantile-Based Bins with Evenly Distributed Data Points)",
     "NO2 Data Quantile Bins"),
    (tree_bin_counts, 'green',
     "Distribution of Tree Coverage (Predefined Bins)",
     "Tree Coverage Bins"),
)
for class_counts, bar_color, chart_title, x_label in bar_charts:
    plt.figure(figsize=(10, 6))
    class_counts.sort_index().plot(kind='bar', color=bar_color, alpha=0.7)
    plt.title(chart_title)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.grid(True)
    plt.show()

# Step 7: Print the counts behind both charts
print("NO2_Data Quantile-Based Bin Distribution:")
print(no2_bin_quantile_counts)

print("\nTree Coverage Bin Distribution:")
print(tree_bin_counts)


In [None]:
# Inspect the binned tree-cover column; it is stored as 'Tree_Cover_Binned'
# (no 'frac_area_tb_binned' column is created in the preceding cells).
print(processed_buffers_gdf['Tree_Cover_Binned'].dtype)
print(processed_buffers_gdf[['Tree_Cover_Binned']].head())



In [None]:
import os
import geopandas as gpd

# Folder that receives the exported HTML maps
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    The layer is reprojected to EPSG:4326 when its CRS string differs. The map
    is coloured by `column` with `cmap`, shows `column` in the tooltip, uses
    `basemap` as tiles, and is written to `filename` inside the module-level
    `output_folder`. `title` is forwarded to `explore()` unchanged.
    """
    already_wgs84 = gdf.crs.to_string() == 'EPSG:4326'
    layer = gdf if already_wgs84 else gdf.to_crs(epsg=4326)

    explore_options = dict(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=column,
        title=title,
        tiles=basemap,
    )
    m = layer.explore(**explore_options)

    m.save(os.path.join(output_folder, filename))

# One map per classified column: the NO2 quantile classes first, then the
# tree-cover classes
binned_map_specs = (
    ('NO2_Binned_Quantile', 'NO2 Quantile Heat Map', 'NO2_Quantile_Heat_Map.html', 'RdYlBu'),
    ('Tree_Cover_Binned', 'Tree Canopy Quantile Heat Map', 'Tree_Canopy_Quantile_Heat_Map.html', 'YlGn'),
)
for map_column, map_title, map_file, map_cmap in binned_map_specs:
    plot_interactive_heat_map_explore(
        processed_buffers_gdf, map_column, map_title, map_file, cmap=map_cmap
    )


In [None]:
#this displays the gray map 
import os
import geopandas as gpd

# Folder that receives the exported HTML maps
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    Parameters
    ----------
    gdf : GeoDataFrame to draw; reprojected to EPSG:4326 when its CRS string differs.
    column : column used for colouring; also listed in the tooltip.
    title : forwarded to ``explore()`` unchanged.
    filename : file name of the HTML map inside the module-level ``output_folder``.
    cmap : colormap name passed to ``explore()``.
    basemap : tile layer passed to ``explore()``.

    Returns ``None``. An empty `gdf` is reported and skipped without writing a file.
    """
    # An empty selection has nothing to draw
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # `column` may itself be NO2_Data or frac_area_tb; list each field only once
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    m = gdf.explore(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap,
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)

# One map per classified column: the NO2 quantile classes first, then the
# tree-cover classes
binned_map_specs = (
    ('NO2_Binned_Quantile', 'NO2 Quantile Heat Map', 'NO2_Quantile_Heat_Map.html', 'RdYlBu'),
    ('Tree_Cover_Binned', 'Tree Canopy Quantile Heat Map', 'Tree_Canopy_Quantile_Heat_Map.html', 'YlGn'),
)
for map_column, map_title, map_file, map_cmap in binned_map_specs:
    plot_interactive_heat_map_explore(
        processed_buffers_gdf, map_column, map_title, map_file, cmap=map_cmap
    )


In [None]:
import os
import geopandas as gpd

# Folder that receives the exported HTML maps
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    Parameters
    ----------
    gdf : GeoDataFrame to draw; reprojected to EPSG:4326 when its CRS string differs.
    column : column used for colouring; also listed in the tooltip.
    title : forwarded to ``explore()`` unchanged.
    filename : file name of the HTML map inside the module-level ``output_folder``.
    cmap : colormap name passed to ``explore()``.
    basemap : tile layer passed to ``explore()``.

    Returns ``None``. An empty `gdf` is reported and skipped without writing a file.
    """
    # An empty selection has nothing to draw
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # `column` may itself be NO2_Data or frac_area_tb; list each field only once
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    m = gdf.explore(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap,
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)
    print(f"Map saved to {output_path}")

# NOTE(review): `processed_buffers_gdf_copy` and its `NO2_Binned_Value` /
# `Tree_Cover_Binned_Value` columns are not created in the visible part of this
# notebook — confirm they exist before running this cell.
no2_quartile = processed_buffers_gdf_copy['NO2_Binned_Value']
tree_quartile = processed_buffers_gdf_copy['Tree_Cover_Binned_Value']

# Rows in the top quartile of both NO2 and tree canopy
top_25_no2_top_25_tree = processed_buffers_gdf_copy[(no2_quartile == 'Q4') & (tree_quartile == 'Q4')]

# Rows in the bottom quartile of both NO2 and tree canopy
bottom_25_no2_bottom_25_tree = processed_buffers_gdf_copy[(no2_quartile == 'Q1') & (tree_quartile == 'Q1')]

# Both selections are coloured by their NO2 values
extreme_map_specs = (
    (top_25_no2_top_25_tree, 'Top 25% NO2 & Top 25% Tree Canopy Heat Map', 'Top_25_NO2_Tree_Heat_Map.html', 'YlOrRd'),
    (bottom_25_no2_bottom_25_tree, 'Bottom 25% NO2 & Bottom 25% Tree Canopy Heat Map', 'Bottom_25_NO2_Tree_Heat_Map.html', 'YlGn'),
)
for selection, map_title, map_file, map_cmap in extreme_map_specs:
    plot_interactive_heat_map_explore(selection, 'NO2_Data', map_title, map_file, cmap=map_cmap)


In [None]:
import os
import geopandas as gpd

# Folder that receives the exported HTML maps
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    Parameters
    ----------
    gdf : GeoDataFrame to draw; reprojected to EPSG:4326 when its CRS string differs.
    column : column used for colouring; also listed in the tooltip.
    title : forwarded to ``explore()`` unchanged.
    filename : file name of the HTML map inside the module-level ``output_folder``.
    cmap : colormap name passed to ``explore()``.
    basemap : tile layer passed to ``explore()``.

    Returns ``None``. An empty `gdf` is reported and skipped without writing a file.
    """
    # An empty selection has nothing to draw
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # `column` may itself be NO2_Data or frac_area_tb; list each field only once
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    m = gdf.explore(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap,
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)
    print(f"Map saved to {output_path}")

# NOTE(review): `processed_buffers_gdf_copy` and its `*_Binned_Value` columns
# are not created in the visible part of this notebook — confirm they exist
# before running this cell.
top_label = 'Q4'

# Steps 1 and 2: rows in the top quartile of tree canopy, then of NO2
top_25_tree = processed_buffers_gdf_copy[processed_buffers_gdf_copy['Tree_Cover_Binned_Value'] == top_label]
top_25_no2 = processed_buffers_gdf_copy[processed_buffers_gdf_copy['NO2_Binned_Value'] == top_label]

# Steps 3 and 4: each selection is coloured by its own measured value
top_quartile_specs = (
    (top_25_tree, 'frac_area_tb', 'Top 25% Tree Canopy Heat Map', 'Top_25_Tree_Canopy_Heat_Map.html', 'YlGn'),
    (top_25_no2, 'NO2_Data', 'Top 25% NO2 Heat Map', 'Top_25_NO2_Heat_Map.html', 'YlOrRd'),
)
for selection, value_column, map_title, map_file, map_cmap in top_quartile_specs:
    plot_interactive_heat_map_explore(selection, value_column, map_title, map_file, cmap=map_cmap)


In [None]:
import os
import geopandas as gpd

# Folder that receives the exported HTML maps
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    Parameters
    ----------
    gdf : GeoDataFrame to draw; reprojected to EPSG:4326 when its CRS string differs.
    column : column used for colouring; also listed in the tooltip.
    title : forwarded to ``explore()`` unchanged.
    filename : file name of the HTML map inside the module-level ``output_folder``.
    cmap : colormap name passed to ``explore()``.
    basemap : tile layer passed to ``explore()``.

    Returns ``None``. An empty `gdf` is reported and skipped without writing a file.
    """
    # An empty selection has nothing to draw
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # `column` may itself be NO2_Data or frac_area_tb; list each field only once
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    m = gdf.explore(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap,
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)
    print(f"Map saved to {output_path}")

# Create a function to map each quartile separately for NO2 and Tree Canopy
def plot_quartiles_interactive_maps(gdf, column, title_prefix, cmap, output_file_prefix,
                                    quartiles=('Q1', 'Q2', 'Q3', 'Q4')):
    """Write one interactive map per quartile label found in `column`.

    Parameters
    ----------
    gdf : GeoDataFrame holding the classified rows.
    column : column containing the quartile labels; also used for colouring.
    title_prefix : each map title is ``"<title_prefix> - <label>"``.
    cmap : colormap name forwarded to ``plot_interactive_heat_map_explore``.
    output_file_prefix : each file is named ``"<output_file_prefix>_<label>.html"``.
    quartiles : labels to map, in order; defaults to Q1..Q4.

    A label that matches no row is reported and skipped.
    """
    for quartile in quartiles:
        filtered_gdf = gdf[gdf[column] == quartile]
        if filtered_gdf.empty:
            print(f"No data available for {title_prefix} - {quartile}. Skipping map.")
            continue
        plot_interactive_heat_map_explore(
            filtered_gdf,
            column=column,  # Use the binned column for visualization
            title=f"{title_prefix} - {quartile}",
            filename=f"{output_file_prefix}_{quartile}.html",
            cmap=cmap,
        )

# NOTE(review): `processed_buffers_gdf_copy` and its `*_Binned_Value` columns
# are not created in the visible part of this notebook — confirm they exist
# before running this cell.
# Per-quartile maps: NO2 first, then tree canopy coverage (frac_area_tb)
quartile_map_specs = (
    ('NO2_Binned_Value', 'NO2 Data Quartiles', 'RdYlBu', 'NO2_Quartile_Heat_Map'),
    ('Tree_Cover_Binned_Value', 'Tree Canopy Quartiles', 'YlGn', 'Tree_Canopy_Quartile_Heat_Map'),
)
for binned_column, title_prefix, map_cmap, file_prefix in quartile_map_specs:
    plot_quartiles_interactive_maps(
        processed_buffers_gdf_copy, binned_column, title_prefix, map_cmap, file_prefix
    )


## Division into quartiles: top 25% NO2 and top 25% tree cover

In [None]:
import pandas as pd
import os
import geopandas as gpd

# Step 1: Copy the original GeoDataFrame to avoid modifying the original
processed_buffers_gdf_copy_equal_quartiles = processed_buffers_gdf.copy()


def _equal_sized_quartiles(values, labels=('Q1', 'Q2', 'Q3', 'Q4')):
    """Label `values` with equal-sized quantile classes (the last label is the top class).

    `pd.qcut` raises ValueError when tied values make two bin edges coincide.
    In that case the values are ranked first (ties broken by row order) so the
    classes stay equal-sized; rows sharing one value may then land in
    neighbouring classes.
    """
    try:
        return pd.qcut(values, q=len(labels), labels=list(labels))
    except ValueError:
        return pd.qcut(values.rank(method='first'), q=len(labels), labels=list(labels))


# Step 2: Create equal-sized quartiles for NO2 levels and tree canopy
processed_buffers_gdf_copy_equal_quartiles['NO2_Binned_Quantile'] = _equal_sized_quartiles(
    processed_buffers_gdf_copy_equal_quartiles['NO2_Data']
)
processed_buffers_gdf_copy_equal_quartiles['Tree_Cover_Binned_Quantile'] = _equal_sized_quartiles(
    processed_buffers_gdf_copy_equal_quartiles['frac_area_tb']
)

# Step 3: Convert all categorical columns in the DataFrame to string
for col in processed_buffers_gdf_copy_equal_quartiles.select_dtypes(include='category').columns:
    processed_buffers_gdf_copy_equal_quartiles[col] = processed_buffers_gdf_copy_equal_quartiles[col].astype(str)

# Save the processed GeoDataFrame into a new file (for safe analysis)
output_file_path = r'C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY\processed_buffers_equal_quartiles.gpkg'  # GeoPackage format

# Create the target folder when missing; exist_ok avoids the check-then-create race
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Save as a GeoPackage
processed_buffers_gdf_copy_equal_quartiles.to_file(output_file_path, driver='GPKG')
print(f"Processed data with equal-sized quartiles saved to: {output_file_path}")


In [None]:
import os
import geopandas as gpd

# Folder that receives the exported HTML maps (same as before)
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Create the folder when missing; exist_ok avoids the check-then-create race
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Draw `column` of `gdf` as an interactive map and save it as HTML.

    Parameters
    ----------
    gdf : GeoDataFrame to draw; reprojected to EPSG:4326 when its CRS string differs.
    column : column used for colouring; also listed in the tooltip.
    title : forwarded to ``explore()`` unchanged.
    filename : file name of the HTML map inside the module-level ``output_folder``.
    cmap : colormap name passed to ``explore()``.
    basemap : tile layer passed to ``explore()``.

    Returns ``None``. An empty `gdf` is reported and skipped without writing a file.
    """
    # An empty selection has nothing to draw
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # `column` may itself be NO2_Data or frac_area_tb; list each field only once
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    m = gdf.explore(
        column=column,
        cmap=cmap,
        legend=True,
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap,
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)
    print(f"Map saved to {output_path}")

# Steps 1 and 2: one map of the tree-canopy quartiles, then one of the NO2
# quartiles, both drawn from the equal-quartile copy
equal_quartile_specs = (
    ('Tree_Cover_Binned_Quantile', 'Tree Canopy Quartiles Heat Map', 'Tree_Canopy_Quartiles_Heat_Map.html', 'YlGn'),
    ('NO2_Binned_Quantile', 'NO2 Quartiles Heat Map', 'NO2_Quartiles_Heat_Map.html', 'RdYlBu'),
)
for quartile_column, map_title, map_file, map_cmap in equal_quartile_specs:
    plot_interactive_heat_map_explore(
        processed_buffers_gdf_copy_equal_quartiles, quartile_column, map_title, map_file, cmap=map_cmap
    )


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np

# Step 1: Select the four corner combinations of the NO2 and tree-canopy
# quartiles (Q4 = top 25%, Q1 = bottom 25%)
quartile_frame = processed_buffers_gdf_copy_equal_quartiles
no2_is_top = quartile_frame['NO2_Binned_Quantile'] == 'Q4'
no2_is_bottom = quartile_frame['NO2_Binned_Quantile'] == 'Q1'
tree_is_top = quartile_frame['Tree_Cover_Binned_Quantile'] == 'Q4'
tree_is_bottom = quartile_frame['Tree_Cover_Binned_Quantile'] == 'Q1'

# Top 25% NO2 with Top 25% Tree Canopy
top_25_no2_top_25_tree = quartile_frame[no2_is_top & tree_is_top]

# Top 25% NO2 with Bottom 25% Tree Canopy
top_25_no2_bottom_25_tree = quartile_frame[no2_is_top & tree_is_bottom]

# Bottom 25% NO2 with Top 25% Tree Canopy
bottom_25_no2_top_25_tree = quartile_frame[no2_is_bottom & tree_is_top]

# Bottom 25% NO2 with Bottom 25% Tree Canopy
bottom_25_no2_bottom_25_tree = quartile_frame[no2_is_bottom & tree_is_bottom]

# Step 2: Function to plot scatter plot with regression, correlation, and intercept
def plot_scatter_with_regression(df, title):
    """Scatter NO2 against tree fraction and overlay a linear fit.

    Parameters
    ----------
    df : table with ``frac_area_tb`` (x) and ``NO2_Data`` (y) columns.
    title : label used in the plot title and in the printed messages.

    Prints the slope, intercept and signed correlation when at least two rows
    are available. An empty `df` is reported and skipped. Returns ``None``.
    """
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    plt.figure(figsize=(10, 6))
    plt.scatter(df['frac_area_tb'], df['NO2_Data'], alpha=0.5)

    # Prepare data for regression
    X = df['frac_area_tb'].values.reshape(-1, 1)
    y = df['NO2_Data'].values

    # X and y come from the same rows, so one length check covers both
    has_regression = len(X) > 1
    if has_regression:
        model = LinearRegression()
        model.fit(X, y)
        slope = model.coef_[0]
        intercept = model.intercept_
        r_squared = model.score(X, y)
        # Rounding can leave R² a hair below zero; clamp it before the square
        # root. The correlation takes the sign of the slope.
        r_magnitude = np.sqrt(max(r_squared, 0.0))
        correlation = r_magnitude if slope >= 0 else -r_magnitude

        # Draw the fitted line left to right so it is one clean stroke
        order = np.argsort(X[:, 0])
        trend_line = model.predict(X)
        plt.plot(X[order, 0], trend_line[order], 'r--', label=f'Regression Line (R² = {r_squared:.2f})')

        # Print the regression coefficient and intercept
        print(f"{title} - Regression Coefficient (Slope): {slope:.4f}, Intercept: {intercept:.4f}, Correlation Coefficient: {correlation:.4f}")
    else:
        print(f"Insufficient data for regression for {title}. Skipping regression.")

    plt.title(f"{title} - Scatter Plot with Regression")
    # NOTE(review): the label says "%" while frac_area_tb is plotted as stored — confirm the intended unit.
    plt.xlabel('Fraction of Tree Coverage (%)')
    plt.ylabel('NO2 Data')
    plt.grid(True)
    # Only the regression line carries a label; skip the legend without one
    if has_regression:
        plt.legend()
    plt.show()

# Step 3: One scatter plot with regression per quartile combination, in the
# order: top/top, top/bottom, bottom/top, bottom/bottom (NO2 / tree canopy)
quadrant_plots = (
    (top_25_no2_top_25_tree, "Top 25% NO2 Data & Top 25% Tree Canopy"),
    (top_25_no2_bottom_25_tree, "Top 25% NO2 Data & Bottom 25% Tree Canopy"),
    (bottom_25_no2_top_25_tree, "Bottom 25% NO2 Data & Top 25% Tree Canopy"),
    (bottom_25_no2_bottom_25_tree, "Bottom 25% NO2 Data & Bottom 25% Tree Canopy"),
)
for quadrant_df, quadrant_title in quadrant_plots:
    plot_scatter_with_regression(quadrant_df, quadrant_title)

# Step 4: Optionally add more combined analysis like violin/boxplots if needed


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np

# Step 1: Define the combinations based on the quartiles

# Build the four boolean masks once (Q4 = top quartile, Q1 = bottom quartile),
# then combine them pairwise to select each NO2 / tree-canopy subset.
_no2_quartile = processed_buffers_gdf_copy_equal_quartiles['NO2_Binned_Quantile']
_tree_quartile = processed_buffers_gdf_copy_equal_quartiles['Tree_Cover_Binned_Quantile']

_no2_is_top = _no2_quartile == 'Q4'
_no2_is_bottom = _no2_quartile == 'Q1'
_tree_is_top = _tree_quartile == 'Q4'
_tree_is_bottom = _tree_quartile == 'Q1'

# Top 25% NO2 with Top 25% Tree Canopy
top_25_no2_top_25_tree = processed_buffers_gdf_copy_equal_quartiles[_no2_is_top & _tree_is_top]

# Top 25% NO2 with Bottom 25% Tree Canopy
top_25_no2_bottom_25_tree = processed_buffers_gdf_copy_equal_quartiles[_no2_is_top & _tree_is_bottom]

# Bottom 25% NO2 with Top 25% Tree Canopy
bottom_25_no2_top_25_tree = processed_buffers_gdf_copy_equal_quartiles[_no2_is_bottom & _tree_is_top]

# Bottom 25% NO2 with Bottom 25% Tree Canopy
bottom_25_no2_bottom_25_tree = processed_buffers_gdf_copy_equal_quartiles[_no2_is_bottom & _tree_is_bottom]

# Step 2: Function to plot scatter plot with regression, correlation, and intercept
def plot_scatter_with_regression(df, title):
    """Scatter-plot ``NO2_Data`` against ``frac_area_tb`` with a linear trend line.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Table containing the columns ``frac_area_tb`` (x-axis) and
        ``NO2_Data`` (y-axis).
    title : str
        Text used in the figure title and in the printed messages.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; the slope, intercept and
        Pearson correlation are printed. An empty ``df`` prints a notice and
        draws nothing.

    Notes
    -----
    Rows with a missing value in either column are excluded from the
    regression, because ``LinearRegression.fit`` raises on NaN input.
    """
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    # Scatter plot with regression line and correlation
    plt.figure(figsize=(10, 6))
    plt.scatter(df['frac_area_tb'], df['NO2_Data'], alpha=0.5)

    # Prepare data for regression: keep only rows where both values are present.
    complete = df[['frac_area_tb', 'NO2_Data']].dropna()
    X = complete['frac_area_tb'].values.reshape(-1, 1)
    y = complete['NO2_Data'].values

    # Perform linear regression (needs at least two complete observations)
    has_trend_line = len(complete) > 1
    if has_trend_line:
        model = LinearRegression()
        model.fit(X, y)
        slope = model.coef_[0]
        intercept = model.intercept_
        r_squared = model.score(X, y)
        # Pearson r computed directly rather than as +/-sqrt(R^2): the square
        # root form reports a perfect correlation for a constant response and
        # can yield nan when R^2 rounds to a tiny negative number.
        correlation = np.corrcoef(X[:, 0], y)[0, 1]

        # Plot the regression line
        trend_line = model.predict(X)
        plt.plot(X[:, 0], trend_line, 'r--', label=f'Regression Line (R² = {r_squared:.2f})')

        # Print the regression coefficient and intercept
        print(f"{title} - Regression Coefficient (Slope): {slope:.4f}, Intercept: {intercept:.4f}, Correlation Coefficient: {correlation:.4f}")
    else:
        print(f"Insufficient data for regression for {title}. Skipping regression.")

    plt.title(f"{title} - Scatter Plot with Regression")
    plt.xlabel('Fraction of Tree Coverage (%)')
    plt.ylabel('NO2 Data')
    plt.grid(True)
    # Only the trend line carries a label; calling legend() without any
    # labelled artist just emits a matplotlib warning.
    if has_trend_line:
        plt.legend()
    plt.show()

# Step 3: Function to create histograms for NO2 Data and Tree Coverage
def plot_histograms(df, title):
    """Show two 20-bin histograms for ``df``: ``NO2_Data``, then ``frac_area_tb``.

    Each histogram is drawn in its own 10x6 figure and displayed immediately.
    When ``df`` is empty a notice is printed and nothing is drawn.
    """
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    # (column, bar colour, figure title, x-axis label) for each histogram
    panels = (
        ('NO2_Data', 'blue', f"{title} - NO2 Data Histogram", 'NO2 Data'),
        ('frac_area_tb', 'green', f"{title} - Tree Coverage Histogram", 'Fraction of Tree Coverage (%)'),
    )
    for column_name, bar_colour, heading, x_label in panels:
        plt.figure(figsize=(10, 6))
        plt.hist(df[column_name], bins=20, alpha=0.7, color=bar_colour, edgecolor='black')
        plt.title(heading)
        plt.xlabel(x_label)
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()

# Step 4: Function to create violin and boxplots for NO2 Data based on Tree Coverage
def plot_violin_boxplot(df, title):
    """Overlay a violin plot and a narrow box plot of ``NO2_Data`` per tree-cover bin.

    ``df`` must provide the columns ``Tree_Cover_Binned_Quantile`` (x-axis
    groups) and ``NO2_Data`` (y-axis). When ``df`` is empty a notice is
    printed and nothing is drawn.
    """
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    # Both layers share the same grouping, value column and data source.
    shared_args = dict(x='Tree_Cover_Binned_Quantile', y='NO2_Data', data=df)

    plt.figure(figsize=(10, 6))

    # Violin layer first, so the box plot is drawn on top of it
    sns.violinplot(inner=None, color='lightgray', **shared_args)
    sns.boxplot(width=0.2, boxprops={'alpha': 0.6}, **shared_args)

    plt.title(f"{title} - Combined Violin and Box Plot")
    plt.xlabel('Tree Canopy Bins')
    plt.ylabel('NO2 Data')
    plt.grid(True)
    plt.show()

# Step 5: Generate scatter plots with regression, histograms, and violin/boxplots for each combination

# Plots 1-4: for every quartile combination draw, in order, the scatter plot
# with regression, the two histograms, and the combined violin/box plot.
_combination_jobs = (
    (top_25_no2_top_25_tree, "Top 25% NO2 Data & Top 25% Tree Canopy"),
    (top_25_no2_bottom_25_tree, "Top 25% NO2 Data & Bottom 25% Tree Canopy"),
    (bottom_25_no2_top_25_tree, "Bottom 25% NO2 Data & Top 25% Tree Canopy"),
    (bottom_25_no2_bottom_25_tree, "Bottom 25% NO2 Data & Bottom 25% Tree Canopy"),
)
for _subset, _plot_title in _combination_jobs:
    plot_scatter_with_regression(_subset, _plot_title)
    plot_histograms(_subset, _plot_title)
    plot_violin_boxplot(_subset, _plot_title)


In [None]:
import os
import geopandas as gpd

# Define the output folder (same as before)
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Ensure the output folder exists. exist_ok=True makes this a single call with
# no gap between checking for the folder and creating it.
os.makedirs(output_folder, exist_ok=True)

# Function to create an interactive heat map using .explore() and save it as an HTML file
def plot_interactive_heat_map_explore(gdf, column, title, filename, cmap='YlOrRd', basemap='CartoDB positron'):
    """Render ``gdf`` as an interactive map coloured by ``column`` and save it as HTML.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Features to draw; must contain ``NO2_Data``, ``frac_area_tb`` and
        ``column``, and must have a CRS set.
    column : str
        Column used to colour the features.
    title : str
        Passed through to ``GeoDataFrame.explore`` and used in messages.
    filename : str
        Name of the HTML file, written inside the module-level ``output_folder``.
    cmap : str, optional
        Colormap name handed to ``explore``.
    basemap : str, optional
        Tile layer handed to ``explore`` as ``tiles``.

    Returns
    -------
    None
        An empty ``gdf`` prints a notice and writes no file.

    Raises
    ------
    ValueError
        If ``gdf`` has no CRS, since it cannot then be reprojected.
    """
    # A quartile combination may select no rows at all; skip it instead of
    # failing inside explore().
    if gdf.empty:
        print(f"No data available for {title}. Skipping map.")
        return

    if gdf.crs is None:
        raise ValueError(f"Cannot map {title}: the GeoDataFrame has no CRS defined.")

    # Ensure the GeoDataFrame is in WGS84 projection (EPSG:4326) for folium compatibility
    if gdf.crs.to_string() != 'EPSG:4326':
        gdf = gdf.to_crs(epsg=4326)

    # Show NO2, tree canopy and the colouring column, listing each field once
    # even when `column` is already one of the first two.
    tooltip_fields = list(dict.fromkeys(['NO2_Data', 'frac_area_tb', column]))

    # Use GeoPandas' explore method for creating an interactive map
    # NOTE(review): `title` is forwarded to explore() as an extra keyword;
    # confirm it has the intended effect with the installed geopandas/folium.
    m = gdf.explore(
        column=column,  # Data column to visualize
        cmap=cmap,  # Colormap for the visualization
        legend=True,  # Show legend
        tooltip=tooltip_fields,
        title=title,
        tiles=basemap  # Basemap tile layer
    )

    # Save the interactive map as an HTML file
    output_path = os.path.join(output_folder, filename)
    m.save(output_path)
    print(f"Map saved to {output_path}")

# Step 1: Define the combinations based on the quartiles

def _quartile_subset(no2_label, tree_label):
    """Return the rows whose NO2 and tree-cover quartile labels both match."""
    frame = processed_buffers_gdf_copy_equal_quartiles
    keep = (frame['NO2_Binned_Quantile'] == no2_label) & (frame['Tree_Cover_Binned_Quantile'] == tree_label)
    return frame[keep]


# Q4 is the top 25%, Q1 the bottom 25%.
top_25_no2_top_25_tree = _quartile_subset('Q4', 'Q4')        # Top NO2, top tree canopy
top_25_no2_bottom_25_tree = _quartile_subset('Q4', 'Q1')     # Top NO2, bottom tree canopy
bottom_25_no2_top_25_tree = _quartile_subset('Q1', 'Q4')     # Bottom NO2, top tree canopy
bottom_25_no2_bottom_25_tree = _quartile_subset('Q1', 'Q1')  # Bottom NO2, bottom tree canopy

# Step 2: Create interactive maps for each combination

# Maps 1-4: one interactive NO2 map per quartile combination, all coloured by
# NO2_Data with the red-yellow colormap.
_map_jobs = (
    (top_25_no2_top_25_tree, 'Top 25% NO2 & Top 25% Tree Canopy', 'Top_25_NO2_Top_25_Tree_Heat_Map.html'),
    (top_25_no2_bottom_25_tree, 'Top 25% NO2 & Bottom 25% Tree Canopy', 'Top_25_NO2_Bottom_25_Tree_Heat_Map.html'),
    (bottom_25_no2_top_25_tree, 'Bottom 25% NO2 & Top 25% Tree Canopy', 'Bottom_25_NO2_Top_25_Tree_Heat_Map.html'),
    (bottom_25_no2_bottom_25_tree, 'Bottom 25% NO2 & Bottom 25% Tree Canopy', 'Bottom_25_NO2_Bottom_25_Tree_Heat_Map.html'),
)
for _subset, _map_title, _html_name in _map_jobs:
    plot_interactive_heat_map_explore(
        _subset,
        'NO2_Data',
        _map_title,
        _html_name,
        cmap='YlOrRd'
    )


### Sensitivity analysis with drive days 

In [None]:
# Report the span of the 'DriveDays' column (smallest and largest value)
_drive_days = processed_buffers_gdf['DriveDays']
drive_days_min, drive_days_max = _drive_days.min(), _drive_days.max()

print(f"DriveDays range from {drive_days_min} to {drive_days_max}")


In [None]:
# Work on a copy so the original GeoDataFrame is left untouched
processed_buffers_gdf_copy = processed_buffers_gdf.copy()

# Preview the first rows of the copy as a quick sanity check
_copy_preview = processed_buffers_gdf_copy.head()
print(_copy_preview)

In [None]:
import pandas as pd

# Split DriveDays into four quantile bins with pd.qcut();
# Q1 is the lowest quarter and Q4 the top 25% of DriveDays.
_drive_day_labels = ['Q1', 'Q2', 'Q3', 'Q4']
_drive_day_bins = pd.qcut(processed_buffers_gdf_copy['DriveDays'], q=4, labels=_drive_day_labels)
processed_buffers_gdf_copy['DriveDays_Binned_Quantiles'] = _drive_day_bins

# Preview the raw values next to their assigned bin
_binning_preview = processed_buffers_gdf_copy[['DriveDays', 'DriveDays_Binned_Quantiles']].head()
print(_binning_preview)

In [None]:
# Select the rows that fall in the top 25% (Q4) of DriveDays
_in_top_quartile = processed_buffers_gdf_copy['DriveDays_Binned_Quantiles'] == 'Q4'
top_25_drive_days_gdf = processed_buffers_gdf_copy[_in_top_quartile]

# Report how many records were selected
print(f"Number of records in the top 25% (Q4) of DriveDays: {len(top_25_drive_days_gdf)}")

# Preview the columns used by the plots below
_top_preview = top_25_drive_days_gdf[['DriveDays', 'frac_area_tb', 'NO2_Data']].head()
print(_top_preview)


In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np

# Function to plot scatter with regression for NO2_DATA
def plot_scatter_with_regression_NO2(df, title):
    """Scatter-plot ``NO2_Data`` against ``frac_area_tb`` with a linear trend line.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Table containing the columns ``frac_area_tb`` (x-axis) and
        ``NO2_Data`` (y-axis).
    title : str
        Text used in the figure title and in the printed messages.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; the slope, intercept and
        Pearson correlation are printed. An empty ``df`` prints a notice and
        draws nothing.

    Notes
    -----
    Rows with a missing value in either column are excluded from the
    regression, because ``LinearRegression.fit`` raises on NaN input.
    """
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    # Scatter plot with regression line and correlation
    plt.figure(figsize=(10, 6))
    plt.scatter(df['frac_area_tb'], df['NO2_Data'], alpha=0.5)

    # Prepare data for regression: keep only rows where both values are present.
    complete = df[['frac_area_tb', 'NO2_Data']].dropna()
    X = complete['frac_area_tb'].values.reshape(-1, 1)
    y = complete['NO2_Data'].values

    # Perform linear regression (needs at least two complete observations)
    has_trend_line = len(complete) > 1
    if has_trend_line:
        model = LinearRegression()
        model.fit(X, y)
        slope = model.coef_[0]
        intercept = model.intercept_
        r_squared = model.score(X, y)
        # Pearson r computed directly rather than as +/-sqrt(R^2): the square
        # root form reports a perfect correlation for a constant response and
        # can yield nan when R^2 rounds to a tiny negative number.
        correlation = np.corrcoef(X[:, 0], y)[0, 1]

        # Plot the regression line
        trend_line = model.predict(X)
        plt.plot(X[:, 0], trend_line, 'r--', label=f'Regression Line (R² = {r_squared:.2f})')

        # Print the regression coefficient and intercept
        print(f"{title} - Slope: {slope:.4f}, Intercept: {intercept:.4f}, Correlation Coefficient: {correlation:.4f}")
    else:
        print(f"Insufficient data for regression for {title}. Skipping regression.")

    plt.title(f"{title} - Scatter Plot with Regression")
    plt.xlabel('Fraction of Tree Coverage (%)')
    plt.ylabel('NO2 Data')
    plt.grid(True)
    # Only the trend line carries a label; calling legend() without any
    # labelled artist just emits a matplotlib warning.
    if has_trend_line:
        plt.legend()
    plt.show()

# Scatter plot with regression for the top 25% DriveDays subset
_top_drive_days_title = "Top 25% DriveDays (Q4) - Scatter Plot of NO2 Data vs Tree Coverage"
plot_scatter_with_regression_NO2(top_25_drive_days_gdf, _top_drive_days_title)

In [None]:
import matplotlib.pyplot as plt

# Subset used for the "after filtering" histogram: DriveDays >= 5
filtered_gdf = processed_buffers_gdf_copy[processed_buffers_gdf_copy['DriveDays'] >= 5]

# One histogram of DriveDays before filtering (blue) and one after (green)
_drive_day_panels = (
    (processed_buffers_gdf_copy['DriveDays'], 'blue', 'DriveDays Distribution (Before Filtering)'),
    (filtered_gdf['DriveDays'], 'green', 'DriveDays Distribution (After Filtering: DriveDays >= 5)'),
)
for _values, _bar_colour, _heading in _drive_day_panels:
    plt.figure(figsize=(10, 6))
    plt.hist(_values, bins=20, alpha=0.7, color=_bar_colour, edgecolor='black')
    plt.title(_heading)
    plt.xlabel('DriveDays')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

In [None]:
# Distribution of NO2_Data within the top 25% of DriveDays
_hist_style = dict(bins=20, alpha=0.7, color='blue', edgecolor='black')
_top_quartile_no2 = top_25_drive_days_gdf['NO2_Data']

plt.figure(figsize=(10, 6))
plt.hist(_top_quartile_no2, **_hist_style)
plt.title('NO2_Data Distribution (Top 25% DriveDays)')
plt.xlabel('NO2_Data')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
import seaborn as sns

# Combined violin and box plot of NO2_Data for the top 25% of DriveDays
_top_quartile_no2 = top_25_drive_days_gdf['NO2_Data']

plt.figure(figsize=(10, 6))

# Violin layer first, then the narrow box plot on top of it
sns.violinplot(y=_top_quartile_no2, inner=None, color='lightgray', alpha=0.6)
sns.boxplot(y=_top_quartile_no2, width=0.2, boxprops={'alpha': 0.6})

plt.title('NO2_Data Violin and Boxplot (Top 25% DriveDays)')
plt.ylabel('NO2_Data')
plt.grid(True)
plt.show()


### Value-based binning

In [None]:
# Take a second, independent copy for the value-based binning below
processed_buffers_gdf_copy_extra = processed_buffers_gdf_copy.copy()

# Preview the first rows of the new copy as a quick sanity check
_extra_preview = processed_buffers_gdf_copy_extra.head()
print(_extra_preview)

### Value-based binning

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Step 1: Cut frac_area_tb (tree cover) into four fixed-width bins.
# The lowest edge sits just below 0 so that a value of exactly 0 lands in the first bin.
tree_bins = [-0.001, 0.2500, 0.5000, 0.7500, 1.0000]
tree_labels = [
    'Low Tree Cover (<= 25%)',
    'Medium-Low Tree Cover (25-50%)',
    'Medium-High Tree Cover (50-75%)',
    'High Tree Cover (> 75%)',
]
processed_buffers_gdf_copy_extra['Tree_Cover_Binned'] = pd.cut(
    processed_buffers_gdf_copy_extra['frac_area_tb'],
    bins=tree_bins,
    labels=tree_labels,
    include_lowest=True,
)

In [None]:
# Step 2: Split NO2_Data into four quantile bins so each bin holds a similar
# number of records, then store the result in a new column.
_no2_labels = ['Low NO2', 'Medium-Low NO2', 'Medium-High NO2', 'High NO2']
no2_bins = pd.qcut(processed_buffers_gdf_copy_extra['NO2_Data'], q=4, labels=_no2_labels)

processed_buffers_gdf_copy_extra['NO2_Binned'] = no2_bins


In [None]:
# Preview the first rows, now including the two binned columns
_binned_preview = processed_buffers_gdf_copy_extra.head()
print(_binned_preview)

In [None]:
# Categorical columns cannot be concatenated directly, so turn both into
# plain strings first and join them as "<NO2 bin> / <tree-cover bin>".
_no2_text = processed_buffers_gdf_copy_extra['NO2_Binned'].astype(str)
_tree_text = processed_buffers_gdf_copy_extra['Tree_Cover_Binned'].astype(str)
processed_buffers_gdf_copy_extra['NO2_Tree_Combination'] = _no2_text + ' / ' + _tree_text

# Debugging: show the source values next to their bins and combined label
_combination_columns = ['frac_area_tb', 'NO2_Data', 'Tree_Cover_Binned', 'NO2_Binned', 'NO2_Tree_Combination']
print(processed_buffers_gdf_copy_extra[_combination_columns].head())




In [None]:

# Step 4: Tally how many records fall into each NO2 / tree-cover combination
_combination_labels = processed_buffers_gdf_copy_extra['NO2_Tree_Combination']
combination_counts = _combination_labels.value_counts()
print(combination_counts)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np

# Function to add a regression line and display correlation statistics
def plot_scatter_with_regression(df, title, x_col='frac_area_tb', y_col='NO2_Data'):
    """Scatter-plot ``y_col`` against ``x_col``, coloured by combination, with a trend line.

    Parameters
    ----------
    df : pandas.DataFrame or geopandas.GeoDataFrame
        Table containing ``x_col``, ``y_col`` and the column
        ``NO2_Tree_Combination`` (used as the point hue).
    title : str
        Text used in the figure title and in the printed messages.
    x_col, y_col : str, optional
        Columns plotted on the x- and y-axis and used for the regression.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; the slope, intercept and
        Pearson correlation are printed. An empty ``df`` prints a notice and
        draws nothing.

    Notes
    -----
    Rows with a missing value in ``x_col`` or ``y_col`` are excluded from the
    regression and the correlation: ``LinearRegression.fit`` raises on NaN
    input and ``np.corrcoef`` would otherwise return nan.
    """
    # Same guard as the other plotting helpers in this notebook.
    if df.empty:
        print(f"No data available for {title}. Skipping plot.")
        return

    # Scatter plot with regression line and correlation
    plt.figure(figsize=(16, 8))  # Increase width for the plot
    sns.scatterplot(x=x_col, y=y_col, hue='NO2_Tree_Combination', data=df, palette='coolwarm', alpha=0.6)

    # Prepare data for regression: keep only rows where both values are present.
    complete = df[[x_col, y_col]].dropna()
    X = complete[x_col].values.reshape(-1, 1)
    y = complete[y_col].values

    # Perform linear regression (needs at least two complete observations)
    if len(complete) > 1:
        model = LinearRegression()
        model.fit(X, y)
        slope = model.coef_[0]
        intercept = model.intercept_
        r_squared = model.score(X, y)
        correlation = np.corrcoef(X[:, 0], y)[0, 1]  # Pearson correlation coefficient

        # Plot the regression line
        trend_line = model.predict(X)
        plt.plot(X[:, 0], trend_line, 'r--', label=f'Regression Line (R² = {r_squared:.2f})')

        # Display regression statistics
        print(f"{title} - Slope: {slope:.4f}, Intercept: {intercept:.4f}, Correlation Coefficient: {correlation:.4f}")
    else:
        print(f"Insufficient data for regression for {title}. Skipping regression.")

    plt.title(f"{title} - Scatter Plot with Regression Line")
    plt.xlabel('Fraction of Tree Cover (frac_area_tb)')
    plt.ylabel('NO2 Data')
    plt.grid(True)
    # Legend is placed outside the axes so it does not cover the points.
    plt.legend(title='NO2 / Tree Cover Combination', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# Draw the combination-coloured scatter plot for the value-binned dataset
_combination_scatter_title = "Scatter Plot of Tree Fraction vs NO2 Data by NO2/Tree Cover Combination"
plot_scatter_with_regression(processed_buffers_gdf_copy_extra, _combination_scatter_title)


In [None]:
import matplotlib.pyplot as plt
import os
import geopandas as gpd
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Define the output folder
output_folder = r"C:\Users\daphn\Documents\MADE_THESIS\DATA\CLEAN_DATA\NO2_TRY"

# Ensure the output folder exists. exist_ok=True makes this a single call with
# no gap between checking for the folder and creating it.
os.makedirs(output_folder, exist_ok=True)

# Updated function to save the heat map as a PNG with modifications
def plot_heat_map(gdf, column, title, filename, cmap='YlOrRd', dpi=300):
    """Plot ``gdf`` coloured by ``column``, save it as a PNG, then show it.

    The image is written to ``filename`` inside the module-level
    ``output_folder``. ``dpi`` is applied both to the figure and to the saved
    file; ``cmap`` is the colormap passed to ``gdf.plot``.
    """
    # Large 20x20 inch canvas at the requested DPI for a high-resolution export
    figure, map_ax = plt.subplots(figsize=(20, 20), dpi=dpi)

    # Reserve a slim axis on the right of the map for the colorbar
    colorbar_ax = make_axes_locatable(map_ax).append_axes("right", size="5%", pad=0.1)

    # Draw the map, sending its legend (colorbar) to the reserved axis
    gdf.plot(column=column, ax=map_ax, legend=True, cmap=cmap, cax=colorbar_ax)
    map_ax.set_title(title, fontsize=16)

    # Write the PNG before displaying the figure
    png_path = os.path.join(output_folder, filename)
    plt.savefig(png_path, format='png', bbox_inches='tight', dpi=dpi)

    plt.show()

# Save and plot two static heat maps: NO2 concentration with the function's
# default colormap, then tree canopy coverage with a green colormap.
_heat_map_jobs = (
    ('NO2_Data', 'NO2 Concentration Heat Map', 'NO2_Concentration_Heat_Map.png', {}),
    ('frac_area_tb', 'Tree Canopy Coverage Heat Map', 'Tree_Canopy_Coverage_Heat_Map.png', {'cmap': 'Greens'}),
)
for _column, _map_title, _png_name, _extra_options in _heat_map_jobs:
    plot_heat_map(processed_buffers_gdf, _column, _map_title, _png_name, dpi=300, **_extra_options)