# Spatial Data Analysis

## Configuration

In [None]:
# Check free memory available.
# `%system` is an IPython magic, so this line only runs inside IPython/Jupyter.
# It executes the shell command `free -m` and returns its output lines
# (assumes a host where the `free` utility is installed -- TODO confirm).
%system free -m

In [None]:
# Import the necessary libraries

# Basic python libraries
import os
import numpy as np
from IPython.display import display

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Geospatial data visualization
import geopandas as gpd
import folium
from shapely.ops import unary_union

# Google cloud libraries
from google.cloud import bigquery
from google.cloud import storage

# Pandas and BigQuery
import pandas_gbq as pdg
import pandas as pd

In [None]:
# Current working directory.
# The relative "../..." data and figure paths used in this notebook are
# resolved against this directory.
os.getcwd()

In [None]:
# Data folders (raw -> interim -> processed), relative to the working directory
raw_data, interim_data, processed_data = (
    "../data/raw/",
    "../data/interim/",
    "../data/processed/",
)

# Folders for generated figures and for configuration files
figures, config = "../reports/figures/", "../config/"

## Data Loading

### neet_chd

In [None]:
# File name of the neet_chd table inside the processed-data folder
csv_filename = "neet_chd.csv"

# Build the full path from the processed-data folder and the file name
csv_filepath = os.path.join(processed_data, csv_filename)

# Load the CSV file into a DataFrame.
# Later cells read these columns from it: person_id, home_lsoa_in_bradford,
# ward_code, ward_name, geometry (WKT text), latitude, longitude,
# ever_neet_status and persistent_neet_status.
neet_chd_df = pd.read_csv(csv_filepath)

# Display the loaded DataFrame (last expression of the cell)
neet_chd_df

### other_covariates

In [None]:
# File name of the other_covariates table inside the processed-data folder
csv_filename = "other_covariates.csv"

# Build the full path from the processed-data folder and the file name
csv_filepath = os.path.join(processed_data, csv_filename)

# Load the CSV file into a DataFrame.
# Later cells read these columns from it: person_id, academic_year, neet_count.
other_covariates_df = pd.read_csv(csv_filepath)

# Display the loaded DataFrame (last expression of the cell)
other_covariates_df

### Data Filtering

In [None]:
# Keep only the rows whose 'home_lsoa_in_bradford' flag is set
bradford_mask = neet_chd_df['home_lsoa_in_bradford']
neet_chd_filtered = neet_chd_df[bradford_mask]

### DataFrames to GeoDataFrames

In [None]:
# Convert the filtered rows to a GeoDataFrame: the WKT text in 'geometry' is
# parsed into geometry objects and used as the active geometry column.
# The CRS is given as the authority string 'EPSG:4326'; the older
# {'init': 'epsg:4326'} dict form is deprecated by pyproj (FutureWarning) and
# differs from the 'EPSG:4326' spelling used by the other cells of this notebook.
neet_crs = 'EPSG:4326'
neet_geometry = gpd.GeoDataFrame(
    neet_chd_filtered,
    geometry=gpd.GeoSeries.from_wkt(neet_chd_filtered['geometry']),
    crs=neet_crs,
)

### Data Aggregation and Count of individuals by Ward

In [None]:
# Aggregate per ward from `neet_geometry`, the GeoDataFrame built from the
# filtered rows. Its 'geometry' column holds parsed geometry objects, which is
# what unary_union operates on; `neet_chd_filtered` carries that column as WKT
# text read from the CSV, so it is not a reliable input for the geometry union.
# No set_index('ward_code') is applied: the index never reaches the output
# (reset_index only restores 'ward_name'), and rebinding the input frame with a
# new index made the cell fail when run a second time.
neet_geometry_count_gdf = (
    neet_geometry
    .groupby('ward_name')
    .agg(
        ever_neet_status=('ever_neet_status', 'sum'),  # Sum of the ever_neet_status flags
        persistent_neet_status=('persistent_neet_status', 'sum'),  # Sum of the persistent_neet_status flags
        total_persons=('person_id', 'count'),  # Non-null person_id values in the ward
        latitude=('latitude', 'mean'),  # Mean latitude of the ward's rows
        longitude=('longitude', 'mean'),  # Mean longitude of the ward's rows
        geometry=('geometry', lambda geoms: unary_union(geoms)),  # Merge the rows' geometries into one
    )
    .reset_index()
)

# Percentage of each status within the ward, rounded to two decimals
neet_geometry_count_gdf['ever_neet_status_percentage'] = (neet_geometry_count_gdf['ever_neet_status'] / neet_geometry_count_gdf['total_persons'] * 100).round(2)
neet_geometry_count_gdf['persistent_neet_status_percentage'] = (neet_geometry_count_gdf['persistent_neet_status'] / neet_geometry_count_gdf['total_persons'] * 100).round(2)

# Display the result
neet_geometry_count_gdf

### Build Choropleth Map using Percentage

In [None]:
# Wrap the aggregated ward table in a GeoDataFrame with CRS EPSG:4326
neet_geometry_count_gdf = gpd.GeoDataFrame(neet_geometry_count_gdf, geometry='geometry', crs='EPSG:4326')

# Centre the map on the mean of the ward centroids (centroids computed once)
ward_centroids = neet_geometry_count_gdf.geometry.centroid
m = folium.Map(location=[ward_centroids.y.mean(), ward_centroids.x.mean()], zoom_start=11)

# Status columns to map; each one has a matching '<name>_percentage' column
count_columns = ['ever_neet_status', 'persistent_neet_status']

# One colour scale shared by all layers: six evenly spaced bin edges from 0 to the
# largest percentage. The '<name>_percentage' columns must already exist here, so
# they are not recomputed inside the loop below.
max_value = neet_geometry_count_gdf[[f'{column}_percentage' for column in count_columns]].max().max()
color_bins = np.linspace(0, max_value, 6)

# Add one choropleth layer per status column
for count_column in count_columns:
    percentage_column = f'{count_column}_percentage'

    choropleth = folium.Choropleth(
        geo_data=neet_geometry_count_gdf,
        name=percentage_column,
        data=neet_geometry_count_gdf,
        columns=['ward_name', percentage_column],
        key_on='feature.properties.ward_name',  # Join shapes and values on the ward name
        fill_color='BuGn',
        fill_opacity=0.75,
        line_opacity=0.75,
        legend_name=f'{count_column} percentage',
        bins=color_bins,
    ).add_to(m)

    # Tooltip showing the ward name, the raw count and the percentage
    choropleth.geojson.add_child(folium.features.GeoJsonTooltip(['ward_name', count_column, percentage_column], aliases=['Ward Name:', 'Count:', 'Percent:']))

# Add Layer Control to the map so layers can be toggled
folium.LayerControl(collapsed=False).add_to(m)

# Save the map into the configured figures folder
bradford_ward_map = os.path.join(figures, "bradford_ward_map.html")
m.save(bradford_ward_map)

# Display the map
display(m)


## Spatial Analysis by Ward and Academic Year

### Data Filtering

In [None]:
# Start again from the full table and keep only the rows whose
# 'home_lsoa_in_bradford' flag is set
bradford_mask = neet_chd_df['home_lsoa_in_bradford']
neet_chd_filtered = neet_chd_df[bradford_mask]

### Data Merging

In [None]:
# Left-join the per-person ward/geometry columns with the covariates on
# 'person_id' (every filtered person is kept), then drop fully duplicated rows
neet_geom_academic_year_df = (
    neet_chd_filtered[['person_id', 'home_lsoa_in_bradford', 'ward_code', 'ward_name', 'geometry']]
    .merge(
        other_covariates_df[['person_id', 'academic_year', 'neet_count']],
        how='left',
        on='person_id',
    )
    .drop_duplicates()
)

# Display the result
neet_geom_academic_year_df

### ever_neet_status and persistent_neet_status creation

In [None]:
# Derive the two status flags from 'neet_count':
#   ever_neet_status       -> neet_count of at least 1
#   persistent_neet_status -> neet_count of at least 4
neet_geom_academic_year_df['ever_neet_status'] = neet_geom_academic_year_df['neet_count'].ge(1)
neet_geom_academic_year_df['persistent_neet_status'] = neet_geom_academic_year_df['neet_count'].ge(4)

# Display the result
neet_geom_academic_year_df

### Convert DataFrame to GeoDataFrame

In [None]:
# Convert the joined DataFrame to a GeoDataFrame: the WKT text in 'geometry' is
# parsed into geometry objects and used as the active geometry column.
# The CRS is given as the authority string 'EPSG:4326'; the older
# {'init': 'epsg:4326'} dict form is deprecated by pyproj (FutureWarning).
neet_geom_academic_year_gdf = gpd.GeoDataFrame(
    neet_geom_academic_year_df,
    geometry=gpd.GeoSeries.from_wkt(neet_geom_academic_year_df['geometry']),
    crs='EPSG:4326',
)

In [None]:
# Print a summary of neet_geom_academic_year_gdf (columns, non-null counts, dtypes)
neet_geom_academic_year_gdf.info()

### Count of individuals by Ward in Bradford for each academic year

In [None]:
# Move 'ward_code' into the index, group by academic year and ward, and reduce
# each group to its counts plus one merged geometry. The result replaces
# neet_geom_academic_year_gdf with one row per (academic_year, ward_name).
neet_geom_academic_year_gdf = (
    neet_geom_academic_year_gdf
    .set_index('ward_code')
    .groupby(['academic_year', 'ward_name'])
    .agg(
        ever_neet_count=('ever_neet_status', 'sum'),  # Number of True ever_neet_status values
        persistent_neet_count=('persistent_neet_status', 'sum'),  # Number of True persistent_neet_status values
        total_persons=('person_id', 'size'),  # Rows in the group
        geometry=('geometry', lambda geoms: unary_union(geoms)),  # Merge the group's geometries
    )
    .reset_index()
)

# Percentage of each status within the group, rounded to two decimals
neet_geom_academic_year_gdf['ever_neet_status_percentage'] = (
    neet_geom_academic_year_gdf['ever_neet_count']
    .div(neet_geom_academic_year_gdf['total_persons'])
    .mul(100)
    .round(2)
)
neet_geom_academic_year_gdf['persistent_neet_status_percentage'] = (
    neet_geom_academic_year_gdf['persistent_neet_count']
    .div(neet_geom_academic_year_gdf['total_persons'])
    .mul(100)
    .round(2)
)

# Display the result
neet_geom_academic_year_gdf

In [None]:
# File name for the per-ward, per-academic-year aggregate
csv_filename = 'neet_geom_academic_year.csv'

# Build the full path inside the processed-data folder
csv_filepath = os.path.join(processed_data, csv_filename)

# Save the DataFrame to CSV; index=False leaves the row index out of the file
# NOTE(review): the 'geometry' column is written in its text form -- confirm the
# format (presumably WKT) before reloading this file.
neet_geom_academic_year_gdf.to_csv(csv_filepath, index=False)
print(f"DataFrame saved to: {csv_filepath}")

### Build Choropleth Map using Percentage

In [None]:
# Wrap the aggregated ward/year table in a GeoDataFrame with CRS EPSG:4326
neet_geom_academic_year_gdf = gpd.GeoDataFrame(neet_geom_academic_year_gdf, geometry='geometry', crs='EPSG:4326')

# Centre the map on the mean of the row centroids (centroids computed once)
ward_centroids = neet_geom_academic_year_gdf.geometry.centroid
m = folium.Map(location=[ward_centroids.y.mean(), ward_centroids.x.mean()], zoom_start=11)

# Status columns to map (each has a matching '<name>_percentage' column) and the
# human-readable label used for the layer name, legend and tooltip
count_columns = ['ever_neet_status', 'persistent_neet_status']
layer_labels = {
    'ever_neet_status': 'Ever NEET Percentage',
    'persistent_neet_status': 'Persistent NEET Percentage',
}

# One colour scale shared by all layers: six evenly spaced bin edges from 0 to the
# largest percentage across both status columns and all years
max_value = neet_geom_academic_year_gdf[[f'{column}_percentage' for column in count_columns]].max().max()
color_bins = np.linspace(0, max_value, 6)

# For each academic year add one layer per status column
# (ever NEET first, then persistent NEET)
for academic_year, group in neet_geom_academic_year_gdf.groupby('academic_year'):
    for count_column in count_columns:
        percentage_column = f'{count_column}_percentage'
        label = layer_labels[count_column]
        layer_title = f'{label} {academic_year}'

        choropleth = folium.Choropleth(
            geo_data=group,
            name=layer_title,
            data=group,
            columns=['ward_name', percentage_column],
            key_on='feature.properties.ward_name',  # Join shapes and values on the ward name
            fill_color='BuGn',  # Using a ColorBrewer code
            fill_opacity=0.75,
            line_opacity=0.75,
            legend_name=layer_title,
            bins=color_bins,
        ).add_to(m)

        # Tooltip showing the ward name and the percentage
        choropleth.geojson.add_child(folium.features.GeoJsonTooltip(['ward_name', percentage_column], aliases=['Ward Name:', f'{label}:']))

# Add Layer Control to the map so layers can be toggled
folium.LayerControl(collapsed=False).add_to(m)

# Save the map into the configured figures folder
bradford_ward_academic_year_map = os.path.join(figures, "bradford_ward_academic_year_map.html")
m.save(bradford_ward_academic_year_map)

# Display the map
display(m)

## Annual Trajectory - Ever NEET

In [None]:
# Create a figure and axis object
fig, ax = plt.subplots(figsize=(15, 10))

# Mean of the wards' ever NEET percentages per academic year, drawn as a bolder line
average_ever_neet = neet_geom_academic_year_gdf.groupby('academic_year')['ever_neet_status_percentage'].mean()
ax.plot(average_ever_neet.index, average_ever_neet, label='Average Ever NEET', color='b', linestyle='-', alpha=0.7, linewidth=2.5)

# Line styles and colours are cycled independently (7 styles, 12 colours), so the
# style/colour pair only repeats after 84 wards
line_styles = ['-', '--', '-.', ':', (0, (3, 5, 1, 5)), (0, (1, 10)), (0, (3, 1, 1, 1))]
colors = ['g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple', 'brown', 'lime', 'cyan', 'pink']

# Plot the ever NEET percentage trend of every ward
unique_wards = neet_geom_academic_year_gdf['ward_name'].unique()
for i, ward_name in enumerate(unique_wards):
    ward_data = neet_geom_academic_year_gdf[neet_geom_academic_year_gdf['ward_name'] == ward_name]
    ax.plot(ward_data['academic_year'],
            (ward_data['ever_neet_count'] / ward_data['total_persons']) * 100,
            label=ward_name,
            linestyle=line_styles[i % len(line_styles)],
            color=colors[i % len(colors)],
            alpha=0.7)  # Set transparency

# Set labels and title for the plot with increased font sizes
ax.set_xlabel('Academic Year', fontsize=16)
ax.set_ylabel('Ever NEET Percentage', fontsize=16)
ax.set_title('Ever NEET Percentage Trend', fontsize=20)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Ward', fontsize=14)  # Legend outside the axes
ax.grid(True)

# Put one tick on every academic year BEFORE labelling: set_xticklabels assigns
# labels to the existing tick positions in order, so without fixed ticks the
# labels can end up on the wrong positions.
academic_years = list(average_ever_neet.index)
ax.set_xticks(academic_years)
ax.set_xticklabels(academic_years, rotation=45, fontsize=12)

# Set the y-axis range from 0 to 30
ax.set_ylim(0, 30)

# Show plot
plt.tight_layout()
plt.show()


## Annual Trajectory - Persistent NEET

In [None]:
# Create a figure and axis object
fig, ax = plt.subplots(figsize=(15, 10))

# Mean of the wards' persistent NEET percentages per academic year, drawn as a bolder line
average_persistent_neet = neet_geom_academic_year_gdf.groupby('academic_year')['persistent_neet_status_percentage'].mean()
ax.plot(average_persistent_neet.index, average_persistent_neet, label='Average Persistent NEET', color='b', linestyle='-', alpha=0.7, linewidth=2.5)

# Line styles and colours are cycled independently (7 styles, 12 colours), so the
# style/colour pair only repeats after 84 wards
line_styles = ['-', '--', '-.', ':', (0, (3, 5, 1, 5)), (0, (1, 10)), (0, (3, 1, 1, 1))]
colors = ['g', 'r', 'c', 'm', 'y', 'k', 'orange', 'purple', 'brown', 'lime', 'cyan', 'pink']

# Plot the persistent NEET percentage trend of every ward
unique_wards = neet_geom_academic_year_gdf['ward_name'].unique()
for i, ward_name in enumerate(unique_wards):
    ward_data = neet_geom_academic_year_gdf[neet_geom_academic_year_gdf['ward_name'] == ward_name]
    ax.plot(ward_data['academic_year'],
            (ward_data['persistent_neet_count'] / ward_data['total_persons']) * 100,
            label=ward_name,
            linestyle=line_styles[i % len(line_styles)],
            color=colors[i % len(colors)],
            alpha=0.7)  # Set transparency

# Set labels and title for the plot with increased font sizes
ax.set_xlabel('Academic Year', fontsize=16)
ax.set_ylabel('Persistent NEET Percentage', fontsize=16)
ax.set_title('Persistent NEET Percentage Trend', fontsize=20)
ax.legend(loc='upper left', bbox_to_anchor=(1, 1), title='Ward', fontsize=14)  # Legend outside the axes
ax.grid(True)

# Put one tick on every academic year BEFORE labelling: set_xticklabels assigns
# labels to the existing tick positions in order, so without fixed ticks the
# labels can end up on the wrong positions.
academic_years = list(average_persistent_neet.index)
ax.set_xticks(academic_years)
ax.set_xticklabels(academic_years, rotation=45, fontsize=12)

# Set the y-axis range from 0 to 30
ax.set_ylim(0, 30)

# Show plot
plt.tight_layout()
plt.show()
