In [1]:
import sys
import importlib
import subprocess
import gc

def install_and_check_libraries(lib_list):
    """Make sure every library named in ``lib_list`` can be imported.

    Each name is first imported with ``importlib``. If that raises
    ``ImportError`` the same name is handed to ``pip install`` using the
    interpreter that runs this notebook. A failed installation is reported
    on stdout and the loop carries on with the next name.

    Args:
        lib_list: Iterable of names, each used both as the import name and
            as the pip package name.
    """
    for name in lib_list:
        try:
            importlib.import_module(name)
        except ImportError:
            # Not importable yet: attempt an installation instead.
            pip_command = [sys.executable, "-m", "pip", "install", name]
            try:
                print(f">>>> Installing {name}.")
                subprocess.check_call(pip_command)
            except Exception as e:
                print(f"Error installing {name}:", str(e))
        else:
            print(f"'{name}' is already installed.")

# Third-party packages checked before the main imports; each string is used
# both as the import name and as the pip package name.
libraries = ["numpy", "pandas", "matplotlib", "seaborn", "folium","geopandas", "networkx", "deap"]
install_and_check_libraries(libraries)

'numpy' is already installed.
'pandas' is already installed.
'matplotlib' is already installed.
'seaborn' is already installed.
'folium' is already installed.
'geopandas' is already installed.
'networkx' is already installed.
'deap' is already installed.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from deap import base, creator, tools, algorithms
from deap.benchmarks.tools import igd
from math import factorial
import warnings
from collections import defaultdict
import seaborn as sns
import folium
import random
import geopandas as gpd
from geopandas import GeoSeries
from functools import partial
from statistics import mean
from datetime import datetime
import requests
import itertools
from itertools import product
import time
import os
import networkx as nx
from branca.element import Figure

warnings.filterwarnings('ignore')

Let's set up some initial parameters.

In [3]:
# Run-wide switches for the evolutionary search.
nsga3 = False  # NOTE(review): presumably switches the selection scheme to NSGA-III — confirm where it is read
weighted_mutation = False  # NOTE(review): consumer not visible in this section
restricted_mutation = False  # NOTE(review): presumably limits mutation to nearby sites — confirm
restricted_mutation_depth = 10 # nearest x number of sites by travel times
elite_pop = 10  # copied into num_elites in the weightings cell
include_extreme_individual = False
include_original_sites = False  # assigned False again in the weightings cell

In [4]:
# Raw inputs, both decoded as ISO-8859-1 (Latin-1) rather than the UTF-8 default.
activities = pd.read_csv('./Badgernet_Activity_2.csv', encoding='ISO-8859-1')
sites = pd.read_csv('./Sites.csv', encoding='ISO-8859-1')

We'll use a function to turn our financial year into useable dates

In [5]:
def get_fin_year_dates(financial_year):
    """Convert a financial-year label into its first and last calendar dates.

    The financial year runs from 1 April of the first year to 31 March of
    the second year.

    Args:
        financial_year: Label such as ``'21/22'``. Four-digit years
            (``'2021/22'``, ``'2021/2022'``) and surrounding whitespace are
            accepted as well.

    Returns:
        tuple: ``(start_date, end_date)`` as ``pd.Timestamp`` objects.

    Raises:
        ValueError: If the label is not two year parts separated by a single
            ``'/'``, or a part is not made of exactly 2 or 4 digits.
    """
    parts = [part.strip() for part in str(financial_year).split('/')]
    if len(parts) != 2:
        raise ValueError(
            f"Financial year must look like '21/22', got {financial_year!r}"
        )

    def _full_year(part):
        # Two-digit years are taken to be in the 2000s, as before; four-digit
        # years are used as given instead of being prefixed with "20".
        if part.isdigit() and len(part) == 2:
            return int("20" + part)
        if part.isdigit() and len(part) == 4:
            return int(part)
        raise ValueError(
            f"Year part {part!r} of {financial_year!r} must have 2 or 4 digits"
        )

    start_year = _full_year(parts[0])
    end_year = _full_year(parts[1])

    start_date = pd.Timestamp(f"{start_year}-04-01")
    end_date = pd.Timestamp(f"{end_year}-03-31")

    return start_date, end_date

We need to load our data, firstly our travel times which we have in a pre-calculated table

In [6]:
# Read in the pre-calculated travel times, discarding incomplete rows
travel_times = pd.read_csv('./LSOA_Travel_Times.csv')
travel_times = travel_times.dropna()

directory = "./"

# Collect every previously exported "Missing_travel_times*.csv" file.
# The names are sorted because os.listdir() returns entries in arbitrary
# order, while the de-duplication below keeps the FIRST row seen for each
# (home, site) pair: without a fixed order the surviving travel time could
# differ from run to run.
missing_travel_time_frames = []
for filename in sorted(os.listdir(directory)):
    if filename.startswith("Missing_travel_times") and filename.endswith(".csv"):
        # Construct the full file path
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a DataFrame
        current_df = pd.read_csv(file_path)

        print(f"Adding file {filename}")
        missing_travel_time_frames.append(current_df)

# Concatenate once instead of growing a DataFrame inside the loop;
# pd.concat rejects an empty list, so fall back to an empty frame.
if missing_travel_time_frames:
    new_travel_times_df = pd.concat(missing_travel_time_frames, ignore_index=True)
else:
    new_travel_times_df = pd.DataFrame()

# Drop any rows with NaN values that may have appeared in the new DataFrame
new_travel_times_df = new_travel_times_df.dropna()

# Align the exported column names with the main travel-time table
new_travel_times_df = new_travel_times_df.rename(columns={'Travel_Time': 'TT', 'home_LSOA': 'Home_LSOA'})

# Concatenate the existing and new travel times DataFrames
combined_travel_times_df = pd.concat([travel_times, new_travel_times_df], ignore_index=True)

# Drop duplicates in case some entries are in both DataFrames
# (rows from LSOA_Travel_Times.csv come first, so they win)
combined_travel_times_df = combined_travel_times_df.drop_duplicates(subset=['Home_LSOA', 'Site_LSOA'])

# Lookup of travel time keyed by (home LSOA, site LSOA), built column-wise
# rather than row by row with iterrows()
travel_times_dict = dict(
    zip(
        zip(combined_travel_times_df["Home_LSOA"], combined_travel_times_df["Site_LSOA"]),
        combined_travel_times_df["TT"],
    )
)

Adding file Missing_travel_times_20231117_153542.csv
Adding file Missing_travel_times_20231117_163729.csv
Adding file Missing_travel_times_20231120_002542.csv
Adding file Missing_travel_times_20231120_083042.csv
Adding file Missing_travel_times_20231120_083043.csv
Adding file Missing_travel_times_20231120_143912.csv
Adding file Missing_travel_times_20231121_025324.csv
Adding file Missing_travel_times_20231121_075202.csv
Adding file Missing_travel_times_20231121_075206.csv
Adding file Missing_travel_times_20231122_132507.csv
Adding file Missing_travel_times_20231122_151104.csv
Adding file Missing_travel_times_20240116_161859.csv
Adding file Missing_travel_times_20240117_094839.csv
Adding file Missing_travel_times_20240117_102403.csv
Adding file Missing_travel_times_20240117_104236.csv
Adding file Missing_travel_times_20240117_124704.csv
Adding file Missing_travel_times_20240118_111610.csv
Adding file Missing_travel_times_20240130_132413.csv


In [7]:
# Persist the merged travel-time table. The output folder is created first
# because DataFrame.to_csv does not create missing directories.
os.makedirs('./Data_Output/Full Data', exist_ok=True)
combined_travel_times_df.to_csv('./Data_Output/Full Data/combined_travel_times_df.csv', index=False)

Let us load and process our data about our sites

In [8]:
# Keep only the identifying and unit-level columns, turn blank strings into
# NaN and then discard every site that has a missing value in any of them.
sites = (
    sites.loc[:, ['UnitCode', 'LSOA', 'NICU', 'LCU', 'SCBU']]
    .replace('', np.nan)
    .dropna()
)

And our activities data 

In [9]:
# Disabled filter: restrict activity to selected partnership codes.
# #Remove unecessary data and columns
# activities = activities.dropna(subset=['Sustainability_And_Transformation_Partnership'])
# values_to_keep = ['QE1', 'QOP', 'QYG']
# activities = activities[activities['Sustainability_And_Transformation_Partnership'].isin(values_to_keep)]

# Drop rows without a site LSOA, then remove the excluded site(s).
activities = activities.dropna(subset=['SiteLSOA'])
values_to_exclude = ['E01006570']
activities = activities[~activities['SiteLSOA'].isin(values_to_exclude)]

# Disabled filter on care level.
# activities = activities.dropna(subset=['CC_Level'])
# values_to_exclude = ['NICU']
# activities = activities[activities['CC_Level'].isin(values_to_exclude)]

# activities_orig keeps the four working columns before blanks/NaNs are removed
# and before the date column is parsed; activities is cleaned further below.
activities_orig = activities.loc[:, ['Der_Postcode_LSOA_Code','CC_Activity_Date','SiteLSOA', 'CC_Level']]
activities = activities.loc[:, ['Der_Postcode_LSOA_Code','CC_Activity_Date','SiteLSOA', 'CC_Level']]

#Apply data cleansing
activities = activities.replace('', np.nan)
activities = activities.dropna()

# Ensure the date is a date (source values are parsed as day/month/year)
activities['CC_Activity_Date'] = pd.to_datetime(activities['CC_Activity_Date'], format='%d/%m/%Y')
activities_indexed = activities.set_index('Der_Postcode_LSOA_Code')

# time_periods = pd.date_range(start_date, end_date, freq='D')

# Maps a position to a care level. The numbering follows the order in which
# the levels first appear in the data, so it can change if the input rows are
# reordered.
int_to_activity = {i: activity for i, activity in enumerate(activities['CC_Level'].unique())}

# Module-level placeholders, all initialised empty here.
# NOTE(review): presumably refilled from data_prep() results later — confirm.
home_lsoas = []
most_frequent_sites = []
home_activities = []
home_populations = []
all_sites = []

def data_prep(activities, start_date, end_date, site_codes, int_to_activity):
    """Summarise the activity that falls inside a date window, per home LSOA.

    This is the single definition of ``data_prep``. The notebook previously
    defined it twice; the earlier three-argument version was silently
    replaced by this five-argument one, so the effective signature is
    unchanged.

    Args:
        activities: DataFrame with the columns ``Der_Postcode_LSOA_Code``,
            ``CC_Activity_Date``, ``SiteLSOA`` and ``CC_Level``.
        start_date: First date to include (inclusive).
        end_date: Last date to include (inclusive).
        site_codes: Collection of site codes; only its length is used.
        int_to_activity: Mapping from integer position to ``CC_Level`` value.
            It fixes the order and the width of every ``home_activities`` row.

    Returns:
        tuple: ``(filtered_activities, num_homes, num_sites,
        most_frequent_sites, home_lsoas, home_activities, home_populations)``.
        ``filtered_activities`` is indexed by ``Der_Postcode_LSOA_Code``;
        ``home_lsoas`` is sorted and the two per-home lists follow its order.
    """
    # Filtering activities within the date range (both bounds inclusive)
    in_window = (activities['CC_Activity_Date'] >= start_date) & (activities['CC_Activity_Date'] <= end_date)
    filtered_activities = activities.loc[in_window].set_index('Der_Postcode_LSOA_Code')

    # Unique home LSOA codes, sorted so every per-home list shares one order
    home_lsoas = sorted(filtered_activities.index.unique().tolist())
    num_homes = len(home_lsoas)
    num_sites = len(site_codes)

    # Number of activity rows per home
    home_populations_dict = filtered_activities.groupby('Der_Postcode_LSOA_Code').size().to_dict()

    # Activity counts per home and care level: {home: {level: count}}
    home_activities_dict = (
        filtered_activities.groupby('Der_Postcode_LSOA_Code')['CC_Level']
        .value_counts()
        .unstack(fill_value=0)
        .to_dict(orient='index')
    )

    # One count per entry of int_to_activity, in key order. Sizing the row by
    # the levels present in the window (as before) dropped or misaligned
    # levels whenever a level had no activity in the window; a level that is
    # absent now simply counts as 0.
    level_positions = sorted(int_to_activity)
    home_activities = [
        [home_activities_dict[home].get(int_to_activity[i], 0) for i in level_positions]
        for home in home_lsoas
    ]

    # Populations matching the order of home_lsoas
    home_populations = [home_populations_dict.get(home, 0) for home in home_lsoas]

    # For every home, the site with the most activity rows in the window
    site_frequencies = filtered_activities.groupby(['Der_Postcode_LSOA_Code', 'SiteLSOA']).size().reset_index(name='counts')
    most_frequent_sites = site_frequencies.loc[site_frequencies.groupby('Der_Postcode_LSOA_Code')['counts'].idxmax()]

    return filtered_activities, num_homes, num_sites, most_frequent_sites, home_lsoas, home_activities, home_populations


# Attach each activity's unit code by matching its site LSOA against the
# sites table, then expose that code under the name 'SiteCode'.
activities = (
    pd.merge(activities, sites[['LSOA','UnitCode']], left_on='SiteLSOA', right_on='LSOA', how='left')
    .drop('LSOA', axis=1)
    .rename(columns={'UnitCode': 'SiteCode'})
)


# Every distinct site LSOA and every distinct home LSOA
site_codes = sites['LSOA'].unique().tolist()
home_codes =  activities_indexed.index.unique().tolist()

# print (f"filtered_activities row count: {len(filtered_activities)}")
# Earliest and latest activity dates present in the cleaned data
startrange = activities['CC_Activity_Date'].min()
endrange = activities['CC_Activity_Date'].max()

print(f"Data ranges from {startrange} to {endrange}")

Data ranges from 2018-07-06 00:00:00 to 2021-12-31 00:00:00


We also want to look up any travel times that might be missing from our data

In [10]:
class OutOfAPICallsException(Exception):
    """Raised when the API answers 403, which this notebook treats as quota exhausted."""


class NoSuchLocationException(Exception):
    """Raised when the API answers 404, meaning the location could not be found."""


class RateLimitException(Exception):
    """Raised when the API answers 429, meaning too many requests were sent."""


def calculate_travel_time_openrouteservice(api_key, start_coords, end_coords, api_request_no, transport_mode='driving-car', timeout=30):
    """Calculate travel time using the Openrouteservice directions API.

    Args:
        api_key: Key sent in the ``Authorization`` header.
        start_coords: ``(longitude, latitude)`` of the origin.
        end_coords: ``(longitude, latitude)`` of the destination.
        api_request_no: Number used only to label log messages.
        transport_mode: Routing profile placed in the request URL.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The duration, in seconds, of the first segment of the first returned
        route, or ``None`` when the response cannot be parsed or the status
        code is not one of those handled below.

    Raises:
        OutOfAPICallsException: On HTTP 403.
        NoSuchLocationException: On HTTP 404.
        RateLimitException: On HTTP 429.
    """
    url = "https://api.openrouteservice.org/v2/directions/{}/geojson".format(transport_mode)

    # Set up the headers with the API key
    headers = {
        'Authorization': api_key,
        'Content-Type': 'application/json'
    }

    # Request body with the start and end coordinates
    body = {
        'coordinates': [start_coords, end_coords]
    }

    # Make the request; network errors and timeouts propagate to the caller
    response = requests.post(url, headers=headers, json=body, timeout=timeout)

    # Check response
    if response.status_code == 200:
        try:
            directions = response.json()
            # Travel time in seconds is nested in the 'features' list, under the 'properties' dictionary
            duration_seconds = directions['features'][0]['properties']['segments'][0]['duration']
            return duration_seconds
        except (IndexError, KeyError, TypeError, ValueError):
            # ValueError covers a body that is not valid JSON
            print("Error parsing the response.")
            return None
    elif response.status_code == 403:  # Out of API calls
        print(f"API request {api_request_no} failed with status code {response.status_code}")
        raise OutOfAPICallsException("API quota exceeded")
    elif response.status_code == 404:  # Location not found
        print(f"API request {api_request_no} failed with location not found {response.status_code}")
        raise NoSuchLocationException("No location found")
    elif response.status_code == 429:  # Rate limited by the API
        print(f"API request {api_request_no} has been rate-limited with status code {response.status_code}")
        raise RateLimitException("Rate limit exceeded")
    else:
        print(f"API request {api_request_no} failed with status code {response.status_code}")
        return None

# Example usage
# The Openrouteservice key is read from the ORS_API_KEY environment variable
# so that the secret is not stored in the notebook. Set it before starting
# the kernel. NOTE: the key that used to be hard-coded here has been exposed
# and should be revoked and replaced.
api_key = os.environ.get('ORS_API_KEY', '')
if not api_key:
    print("ORS_API_KEY is not set; Openrouteservice requests cannot be authenticated.")
start_coordinates = (8.681495, 49.41461)  # Example coordinates (longitude, latitude)
end_coordinates = (8.687872, 49.420318)  # Example coordinates (longitude, latitude)
transport_mode = 'driving-car'  # Mode of transportation

# Calculate travel time (the fourth argument is the request number used in log messages)
# travel_time_seconds = calculate_travel_time_openrouteservice(api_key, start_coordinates, end_coordinates, 0, transport_mode)
# print(f"Estimated travel time: {travel_time_seconds / 60:.2f} minutes")


In [11]:
LSOA_LL_df = pd.read_csv('./LSOA_to_LL.csv')

# Every (home, site) pairing, with site E01012632 added to the known sites
combination_product = list(product(home_codes, site_codes + ['E01012632']))

# Coordinates of each pairing for which no travel time is known yet
lat_lng_details = list()

coordinate_columns = ['Latitude_1m', 'Longitude_1m']

for home, site in combination_product:
    # Pairs that already have a travel time, and the home code 'M99999999',
    # need no lookup
    if home == 'M99999999' or (home, site) in travel_times_dict:
        continue

    # A pairing is only usable when both LSOAs appear in the lookup table
    home_rows = LSOA_LL_df.loc[LSOA_LL_df['LSOA'] == home, coordinate_columns]
    if home_rows.empty:
        continue

    site_rows = LSOA_LL_df.loc[LSOA_LL_df['LSOA'] == site, coordinate_columns]
    if site_rows.empty:
        continue

    # The first matching row supplies the coordinates
    home_lat_lng = home_rows.iloc[0]
    site_lat_lng = site_rows.iloc[0]

    lat_lng_detail = {
        'home_code': home,
        'home_latitude': home_lat_lng['Latitude_1m'],
        'home_longitude': home_lat_lng['Longitude_1m'],
        'site_code': site,
        'site_latitude': site_lat_lng['Latitude_1m'],
        'site_longitude': site_lat_lng['Longitude_1m']
    }
    lat_lng_details.append(lat_lng_detail)

len(lat_lng_details)

22

In [12]:
# Initialize an empty DataFrame to store home_LSOA, Site_LSOA, and Travel Time
# (placeholder only: the name is rebound from the collected results in the next cell)
missing_travel_times_df = pd.DataFrame(columns=['home_LSOA', 'Site_LSOA', 'Travel_Time'])

# API rate limiting parameters
api_request_count = 0  # running counter compared against api_limit by the fetch loop
api_limit = 2000  # the fetch loop stops once the counter reaches this value
api_per_minute_limit = 40  # Adjust to per-minute limit
delay_between_requests = 60 / api_per_minute_limit  # Delay to adhere to per-minute limit

# (home_code, site_code) pairs for which the API reported "location not found"
not_found_details = []

# Maximum number of attempts made for a single home-site pair
max_retries = 5

# Function to handle API requests and retries
def fetch_travel_time(detail, max_retries):
    """Request the travel time for one home-site pair, retrying on failure.

    Reads the module-level ``api_key``, ``api_request_count``,
    ``transport_mode`` and ``delay_between_requests``, and appends to
    ``not_found_details``.

    Args:
        detail: Dict with ``home_code``, ``site_code`` and the latitude and
            longitude of both ends.
        max_retries: Maximum number of attempts for this pair.

    Returns:
        The travel time in minutes rounded to one decimal place, the string
        ``"quota_exceeded"`` when the API quota is exhausted, or ``None`` when
        the location is unknown or every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            seconds = calculate_travel_time_openrouteservice(
                api_key,
                (detail['home_longitude'], detail['home_latitude']),
                (detail['site_longitude'], detail['site_latitude']),
                api_request_count,
                transport_mode
            )
            if seconds is not None:
                # Seconds to minutes
                return round(seconds / 60, 1)
        except OutOfAPICallsException:
            # No point in retrying once the quota is gone
            print("API quota exceeded. Halting process.")
            return "quota_exceeded"
        except NoSuchLocationException:
            # Record the pair; a retry would not find the location either
            not_found_details.append((detail['home_code'], detail['site_code']))
            print(f"Location not found for {detail['home_code']} - {detail['site_code']}")
            return None
        except RateLimitException:
            # Back off before the retry below
            time.sleep(delay_between_requests)
        except Exception as e:
            # Anything else is logged and retried
            print(f"An unexpected exception occurred: {e}")

        # Reached only when this attempt produced no result
        if attempt < max_retries:
            print(f"Retrying request for {detail['home_code']} to {detail['site_code']}. Attempt {attempt}/{max_retries}")
            time.sleep(delay_between_requests)

    # Every attempt was used up without a result
    return None

# List to collect results
results = []

# Loop through each home-site pair
for detail in lat_lng_details:
    if api_request_count >= api_limit:
        print("Stopped due to API quota being exceeded.")
        break

    # Fetch travel time with retries
    travel_time_minutes = fetch_travel_time(detail, max_retries)

    # Count every pair sent to the API, whether or not it produced a result.
    # Counting successes only left the counter at 0 whenever lookups failed,
    # so the quota guard above and the request number in the log messages
    # never advanced. Retries inside fetch_travel_time are not counted, so
    # this is a lower bound on the number of HTTP requests made.
    api_request_count += 1

    if travel_time_minutes == "quota_exceeded":
        # Break the loop if API quota is exceeded
        break

    if travel_time_minutes is not None:
        # Append successful results
        results.append({
            'home_LSOA': detail['home_code'],
            'Site_LSOA': detail['site_code'],
            'Travel_Time': travel_time_minutes
        })

    print(f"Home: {detail['home_code']} Site: {detail['site_code']} >>> {travel_time_minutes} minutes")

    time.sleep(delay_between_requests)




API request 0 failed with location not found 404
Location not found for E01016196 - E01007251
Home: E01016196 Site: E01007251 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01018377
Home: E01016196 Site: E01018377 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01018480
Home: E01016196 Site: E01018480 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01006512
Home: E01016196 Site: E01006512 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01018616
Home: E01016196 Site: E01018616 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01025488
Home: E01016196 Site: E01025488 >>> None minutes
API request 0 failed with location not found 404
Location not found for E01016196 - E01012457
Home: E01016196 Site: E01012457 >>> None minutes

In [13]:
# Convert results to DataFrame
# (an empty results list gives a DataFrame with no columns at all)
missing_travel_times_df = pd.DataFrame(results)

print(missing_travel_times_df)

def export_travel_times(df):
    """Write ``df`` to a timestamped ``Missing_travel_times_*.csv`` file.

    Nothing is written when ``df`` has no rows. The file is created in the
    current directory, named after the current local time, and its name is
    printed once it has been saved.

    Args:
        df: DataFrame of newly fetched travel times.
    """
    if len(df) == 0:
        return

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_name = f"./Missing_travel_times_{stamp}.csv"
    # Save without the index column
    df.to_csv(log_name, index=False)
    print(f"Exported file {log_name}")
    
# Call the export function (it writes nothing when no new travel times were fetched)
export_travel_times(missing_travel_times_df)

Empty DataFrame
Columns: []
Index: []


Load and organise our geographic information 

In [14]:
# Load the LSOA shape file
lsoas = gpd.read_file('./LSOA_Dec_2011_PWC_in_England_and_Wales/LSOA_Dec_2011_PWC_in_England_and_Wales.shp')

# Make a sites GeoDF: the shapefile rows whose LSOA code is a known site,
# indexed by LSOA code, with the centroid of each geometry in its own column
sites_geo_df = lsoas[lsoas['lsoa11cd'].isin(site_codes)]
sites_geo_df = sites_geo_df.set_index('lsoa11cd')
sites_geo_df['centroid'] = sites_geo_df.geometry.centroid

# Make a homes GeoDF in the same way for the home LSOA codes
homes_geo_df = lsoas[lsoas['lsoa11cd'].isin(home_codes)]
homes_geo_df = homes_geo_df.set_index('lsoa11cd')
homes_geo_df['centroid'] = homes_geo_df.geometry.centroid

# Extract the centroids as a GeoSeries
homes_centroids = GeoSeries(homes_geo_df['centroid'])
sites_centroids = GeoSeries(sites_geo_df['centroid'])

# Set the CRS of the centroids to EPSG:27700
# NOTE(review): this assigns the CRS directly, which assumes the shapefile
# coordinates are already in EPSG:27700 — confirm against the file's .prj
homes_centroids.crs = "EPSG:27700"
sites_centroids.crs = "EPSG:27700"

# Convert the centroids to EPSG:4326 (latitude and longitude)
homes_centroids_ll = homes_centroids.to_crs(epsg=4326)
sites_centroids_ll = sites_centroids.to_crs(epsg=4326)


And we can then plot the assignments on a map

In [15]:
# Site LSOA codes currently in play; starts empty and is read by the plotting
# function below at call time.
current_site_list = []

def plot_assignments_folium(individual):
    """Draw homes, sites and the home-to-site assignments on a folium map.

    Reads the module-level ``lsoas``, ``home_codes``, ``current_site_list``
    and ``home_lsoas``.

    Args:
        individual: Sequence in which position ``i`` holds an index into
            ``current_site_list``, giving the site assigned to
            ``home_lsoas[i]``.

    Returns:
        A branca ``Figure`` (600 x 400) containing the map.
    """

    # Make a homes GeoDF
    homes_geo_df = lsoas[lsoas['lsoa11cd'].isin(home_codes)]
    homes_geo_df = homes_geo_df.set_index('lsoa11cd')
    homes_geo_df['centroid'] = homes_geo_df.geometry.centroid

    # Same again for the sites currently in play
    sites_geo_df = lsoas[lsoas['lsoa11cd'].isin(current_site_list)]
    sites_geo_df = sites_geo_df.set_index('lsoa11cd')
    sites_geo_df['centroid'] = sites_geo_df.geometry.centroid

    # Convert centroids to latitude and longitude
    # NOTE(review): to_crs needs the centroid column to carry a CRS; this
    # relies on it being inherited from the shapefile — confirm
    homes_centroids_ll = homes_geo_df['centroid'].to_crs(epsg=4326)
    sites_centroids_ll = sites_geo_df['centroid'].to_crs(epsg=4326)
    
    # Calculate the mean latitude and longitude
    center_latitude = homes_centroids_ll.apply(lambda p: p.y).mean()
    center_longitude = homes_centroids_ll.apply(lambda p: p.x).mean()

    # Create a map centered at the mean coordinates

    m = folium.Map(location=[center_latitude, center_longitude], zoom_start=7, width='100%', height='100%')

    
    # Add the site locations as blue markers
    for point in sites_centroids_ll:
        folium.Marker([point.y, point.x], icon=folium.Icon(color="blue")).add_to(m)

    # Add the home locations as small red markers
    for point in homes_centroids_ll:
        folium.CircleMarker([point.y, point.x], radius=2, color="red").add_to(m)

    # Plot lines from home to site
    for home_idx, site_idx in enumerate(individual):
        home_code = home_lsoas[home_idx]
        site_code = current_site_list[site_idx]
        
        # Pairs with an end that is missing from the shapefile are skipped
        if home_code in homes_geo_df.index and site_code in sites_geo_df.index:
            home_point = homes_centroids_ll.loc[home_code]
            site_point = sites_centroids_ll.loc[site_code]
            coords = [[home_point.y, home_point.x], [site_point.y, site_point.x]]
            folium.PolyLine(coords, color="grey", weight=1, opacity=0.8).add_to(m)
            
    # Wrap the map in a fixed-size figure
    fig = Figure(width=600, height=400)
    fig.add_child(m)  
         
    return fig


In [16]:
# filtered_activities = pd.DataFrame()
# sorted_sites = {}

# Independent copy of the activity table, indexed by home LSOA, together with
# the sorted list of distinct home LSOA codes it contains
activities_to_rank = activities.copy().set_index('Der_Postcode_LSOA_Code')
distinct_home_codes = activities_to_rank.index.unique().tolist()
home_lsoa_codes = sorted(distinct_home_codes)

def get_sites(home, num_sites=None):
    """Return site LSOA codes ordered by travel time from ``home``, nearest first.

    Reads the module-level ``site_codes`` and ``travel_times_dict``. Sites
    with no recorded travel time from ``home`` sort last.

    Args:
        home: Home LSOA code.
        num_sites: Maximum number of sites to return. ``None`` (the default)
            returns every site. Previously this argument was overwritten and
            therefore ignored.

    Returns:
        list: Site LSOA codes, nearest first.
    """
    # Returns a list of Site_LSOA codes sorted by distance (nearest first)
    sorted_sites = sorted(site_codes, key=lambda site: travel_times_dict.get((home, site), float('inf')))
    if num_sites is None:
        return sorted_sites
    return sorted_sites[:num_sites]

def calculate_percentages(data):
    """Return the percentage of rows at each ``Ranking`` value.

    Rows whose ranking is missing are left out of both the counts and the
    total.

    Args:
        data: DataFrame with a ``Ranking`` column.

    Returns:
        Series of percentages indexed by ranking, in ascending ranking order.
    """
    rankings = data['Ranking']
    ranked_total = rankings.count()  # non-null rankings only
    share = rankings.value_counts().div(ranked_total).mul(100)
    return share.sort_index()

# For every home LSOA, the site codes returned by get_sites (nearest first)
sorted_sites_by_distance = {home: get_sites(home) for home in home_lsoa_codes}

def optimized_determine_site_ranking(data, sorted_sites_by_distance):
    """Add a ``Ranking`` column saying how near each row's site is to its home.

    A ranking of 1 means the row's ``SiteLSOA`` is the first site in the
    home's list, 2 the second, and so on. Rows whose home has no list, or
    whose site is not in the home's list, get ``NaN``.

    The homes are taken from ``sorted_sites_by_distance`` itself rather than
    from a module-level list, and every row is ranked in a single pass with
    dictionary lookups instead of re-scanning the whole frame once per home.
    The column is always ``float64`` so integer and missing rankings no
    longer end up mixed in one column.

    Args:
        data: DataFrame indexed by home LSOA code with a ``SiteLSOA`` column.
            It is modified in place.
        sorted_sites_by_distance: Mapping from home LSOA code to its list of
            site codes ordered nearest first.

    Returns:
        None.
    """
    # home -> {site -> 1-based position}; setdefault keeps the first position
    # if a site were listed twice, matching list.index()
    rank_lookup = {}
    for home, sites_list in sorted_sites_by_distance.items():
        positions = {}
        for position, site in enumerate(sites_list, start=1):
            positions.setdefault(site, position)
        rank_lookup[home] = positions

    no_ranks = {}
    rankings = [
        rank_lookup.get(home, no_ranks).get(site, np.nan)
        for home, site in zip(data.index, data['SiteLSOA'])
    ]
    # Assign positionally (plain array) because the index holds repeated labels
    data['Ranking'] = np.array(rankings, dtype=float)

# Rank every activity row in place, then summarise how often patients were
# seen at their nearest, second nearest, ... site
optimized_determine_site_ranking(activities_to_rank, sorted_sites_by_distance)
percentages = calculate_percentages(activities_to_rank)

percentages 

# 1     65.824697
# 2     14.283368
# 3      6.887122
# 4      3.835668
# 5      2.270525
# 6      2.157204
# 7      0.976367
# 8      0.979651
# 9      0.283302
# 10     0.194616
# 11     0.463138
# 12     0.156843
# 13     0.068978
# 14     0.394981
# 15     0.918885
# 16     0.128923
# 17     0.135492
# 18     0.000821
# 19     0.003285
# 20     0.026277
# 22     0.009854

Ranking
1       63.630942
2       14.230988
3        6.025464
4        4.448060
5        2.736549
6        1.795877
7        1.335572
8        1.078626
9        0.443817
10.0     0.457282
11       0.461404
12       0.460580
13       0.169008
14       0.513893
15       1.259450
16       0.560336
17       0.156366
18       0.127237
19       0.075023
20       0.026382
21       0.003023
22       0.004122
Name: count, dtype: float64

In [17]:
def calculate_cumulative_probabilities(probabilities):
    """Turn a sequence of weights into a running total of their shares.

    Every weight is divided by the sum of all weights and the shares are
    accumulated in order, so for positive weights the last entry is 1 up to
    floating-point rounding.

    Args:
        probabilities: Iterable of weights that can be traversed twice.

    Returns:
        list: Cumulative shares, one per input weight.
    """
    total = sum(probabilities)
    shares = (weight / total for weight in probabilities)
    return list(itertools.accumulate(shares))

# Cumulative share of activity by site ranking (nearest site first)
cumulative_probs = calculate_cumulative_probabilities(percentages)

cumulative_probs

[0.6363094240276569,
 0.7786193042383804,
 0.8388739423285673,
 0.8833545394337284,
 0.9107200272610604,
 0.9286788004034198,
 0.9420345215161767,
 0.9528207777646481,
 0.9572589443484141,
 0.9618317673796132,
 0.9664458117722714,
 0.9710516118926378,
 0.9727416877124619,
 0.977880617441033,
 0.9904751174121781,
 0.9960784744798553,
 0.9976421381245382,
 0.9989145041482432,
 0.9996647329267994,
 0.9999285496401378,
 0.9999587786385411,
 1.0000000000000002]

So a simple nearest assignment has distinct limitations and does not allow us to balance any other competing priorities. 

This kind of problem is a variant of the travelling salesman problem (https://en.wikipedia.org/wiki/Travelling_salesman_problem), which is NP-hard, meaning that it is too computationally expensive to compute all possible solutions and find the best one. It therefore needs to be approached and solved using a heuristic approach.

For this purpose we are going to use a genetic algorithm to enable us to balance competing priorities and come up with a balanced good solution. 

We will need to define the parameters for our genetic algorithm

    * Population Size
    * Chance to cross breed
    * Mutation Probabilities
    * Max number of generations to run for

The base mutation rate probability is adaptive and increased based on the stagnation of both the size and diversity of the pareto front, but we can also set the probability that elements of an individual will be mutated once the individual has been selected for mutation.

Cessation of the process is also controlled by stagnation in the pareto front, once the mutation rate has been increased and yet still no improvements have been realised in both size and diversity of the pareto front for a number of generations then the evolution is stopped 

In [18]:
# Let us set up some of the parameters for the evolution of our solution
# number of solutions in a population
pop_num = 200
# probability (0-1) of cross breeding one solution with another
cross_chance = 0.3
# starting probability (0-1) that a selected individual is mutated
initial_mutation_prob = 0.05
# upper limit (0-1) for the probability that a selected individual is mutated
max_mutation_prob = 0.8
# probability (0-1) that an element of an individual selected for mutation is changed
individual_mutation_prob = 0.2
# NOTE(review): presumably the upper limit for individual_mutation_prob — confirm where it is read
individual_mutation_max_prob = 0.8

Now we will set up and run our evolutionary algorithm. The most important part is the custom evaluation function. Most of the population handling, generations, breeding and mutating is taken care of by the DEAP library, but we need to define our own custom function to assess the fitness of each solution. These scores are then used to find the best individual solutions in each generation, which are bred and mutated in later generations to evolve the population towards a 'good' solution to our problem with competing priorities.

Let's add all our competing priorities to our evaluation function, as per the source paper: https://www.journalslibrary.nihr.ac.uk/hsdr/hsdr06350/#/abstract

We want to:

    * Minimise the average travel time
    * Maximise the proportion within 30 minutes
    * Minimise the maximum distance for any assignment
    * Maximise the number taking place in units with more than x admissions per year
    * Maximise the smallest number of admissions per year  
    * Minimise the largest number of admissions per year 
    * Maximise the proportion within 30 minutes and in units with more than x admissions per year

The fourth and the final of these are different in this approach, as we are not working with admissions data but with critical care information. What we will model here instead is whether a NICU, LNU or SCBU site meets the minimum required number of days set out in the BAPM standards (https://hubble-live-assets.s3.amazonaws.com/bapm/file_asset/file/1494/BAPM_Service_Quality_Standards_FINAL.pdf), and we will treat the proportion of activities taking place in the NICU sites as a general positive, given that these sites are the most specialised.

So we have:

    * Minimise the average travel time
    * Maximise the proportion within 30 minutes
    * Minimise the maximum distance for any assignment
    * Maximise the number taking place in level 3 nicu units
    * Maximise the smallest number of admissions per year  
    * Minimise the largest number of admissions per year 
    * Maximise the proportion within 30 minutes and in level 3 nicu units

We can also adjust the weightings that we give to each of these should we want to.

In [19]:
# Let us set up variables for the weightings
# (these are passed, in this order, as the fitness weights in the next cell)
min_travel_time         = -1.0
max_in_30               = 1.0
min_max_distance        = -1.0
max_large_unit          = 1.0
max_min_no              = 1.0
min_max_no              = -1.0
max_in_30_and_large     = 1.0
# the following are helper metrics to aid the maximisation of activities taking place in larger units
max_large_nicu          = 1.0 
consolidation           = 1.0

# Define the threshold for minimum admissions
nicu_activities_threshold = 1000  # set to 1000 to make the algorithm reach the threshold of over lnu range and incentivise those solutions
lnu_activities_threshold = 1000  
scbu_activities_threshold = 500

# Using this we can provide objectives to our evolutionary process
# must be structured like this {
#     'E01024897': {'NICU': {'min': 0, 'max': 500}}
#     ,'E01005062': {'NICU': {'min': 4000}}
#     }
# can provide both minimums, maximums to any existing site and any activity level
# NOTE(review): the structure described above is a dict, but the default here
# is an empty set — confirm which type the code that reads it expects
activity_limits = set()

# Sites that should not be assigned to any home, for modelling full site closures
restricted_sites = set()

# Do we want to propose a new site, we can add the LSOA of the proposed site and run our process against it
# E01012632 would be blackburn hospital
proposed_additions = list()

# Activity to focus on in the evolutionary assignment
activity_focus = list()

# We can also add an extreme individual to the population this is to ensure that the population space contains 
# the most optimal fitness for one of our evaluation metrics.. in this case the minimisation of travel time
# NOTE(review): the comment above describes the extreme individual, but the
# variable set here is include_original_sites, which was already set to False
# in the initial parameters cell — confirm which switch is intended
include_original_sites = False

# Number of elite individuals to carry to the next generation
num_elites = elite_pop

# normalisation boundaries, these are based on known results, these could need further evaluation
min_avg_time = 10
max_avg_time = 70
min_prop_within_30_mins = 0.1
max_prop_within_30_mins = 0.9
min_min_max_distance = 200
max_min_max_distance = 350
min_number_of_sites_over_nicu_threshold = 0.0
max_number_of_sites_over_nicu_threshold = 0.4
min_smallest_site = 900 
max_smallest_site = 5000
min_largest_site = 6000 
max_largest_site = 13000
min_constraint_adherence = 0 
max_constraint_adherence = 3000
min_prop_within_30_mins_and_large_NICU = 0.05 
max_prop_within_30_mins_and_large_NICU = 0.20
min_max_large_nicu = 1000
max_max_large_nicu = 6000
min_consolidation_metric = 100
max_consolidation_metric = 1300

Let us add these priorities in to our evaluation function algorithm

In [20]:
# Fitness with nine objectives; the weights come from the weighting variables
# defined in the previous cell, in the same order.
creator.create("FitnessMulti", base.Fitness, weights=(min_travel_time
                                                      , max_in_30
                                                      , min_max_distance
                                                      , max_large_unit
                                                      , max_min_no
                                                      , min_max_no
                                                      , max_in_30_and_large
                                                      , max_large_nicu
                                                      , consolidation
                                                      ))
# An individual is a list that carries a FitnessMulti fitness
creator.create("Individual", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()

# Reset to 0 here.
# NOTE(review): presumably set to the real site count before evolution starts — confirm
num_sites = 0

# Reset to an empty list here; the functions below read it at call time
current_site_list = []

def get_nearby_sites(home, mutation_depth_no=restricted_mutation_depth):
    """Return the indices of the sites nearest to `home`, nearest first.

    Indices refer to current_site_list. Sites with no entry in
    travel_times_dict for this home are ranked last (treated as infinitely
    far). At most `mutation_depth_no` indices are returned.
    """
    def travel_time_to(site_idx):
        return travel_times_dict.get((home, current_site_list[site_idx]), float('inf'))

    ranked = list(range(len(current_site_list)))
    ranked.sort(key=travel_time_to)
    return ranked[:mutation_depth_no]

def prepare_site_list(site_codes, proposed_additions, restricted_sites):
    """Return existing plus proposed site codes with restricted ones removed.

    Order is preserved (existing sites first, then proposed additions) and
    duplicates are kept as they are.
    """
    allowed = []
    for code in site_codes + proposed_additions:
        if code in restricted_sites:
            continue
        allowed.append(code)
    return allowed

# Assign a random site to a gene. Drawing from current_site_list lets sites be
# added or removed without changing this function.
def restricted_random_site():
    """Return a uniformly random index into current_site_list."""
    candidate_indices = range(len(current_site_list))
    return random.choice(candidate_indices)

# NOTE(review): membership is tested against current_site_list, which is
# initialised to an empty list earlier in this cell, while the index is looked up
# in site_codes. Unless current_site_list has been repopulated before this line
# runs, the resulting set is empty -- confirm this is intended.
restricted_site_indices = set(
    site_codes.index(code)
    for code in restricted_sites
    if code in current_site_list
)

toolbox.register("random_site", restricted_random_site)

# Draw a rank (0 = nearest site, 1 = second nearest, ...) according to the
# weighted distribution of patients travelling to their nearest x site
def weighted_random_choice(cumulative_probs):
    """Sample an index from a cumulative probability sequence.

    Args:
        cumulative_probs: sequence of cumulative probabilities; the first
            entry that is >= the random draw determines the returned index.

    Returns:
        The sampled index. If the draw exceeds every entry (e.g. the final
        cumulative value is slightly below 1 because of rounding), the last
        index is returned.
    """
    rnd = random.random()
    for i, prob in enumerate(cumulative_probs):
        if rnd <= prob:
            return i
    # Fallback in case of rounding errors. A stray debug print was removed from
    # this path: it ran once per gene and flooded the notebook output.
    return len(cumulative_probs) - 1

def weighted_site(home, nearby_sites, cumulative_probs):
    """Pick a site for `home` by sampling a nearness rank.

    Args:
        home: key into `nearby_sites`.
        nearby_sites: mapping of home -> sequence of candidate site indices.
        cumulative_probs: cumulative probabilities over ranks, passed to
            weighted_random_choice.

    Returns:
        The site index found at the sampled rank in `nearby_sites[home]`.

    Raises:
        KeyError: if `home` is not in `nearby_sites`.
        ValueError: if the home has no candidate sites. Previously this case
            spun forever in the resampling loop below.
    """
    home_sites = nearby_sites[home]
    if len(home_sites) == 0:
        raise ValueError(f"No candidate sites available for home {home!r}")

    weighted_index = weighted_random_choice(cumulative_probs)
    # Resample while the drawn rank is beyond the candidates this home has
    while weighted_index >= len(home_sites):
        weighted_index = weighted_random_choice(cumulative_probs)
    return home_sites[weighted_index]

def create_weighted_individual(cumulative_probs, home_lsoas, nearby_sites):
    """Build an individual by sampling one weighted nearby site per home.

    Genes are produced in the order of `home_lsoas`, one weighted_site draw
    each, and wrapped in creator.Individual.
    """
    genes = [weighted_site(home, nearby_sites, cumulative_probs) for home in home_lsoas]
    return creator.Individual(genes)

# Helper for the extreme individual: closest permitted site to a given home
def nearest_restricted_site(home, restricted_site_indices, passed_sites, travel_times_dict):
    """Return the index in `passed_sites` of the nearest non-restricted site.

    Sites without a travel time for `home` are treated as infinitely far.
    Ties go to the lowest index. Raises ValueError when every site is
    restricted (or `passed_sites` is empty).
    """
    def travel_time_from_home(site_idx):
        return travel_times_dict.get((home, passed_sites[site_idx]), float('inf'))

    permitted = (idx for idx in range(len(passed_sites)) if idx not in restricted_site_indices)
    return min(permitted, key=travel_time_from_home)

def create_individual_based_on_data(most_frequent_sites, home_lsoas, passed_sites, restricted_site_indices, nearby_sites):
    """Build an individual that mirrors the observed home -> site assignments.

    Args:
        most_frequent_sites: DataFrame with columns 'Der_Postcode_LSOA_Code'
            (home) and 'SiteLSOA' (site code used by that home).
        home_lsoas: homes in gene order.
        passed_sites: site codes; genes are indices into this list.
        restricted_site_indices: site indices that must not be assigned.
        nearby_sites: mapping of home -> candidate site indices; its first
            entry is the fallback when the observed site is unknown or restricted.

    Returns:
        creator.Individual with one gene per home.

    NOTE(review): a home whose `nearby_sites` list is empty gets a None gene,
    as before; downstream code indexes current_site_list with genes, so
    confirm that case cannot occur.
    """
    site_code_indices = {code: idx for idx, code in enumerate(passed_sites)}

    def first_nearby(home):
        # First candidate for this home, or None when it has no candidates
        candidates = nearby_sites[home]
        return candidates[0] if candidates else None

    site_index_map = {}
    for _, row in most_frequent_sites.iterrows():
        home_code = row['Der_Postcode_LSOA_Code']
        site_idx = site_code_indices.get(row['SiteLSOA'])

        # Use the observed site unless it is unknown or restricted
        if site_idx is not None and site_idx not in restricted_site_indices:
            site_index_map[home_code] = site_idx
        else:
            site_index_map[home_code] = first_nearby(home_code)

    # Only look up the fallback for homes with no mapped site. The previous
    # `.get(home, <fallback>)` form evaluated the fallback for every home, so a
    # home missing from nearby_sites raised KeyError even when it was mapped.
    individual = [
        site_index_map[home] if home in site_index_map else first_nearby(home)
        for home in home_lsoas
    ]

    return creator.Individual(individual)


# Build the extreme individual: every home is assigned to its nearest permitted site
def create_extreme_individual():
    """Return an individual mapping each home in home_lsoas to its nearest site.

    Uses the module-level restricted_site_indices, current_site_list and
    travel_times_dict via nearest_restricted_site.
    """
    genes = [
        nearest_restricted_site(home, restricted_site_indices, current_site_list, travel_times_dict)
        for home in home_lsoas
    ]
    return creator.Individual(genes)

# Share of the initial population created by weighted nearby-site sampling
proportion_weighted = 0.1

def init_population(n, prop_weighted):
    """Create the initial population.

    The population is assembled in this order: optional seeded individuals
    (extreme individual, then the data-based individual, each behind its
    flag), `int(n * prop_weighted)` weighted individuals, then random
    individuals from toolbox.individual() to fill the remainder.

    Args:
        n: target population size.
        prop_weighted: fraction of `n` to create with weighted sampling.

    Returns:
        list of individuals.
    """
    num_weighted = int(n * prop_weighted)

    seeded = []
    # Add the extreme individual if flagged to
    if include_extreme_individual:
        seeded.append(create_extreme_individual())
    # Add the individual based on the actual data if flagged to
    if include_original_sites:
        seeded.append(create_individual_based_on_data(most_frequent_sites, home_lsoas, current_site_list, restricted_site_indices, nearby_sites))

    # Whatever is left after the weighted and seeded individuals is filled randomly
    num_random = n - num_weighted - len(seeded)

    weighted = [
        create_weighted_individual(cumulative_probs, home_lsoas, nearby_sites)
        for _ in range(num_weighted)
    ]
    randoms = [toolbox.individual() for _ in range(num_random)]
    print(f"        Added {num_weighted} weighted individuals and {num_random} random individuals")

    return seeded + weighted + randoms

toolbox.register("population", init_population, prop_weighted = proportion_weighted)

def create_logs_df():
    """Return an empty, typed DataFrame for per-evaluation log rows.

    'individual' is a string column; every other column is float64.
    """
    float_columns = [
        'avg_time',
        'prop_within_30_mins',
        'max_distance',
        'units_over_x',
        'smallest_site',
        'largest_site',
        'max_in_30_and_large',
        'totals',
        'large_nicu',
        'consolidation_stdev',
    ]
    schema = {'individual': 'str'}
    schema.update({name: 'float64' for name in float_columns})

    # Create the empty frame first, then cast each column to its declared type
    empty = pd.DataFrame(columns=list(schema))
    return empty.astype(schema)

# Empty per-(site, home, activity type) log frame
inner_log_df = pd.DataFrame(columns=['site', 'home', 'activity_type', 'activity_counts'])

# Empty per-generation, per-site activity log frame
activity_log_df = pd.DataFrame(
    columns=[
        'Generation',
        'Site',
        'HDU',
        'SCBU',
        'NICU',
    ]
)

def calculate_activity_counts(individual):
    """Aggregate each home's activity counts onto its assigned site.

    Args:
        individual: sequence of site indices (into current_site_list), one
            per home, in the same order as home_activities.

    Returns:
        defaultdict mapping site code -> [count, count, count]. Other
        functions in this cell read the three slots as HDU, SCBU, NICU.
    """
    totals_by_site = defaultdict(lambda: [0, 0, 0])
    for home_idx, site_idx in enumerate(individual):
        site = current_site_list[site_idx]
        # Add this home's per-activity counts to the site it is assigned to
        for slot, count in enumerate(home_activities[home_idx]):
            totals_by_site[site][slot] += count
    return totals_by_site

# The next functions let us apply a penalty objective to the evolution, which
# enables us to evaluate different proposed scenarios
def is_feasible(individual):
    """Return True when every limited site stays within its activity limits.

    activity_limits maps site -> {activity: {'min': ..., 'max': ...}}. A
    missing 'min' defaults to 0 and a missing 'max' to infinity. Sites or
    activities without limits are not checked.
    """
    for site, counts in calculate_activity_counts(individual).items():
        if site not in activity_limits:
            continue
        site_limits = activity_limits[site]
        for i, activity in enumerate(['HDU', 'SCBU', 'NICU']):
            limits = site_limits.get(activity)
            if not limits:
                continue
            count = counts[i]
            if count < limits.get('min', 0) or count > limits.get('max', float('inf')):
                return False
    return True

def distance_to_feasibility(individual):
    """Return the total amount by which activity limits are violated.

    For every limited site and activity, adds the excess above 'max' and the
    shortfall below 'min'. Returns 0 when nothing is violated.
    """
    total_violation = 0
    for site, counts in calculate_activity_counts(individual).items():
        if site not in activity_limits:
            continue
        for i, activity in enumerate(['HDU', 'SCBU', 'NICU']):
            limits = activity_limits[site].get(activity)
            if not limits:
                continue
            over = max(0, counts[i] - limits.get('max', float('inf')))
            under = max(0, limits.get('min', 0) - counts[i])
            total_violation += over + under
    return total_violation

base_penalty = 0.1  # Penalty applied at distance 0
penalty_factor = 1.1  # Growth factor per unit of distance to feasibility

def exponential_penalty(individual):
    """Return one penalty term per objective for an infeasible individual.

    The magnitude is base_penalty * penalty_factor ** distance_to_feasibility.
    The sign is chosen so the penalty always worsens the objective: negative
    for objectives with a positive weight (maximised), positive otherwise.
    """
    distance = distance_to_feasibility(individual)
    penalty_value = base_penalty * (penalty_factor ** distance)
    return tuple(
        -penalty_value if weight > 0 else penalty_value
        for weight in creator.FitnessMulti.weights
    )

# Normalization so that no objective dominates the evolutionary process simply due to its scale
def normalize(raw_value, min_value, max_value):
    """Rescale `raw_value` linearly so min_value -> 0 and max_value -> 1.

    Values outside [min_value, max_value] are not clamped.
    """
    span = max_value - min_value
    offset = raw_value - min_value
    return offset / span


def eval_func(individual, activity_focus=None):
    """Evaluate one individual and return its nine normalised objectives.

    Args:
        individual: sequence of site indices (into current_site_list), one
            per home in home_lsoas order.
        activity_focus: accepted for interface compatibility; not used here.

    Returns:
        list of nine normalised values, in this order: average travel time,
        proportion of population within 30 minutes, maximum travel time,
        proportion of NICU activity at sites over the NICU threshold,
        smallest site total, largest site total, proportion of population
        within 30 minutes of a site over the NICU threshold, largest NICU
        count at any site, standard deviation of NICU counts across sites.
        A list of zeros is returned when no activity is assigned to any site.

    Side effects:
        Appends one row of raw (un-normalised) metrics to the global logs_df.

    Home/site pairs missing from travel_times_dict are left out of the
    travel-time and population totals.
    """
    global logs_df

    min_max_values = [
        (min_avg_time, max_avg_time),
        (min_prop_within_30_mins, max_prop_within_30_mins),
        (min_min_max_distance, max_min_max_distance),
        (min_number_of_sites_over_nicu_threshold, max_number_of_sites_over_nicu_threshold),
        (min_smallest_site, max_smallest_site),
        (min_largest_site, max_largest_site),
        (min_prop_within_30_mins_and_large_NICU, max_prop_within_30_mins_and_large_NICU),
        (min_max_large_nicu, max_max_large_nicu),
        (min_consolidation_metric, max_consolidation_metric),
    ]

    # Calculate activity counts for each site
    activity_counts = calculate_activity_counts(individual)

    # Guard first: previously this check came after min()/max() on the site
    # totals, so empty input raised ValueError before the guard could run.
    if not activity_counts:
        return [0] * len(min_max_values)

    NICU_INDEX = 2
    nicu_activities = [counts[NICU_INDEX] for counts in activity_counts.values()]
    # Sites whose NICU activity meets or exceeds the threshold (set for O(1) lookups)
    nicu_sites = {
        site for site, counts in activity_counts.items()
        if counts[NICU_INDEX] >= nicu_activities_threshold
    }

    # Single pass over the home-site pairs for all travel-time based metrics
    total_time = 0
    total_population = 0
    within_30_mins = 0
    within_30_mins_and_large_NICU = 0
    travel_times = []
    for home_idx, site_idx in enumerate(individual):
        site = current_site_list[site_idx]
        key = (home_lsoas[home_idx], site)
        if key not in travel_times_dict:
            continue  # unknown pairs contribute to none of the totals

        travel_time = travel_times_dict[key]
        population = home_populations[home_idx]
        travel_times.append(travel_time)
        total_time += travel_time * population
        total_population += population

        if travel_time <= 30:
            within_30_mins += population
            if site in nicu_sites:
                within_30_mins_and_large_NICU += population

    avg_time = total_time / total_population if total_population else 0
    prop_within_30_mins = within_30_mins / total_population if total_population else 0
    max_distance = max(travel_times, default=0)  # default=0 handles empty list
    prop_within_30_mins_and_large_NICU = (within_30_mins_and_large_NICU / total_population
                                          if total_population != 0 else 0)

    # Site size metrics (all activity types summed per site)
    site_totals = [sum(counts) for counts in activity_counts.values()]
    smallest_site = min(site_totals)
    largest_site = max(site_totals)

    # NICU concentration metrics
    large_nicu_count = max(nicu_activities)
    total_nicu_activities = sum(nicu_activities)
    over_threshold_nicu_activities = sum(
        count for count in nicu_activities if count >= nicu_activities_threshold
    )
    proportion_over_threshold_nicu_activities = (over_threshold_nicu_activities / total_nicu_activities
                                                 if total_nicu_activities != 0 else 0)

    # Consolidation metric: spread of NICU activity across sites; higher
    # values indicate more consolidation
    consolidation_score_nicu = np.std(nicu_activities) if nicu_activities else 0

    # NOTE(review): `individual.index` is the bound list.index method, not the
    # genes; kept unchanged so the logged column is unaffected -- confirm
    # whether str(individual) was intended.
    new_row = pd.DataFrame([{
        'individual': individual.index,
        'avg_time': avg_time,
        'prop_within_30_mins': prop_within_30_mins,
        'max_distance': max_distance,
        'units_over_x': proportion_over_threshold_nicu_activities,
        'smallest_site': smallest_site,
        'largest_site': largest_site,
        'totals': total_population,
        'activity_counts': activity_counts,
        'large_nicu': large_nicu_count,
        'consolidation_stdev': consolidation_score_nicu
    }])

    # NOTE(review): concatenating per evaluation copies the whole log each
    # time; collecting rows in a list and building the frame once would scale
    # better, but would change how logs_df is consumed elsewhere.
    logs_df = pd.concat([logs_df, new_row], ignore_index=True)

    # Raw objective values, in the same order as min_max_values
    raw_objectives = [
        avg_time,
        prop_within_30_mins,
        max_distance,
        proportion_over_threshold_nicu_activities,
        smallest_site,
        largest_site,
        prop_within_30_mins_and_large_NICU,
        large_nicu_count,
        consolidation_score_nicu
    ]

    # Normalize objectives so that no objective dominates purely through scale
    normalized_objectives = [
        normalize(raw, min_val, max_val)
        for raw, (min_val, max_val) in zip(raw_objectives, min_max_values)
    ]

    return normalized_objectives

# Random mutation function
def restricted_mutUniformInt(individual, low, up, indpb, objective_stagnation_threshold, activity_limits):
    """Mutate genes to uniformly random sites, in place.

    Each gene is replaced with restricted_random_site() with probability
    `indpb`. When the global generations_since_improvement has reached
    `objective_stagnation_threshold` and `activity_limits` is non-empty,
    forceful_mutation is applied as well. `low` and `up` are accepted but
    not used.

    Returns:
        One-element tuple containing the individual.
    """
    for gene_pos in range(len(individual)):
        if random.random() < indpb:
            individual[gene_pos] = restricted_random_site()

    stagnated = generations_since_improvement >= objective_stagnation_threshold
    if stagnated and activity_limits:
        individual = forceful_mutation(individual, activity_limits, nearby_sites, cumulative_probs, home_lsoas)
    return (individual,)

# Alternative mutation that limits the choice of site to those listed as nearby
# for the home rather than any site. This should reflect the more realistic
# scenario where travel is limited to nearer sites.
def restricted_mutNearbyInt(individual, indpb, nearby_passed_sites):
    """Mutate genes to a random nearby site, in place.

    Each gene is mutated with probability `indpb`. The new site is drawn
    uniformly from `nearby_passed_sites[home]` when the home is present
    there, otherwise from all sites via restricted_random_site().

    Returns:
        One-element tuple containing the individual.
    """
    for gene_pos in range(len(individual)):
        if not (random.random() < indpb):
            continue
        home = home_lsoas[gene_pos]
        if home in nearby_passed_sites:
            # Choose from the nearby sites for this home
            individual[gene_pos] = random.choice(nearby_passed_sites[home])
        else:
            # Fallback to any site if nearby info is not available
            individual[gene_pos] = restricted_random_site()
    return (individual,)

# Another alternative mutation: push homes towards or away from sites that have
# NICU activity objectives, using the weighted nearby-site distribution.

nearby_sites = {}

def forceful_mutation(individual, objective_sites, nearby_sites, cumulative_probs, home_lsoas, maximization_percentage=1, minimization_percentage=1):
    """Re-draw genes to help sites meet their NICU 'min' / 'max' objectives.

    Args:
        individual: sequence of site indices, mutated in place.
        objective_sites: mapping of site code -> {activity: {'min'/'max': ...}};
            only the 'NICU' entry is read.
        nearby_sites: mapping of home -> candidate site indices, nearest first.
        cumulative_probs: cumulative rank probabilities for weighted_site.
        home_lsoas: homes in gene order.
        maximization_percentage: probability of re-drawing a home that is not
            assigned to a 'min' site which is among its 3 nearest.
        minimization_percentage: probability of re-drawing a home currently
            assigned to a 'max' site.

    Returns:
        The same individual object.

    Raises:
        ValueError: if a site code in `objective_sites` is not in
            current_site_list.

    NOTE(review): the 'max' branch re-draws with weighted_site, which can pick
    the same site again. The original built an `options` list that excluded
    the target site but never used it (and it compared site indices with the
    site code). Confirm whether exclusion should be enforced.
    NOTE(review): a site with both 'min' and 'max' only has 'min' applied.
    """
    # Resolve each objective site's index once rather than for every home
    targets = [
        (current_site_list.index(site_code), objectives.get('NICU', {}))
        for site_code, objectives in objective_sites.items()
    ]

    for i, assigned_site in enumerate(individual):
        home = home_lsoas[i]
        # assigned_site is the gene value at the start of this home's turn; it
        # is deliberately not refreshed after a re-draw (unchanged behaviour).
        for sc_id, nicu_objective in targets:
            # Lower limiting logic: pull nearby homes towards the target site
            if 'min' in nicu_objective:
                if assigned_site != sc_id and random.random() < maximization_percentage:
                    # Only when the target site is among the top 3 for this home
                    if sc_id in nearby_sites[home][:3]:
                        individual[i] = weighted_site(home, nearby_sites, cumulative_probs)

            # Upper limiting logic: move homes off the target site
            elif 'max' in nicu_objective and assigned_site == sc_id:
                if random.random() < minimization_percentage:
                    individual[i] = weighted_site(home, nearby_sites, cumulative_probs)
    return individual

def weighted_mutation_function(individual, indpb, cumulative_probs, home_lsoas, objective_stagnation_threshold, activity_limits):
    """Mutate genes using the weighted nearby-site distribution, in place.

    Each gene is re-drawn with weighted_site with probability `indpb`. When
    the global generations_since_improvement has reached
    `objective_stagnation_threshold` and `activity_limits` is non-empty,
    forceful_mutation is applied as well.

    Returns:
        One-element tuple containing the individual.
    """
    for gene_pos in range(len(individual)):
        if random.random() < indpb:
            individual[gene_pos] = weighted_site(home_lsoas[gene_pos], nearby_sites, cumulative_probs)

    # Apply forceful mutation if stagnation is detected
    stagnated = generations_since_improvement >= objective_stagnation_threshold
    if stagnated and activity_limits:
        individual = forceful_mutation(individual, activity_limits, nearby_sites, cumulative_probs, home_lsoas)

    return (individual,)

# def weighted_mutation_function(individual, indpb, cumulative_probs, home_lsoas):
#     for i, site_index in enumerate(individual):
#         if random.random() < indpb:
#             home = home_lsoas[i]
#             home_nearby_sites = nearby_sites[home]
#             weighted_index = weighted_random_choice(cumulative_probs)
#             weighted_site_code = weighted_site(home, nearby_sites, cumulative_probs)
#             original_site = individual[i]
#             individual[i] = weighted_site_code
#             # print(f"Mutated Home: {home}, Original Site: {original_site}, New Site: {nearest_site_code}, Weighted Index: {weighted_index}")
#     return individual,

def evaluate_with_penalty(individual):
    """Evaluate an individual, penalising it when activity limits are violated.

    Feasible individuals get eval_func's values unchanged. Infeasible ones
    get a tuple with the exponential penalty added to each objective.
    """
    fitness_values = eval_func(individual)
    if is_feasible(individual):
        return fitness_values
    penalties = exponential_penalty(individual)
    return tuple(fv + pv for fv, pv in zip(fitness_values, penalties))

# Create a partial function that has activity_focus pre-specified
# (only referenced by the commented-out registration below)
eval_func_focused = partial(eval_func, activity_focus=activity_focus)

# Fitness evaluation: eval_func plus the exponential constraint penalty
toolbox.register("evaluate", evaluate_with_penalty)
# Alternatives kept for reference:
# toolbox.register("evaluate", eval_func_focused)
# toolbox.decorate("evaluate", tools.DeltaPenalty(is_feasible, 7.0, distance_to_feasibility))

# Crossover operator: DEAP's two-point crossover
toolbox.register("mate", tools.cxTwoPoint)

# Generate reference points for NSGA3
# Parameters
NOBJ = 9  # number of objectives (matches the nine values returned by eval_func)
P = [2, 1]  # one entry per layer of reference points, paired with SCALES below
SCALES = [1, 0.5]

# Create one set of reference points per (p, scale) pair, combine them and remove duplicates
ref_points = [tools.uniform_reference_points(NOBJ, p, s) for p, s in zip(P, SCALES)]
ref_points = np.concatenate(ref_points, axis=0)
_, uniques = np.unique(ref_points, axis=0, return_index=True)
ref_points = ref_points[uniques] 

# Genealogy tracker; main() feeds it the initial population
history = tools.History()

# ADAPTIVE STRATEGY FOR MUTATION RATE

def adapt_mutation_rate_based_on_stagnation(generations_since_improvement, threshold, initial_mutation_prob, max_mutation_prob):
    """Return the mutation probability, raised once the search has stagnated.

    Up to and including `threshold` generations without improvement the
    initial probability is returned. Beyond that it is scaled by
    (1 + generations_since_improvement / threshold), capped at
    `max_mutation_prob`.
    """
    if not generations_since_improvement > threshold:
        return initial_mutation_prob
    # Increase mutation probability up to a maximum
    boosted = initial_mutation_prob * (1 + generations_since_improvement / threshold)
    return min(boosted, max_mutation_prob)

# Default per-gene mutation probability
individual_mutation_prob_amt = 0.3

def adapt_individual_mutation_rate_based_on_stagnation(generations_since_improvement, threshold, individual_mutation_prob, individual_mutation_max_prob):
    """Return the per-gene mutation probability, raised once the search has stagnated.

    Args:
        generations_since_improvement: generations without improvement.
        threshold: number of stagnant generations tolerated before the rate rises.
        individual_mutation_prob: base per-gene mutation probability.
        individual_mutation_max_prob: upper bound for the adapted probability.

    Returns:
        `individual_mutation_prob` while stagnation is within the threshold,
        otherwise that value scaled by
        (1 + generations_since_improvement / threshold), capped at
        `individual_mutation_max_prob`.
    """
    if generations_since_improvement > threshold:
        # Increase mutation probability up to a maximum
        return min(individual_mutation_prob * (1 + generations_since_improvement / threshold), individual_mutation_max_prob)
    else:
        # Fixed: this branch returned the unrelated global initial_mutation_prob
        # (the population-level rate) instead of the per-gene base rate.
        return individual_mutation_prob
    
def calculate_diversity(front):
    """Return the mean Euclidean distance between consecutive front members.

    Distances are measured between the fitness values of neighbours in the
    order given. Fronts with fewer than two members have diversity 0.
    """
    if len(front) < 2:
        return 0

    gaps = [
        np.linalg.norm(np.array(front[pos].fitness.values) - np.array(front[pos + 1].fitness.values))
        for pos in range(len(front) - 1)
    ]
    return np.mean(gaps)

def has_pareto_front_improved(current_front, previous_front, diversity_threshold):
    """Decide whether the Pareto front improved since the previous snapshot.

    Improvement means any of: there is no previous front; the front has
    grown; or the front is larger than `diversity_threshold` and its
    diversity (see calculate_diversity) has increased.
    """
    if previous_front is None:
        return True

    current_size = len(current_front)
    if current_size > len(previous_front):
        return True

    # Diversity is only compared once the front is larger than the threshold
    if not current_size > diversity_threshold:
        return False

    current_diversity = calculate_diversity(current_front)
    previous_diversity = calculate_diversity(previous_front)
    if current_diversity > previous_diversity:
        return True
    return False

# Run-control settings read by main()
initial_diversity_threshold = 0.10  # multiplied by len(pop) * gen to get the per-generation diversity threshold
stagnation_limit = 20  # stop after this many generations without Pareto-front improvement
max_number_generations = 10000  # hard cap on generations
stagnation_threshold = 10  # stagnant generations tolerated before mutation rates are raised
scenario = []

# Generations since the Pareto front last improved; reset and updated by main(),
# read by the mutation operators
generations_since_improvement = 0

def _build_objective_statistics():
    """Return a MultiStatistics with one summary figure per objective."""
    # (chapter name, registered field name, reducer, objective index)
    spec = [
        ("time", "avg_time", np.mean, 0),
        ("prop", "prop_within_30_mins", np.max, 1),
        ("max_dist", "max_distance", np.mean, 2),
        ("large_sites", "large_sites", np.max, 3),
        ("smallest_site", "smallest_site", np.max, 4),
        ("largest_site", "largest_site", np.max, 5),
        ("thirty_and_large", "30_and_large", np.max, 6),
        ("large_nicu", "large_nicu", np.max, 7),
        # Fixed: consolidation is objective 8; it previously read index 7 and
        # so duplicated the large_nicu statistic.
        ("consolidation", "consolidation", np.max, 8),
    ]
    chapters = {}
    for chapter, field, reducer, objective_idx in spec:
        # Bind the index as a default argument so each key reads its own objective
        stats = tools.Statistics(key=lambda ind, idx=objective_idx: ind.fitness.values[idx])
        stats.register(field, reducer)
        chapters[chapter] = stats
    return tools.MultiStatistics(**chapters)


def main():
    """Run the evolutionary search until it stagnates or hits the generation cap.

    Uses NSGA-III selection when the global `nsga3` flag is set, otherwise
    NSGA-II. Each generation: update the archives, adapt the mutation rate to
    stagnation, select and clone offspring, apply crossover and mutation,
    re-evaluate changed individuals, then carry `num_elites` elites over.

    Returns:
        (pop, logbook, hof, paretofront, bestie): the final population, the
        statistics logbook, the hall of fame, the Pareto front and the single
        best individual according to tools.selBest.

    Side effects:
        Resets and updates the global generations_since_improvement, registers
        "select" on the toolbox, and writes progress to stdout.
    """
    global generations_since_improvement

    if nsga3:
        toolbox.register("select", tools.selNSGA3, ref_points=ref_points)
    else:
        toolbox.register("select", tools.selNSGA2)

    mstats = _build_objective_statistics()

    # Initialize and evaluate the population
    pop = toolbox.population(n=pop_num)
    history.update(pop)
    hof = tools.HallOfFame(1)
    paretofront = tools.ParetoFront()
    for ind, fit in zip(pop, map(toolbox.evaluate, pop)):
        ind.fitness.values = fit

    # Create a logbook and record initial statistics
    logbook = tools.Logbook()
    logbook.header = ['gen', 'nevals'] + mstats.fields
    logbook.record(gen=0, nevals=len(pop), **mstats.compile(pop))

    generations_since_improvement = 0
    previous_pareto_front = None
    gen = 0

    while generations_since_improvement < stagnation_limit and gen < max_number_generations:
        gen += 1

        # Update hall of fame and Pareto front with the current population
        hof.update(pop)
        paretofront.update(pop)

        current_pop = len(pop) * gen
        diversity_threshold = initial_diversity_threshold * current_pop

        # Check if the Pareto front has improved
        if has_pareto_front_improved(paretofront, previous_pareto_front, diversity_threshold):
            generations_since_improvement = 0
            # Store a snapshot as the previous front for the next generation
            previous_pareto_front = list(paretofront)
        else:
            generations_since_improvement += 1

        mutation_prob = adapt_mutation_rate_based_on_stagnation(generations_since_improvement, stagnation_threshold, initial_mutation_prob, max_mutation_prob)
        # NOTE(review): this adapted per-gene rate is computed but never handed
        # to the registered mutation operator, so it has no effect at present.
        individual_mutation_prob_amt = adapt_individual_mutation_rate_based_on_stagnation(generations_since_improvement, stagnation_threshold, individual_mutation_prob, individual_mutation_max_prob)

        # Select the next generation individuals and clone them
        offspring = toolbox.select(pop, len(pop) - num_elites)
        offspring = list(map(toolbox.clone, offspring))

        # Apply crossover on neighbouring pairs of offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if np.random.rand() < cross_chance:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        # Apply mutation
        for mutant in offspring:
            if np.random.rand() < mutation_prob:
                toolbox.mutate(mutant)
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        for ind, fit in zip(invalid_ind, map(toolbox.evaluate, invalid_ind)):
            ind.fitness.values = fit

        # Carry the elite individuals of the previous population over
        elites = tools.selBest(pop, num_elites)
        offspring.extend(elites)
        pop[:] = offspring

        # Record statistics for this generation. Fixed: `gen` is already
        # incremented at the top of the loop, so recording gen + 1 skipped
        # generation 1 and shifted every later entry by one.
        logbook.record(gen=gen, nevals=len(invalid_ind), **mstats.compile(pop))

        sys.stdout.write("\r        Generation: {}, Generations Since Improvement: {}  ".format(gen, generations_since_improvement))
        sys.stdout.flush()

    # The archives are updated at the start of each generation, so the final
    # population was never added to them; include it before returning.
    hof.update(pop)
    paretofront.update(pop)

    bestie = tools.selBest(pop, 1)[0]
    print(" ")

    gc.collect()

    return pop, logbook, hof, paretofront, bestie

We can use DEAP's built-in `selBest` tool to select the best individual from the population.

In [21]:
# Translate the best individual (a list of site indices) into (home_code, site_code) pairs
def create_solution_list(bestind, home_lsoas, current_site_list):
    """Return a list of (home_code, site_code) tuples, one per gene.

    Gene `i` holds an index into `current_site_list` and belongs to
    `home_lsoas[i]`.
    """
    return [
        (home_lsoas[position], current_site_list[site_index])
        for position, site_index in enumerate(bestind)
    ]

# def add_solution(activities, solution, solution_number, activity_focus):
    
#     solution_column_name = f'solution_{solution_number}'
#     solution_unit_name = f'solution_{solution_number}_unit'
    
#     # Ensure the solution column exists
#     if solution_column_name not in activities.columns:
#         activities[solution_column_name] = np.nan
    
#     # Convert the solution list to a dictionary for faster lookup
#     solution_dict = dict(solution)
    
#     # Iterate over the activities DataFrame and update where conditions match
#     for idx, row in activities.iterrows():
#         if (not activity_focus or row['CC_Level'] in activity_focus) and row['Der_Postcode_LSOA_Code'] in solution_dict:
#             activities.at[idx, solution_column_name] = solution_dict[row['Der_Postcode_LSOA_Code']]
            
#     # Drop the solution_unit_name column if it exists
#     if solution_unit_name in activities.columns:
#         activities = activities.drop(solution_unit_name, axis=1)
    
#     # Merge and then drop the LSOA column, ensuring the merged column name is correct
#     merged_df = pd.merge(activities, sites[['LSOA', 'UnitCode']], left_on=solution_column_name, right_on='LSOA', how='left')
#     merged_df = merged_df.drop('LSOA', axis=1)
#     merged_df.rename(columns={'UnitCode': solution_unit_name}, inplace=True)
    
#     return merged_df

def add_solution(activities, solution, solution_number, activity_focus, travel_times):
    """Attach one solution's site assignment, travel time and unit code to the activities.

    Three columns are produced for ``solution_number`` = N:
    ``solution_N`` (assigned site code), ``solution_N_travel_time`` and
    ``solution_N_unit`` (``UnitCode`` looked up in the notebook-level ``sites``
    frame via its ``LSOA`` column).

    Args:
        activities: DataFrame with at least ``Der_Postcode_LSOA_Code`` and
            ``CC_Level`` columns. The solution and travel-time columns are
            added to this frame in place before the merge.
        solution: iterable of ``(home_code, site_code)`` pairs.
        solution_number: identifier used to build the column names.
        activity_focus: falsy to assign every activity; otherwise a single
            ``CC_Level`` value or a collection of them, and only rows whose
            ``CC_Level`` matches exactly are assigned.
        travel_times: mapping keyed by ``(home_code, site_code)``; pairs that
            are missing leave the travel time as NaN.

    Returns:
        A new DataFrame: ``activities`` left-merged with the unit codes.
    """
    solution_column_name = f'solution_{solution_number}'
    solution_unit_name = f'solution_{solution_number}_unit'
    solution_travel_time = f'solution_{solution_number}_travel_time'

    # The solution column holds site codes (strings), so it must be object dtype.
    # A float NaN column would need an implicit upcast on assignment and, when no
    # row matches, would make the merge below fail on float-vs-string keys.
    if solution_column_name not in activities.columns:
        activities[solution_column_name] = pd.Series(np.nan, index=activities.index, dtype=object)
    else:
        activities[solution_column_name] = activities[solution_column_name].astype(object)
    activities[solution_travel_time] = np.nan  # always reset for this solution

    # Convert the solution list to a dictionary for fast lookup
    solution_dict = dict(solution)

    home_codes = activities['Der_Postcode_LSOA_Code']
    assigned = home_codes.isin(list(solution_dict))
    if activity_focus:
        # A bare string is one level to match exactly, not a set of substrings.
        if isinstance(activity_focus, str):
            focus_levels = [activity_focus]
        else:
            focus_levels = list(activity_focus)
        assigned &= activities['CC_Level'].isin(focus_levels)

    if assigned.any():
        targets = home_codes[assigned].map(solution_dict)
        # Assign positionally so a non-unique index cannot break alignment.
        activities.loc[assigned, solution_column_name] = targets.to_numpy()
        activities.loc[assigned, solution_travel_time] = [
            travel_times[pair] if pair in travel_times else np.nan
            for pair in zip(home_codes[assigned], targets)
        ]

    # Drop a stale unit column from an earlier call so the merge does not create suffixed duplicates
    if solution_unit_name in activities.columns:
        activities = activities.drop(solution_unit_name, axis=1)

    merged_df = pd.merge(activities, sites[['LSOA', 'UnitCode']],
                         left_on=solution_column_name, right_on='LSOA', how='left')
    merged_df = merged_df.drop('LSOA', axis=1)
    merged_df = merged_df.rename(columns={'UnitCode': solution_unit_name})

    return merged_df

In [22]:
def export_log(solution_id, timestamp):
    """Write the notebook-level ``logs_df`` frame to a gzipped CSV under ``./Logs``.

    Args:
        solution_id: when truthy, appended to the file name as
            ``_solution_<solution_id>``; otherwise omitted.
        timestamp: string embedded in the file name.

    The ``./Logs`` directory is created if it does not exist, so a finished
    run is not lost to a missing output folder.
    """
    if solution_id:
        log_name = f"./Logs/activities_output_{timestamp}_solution_{solution_id}.csv.gz"
    else:
        log_name = f"./Logs/activities_output_{timestamp}.csv.gz"

    os.makedirs(os.path.dirname(log_name), exist_ok=True)
    # The .gz suffix lets pandas infer gzip compression
    logs_df.to_csv(log_name, index=False)

In [23]:
def export_solutions(activities_with_solutions, financial_year ,timestamp, details):
    """Write the activities-with-solutions frame to a CSV under ``./Data_Output``.

    Args:
        activities_with_solutions: DataFrame to export (written without its index).
        financial_year: label such as ``'19/20'``; the ``/`` is stripped for the file name.
        timestamp: string embedded in the file name after ``AT_``.
        details: run-description string appended to the file name.

    The output directory is created if it does not exist.
    """
    file_name = f"./Data_Output/activities_output_{financial_year.replace('/', '')}_AT_{timestamp}_{details}.csv"
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    activities_with_solutions.to_csv(file_name, index=False)

In [24]:
# Module-level default; the run loop rebinds this to the period being processed.
financial_year = ''


def _financial_year_start(stamp):
    """Return the calendar year in which the financial year containing ``stamp`` begins."""
    return stamp.year if stamp.month >= 4 else stamp.year - 1


def aggregate_results(df, fin_year=None):
    """Count activities per assigned site, solution, care level and financial year.

    Every ``solution_<N>`` column (excluding the ``_unit`` and ``_travel_time``
    companions) is melted into long form, each activity is labelled with its
    financial year (1 April to 31 March, e.g. ``'19/20'``), and the rows for
    one financial year are returned.

    Args:
        df: DataFrame with ``CC_Activity_Date``, ``CC_Level`` and one or more
            ``solution_<N>`` columns.
        fin_year: financial-year label to keep. Defaults to the notebook-level
            ``financial_year`` when omitted.

    Returns:
        DataFrame with columns ``Solution``, ``SolutionNumber``, ``CC_Level``,
        ``Fin_Year`` and ``Activity_Count``, sorted by solution number,
        solution, care level and financial year.
    """
    target_year = financial_year if fin_year is None else fin_year

    solution_columns = [col for col in df.columns
                        if 'solution_' in col and '_unit' not in col and '_travel_time' not in col]

    df_melted = df.melt(id_vars=[col for col in df.columns if col not in solution_columns],
                        value_vars=solution_columns,
                        var_name='SolutionColumn',
                        value_name='Solution')

    df_melted['SolutionNumber'] = df_melted['SolutionColumn'].apply(lambda x: x.split('_')[1])

    dates = pd.to_datetime(df_melted['CC_Activity_Date'])
    df_melted['CC_Activity_Date'] = dates

    # Always cover 18/19 to 21/22, and widen the bins to whatever the data spans so
    # later (or earlier) periods are labelled instead of falling outside the bins.
    first_start, last_start = 2018, 2021
    if dates.notna().any():
        first_start = min(first_start, _financial_year_start(dates.min()))
        last_start = max(last_start, _financial_year_start(dates.max()))

    bin_edges = [pd.Timestamp(year=year, month=4, day=1)
                 for year in range(first_start, last_start + 2)]
    bin_labels = [f"{year % 100:02d}/{(year + 1) % 100:02d}"
                  for year in range(first_start, last_start + 1)]

    # right=False gives [1 April, next 1 April): 1 April opens the new financial
    # year rather than closing the previous one.
    df_melted['Fin_Year'] = pd.cut(dates, bins=bin_edges, labels=bin_labels, right=False)

    # observed=False keeps every financial-year category in the grouped result,
    # independent of the installed pandas version's default.
    grouped = df_melted.groupby(['Solution', 'SolutionNumber', 'CC_Level', 'Fin_Year'],
                                observed=False).size().reset_index(name='Activity_Count')

    sorted_df = grouped.sort_values(by=['SolutionNumber', 'Solution', 'CC_Level', 'Fin_Year'])

    final_df = sorted_df.loc[sorted_df['Fin_Year'] == target_year]

    return final_df

In [25]:
def generate_detail_string(nsga3 = nsga3, restricted_mutation = restricted_mutation
                           , restricted_mutation_depth = restricted_mutation_depth,
                           include_extreme_individual = include_extreme_individual,
                           include_original_sites = include_original_sites):
    """Build the underscore-joined run description used in output file names.

    The string starts with the algorithm name, followed by one label per
    enabled option, and ends with the elite count. ``weighted_mutation`` and
    ``elite_pop`` are read from the notebook globals at call time; the
    parameter defaults are the global values at the time this cell was run.

    Returns:
        A string such as ``"NSGA2_OI_Inc_Num_Elites_10"``.
    """
    labels = ["NSGA3" if nsga3 else "NSGA2"]

    optional_labels = (
        (restricted_mutation, f"Site_Limit_{restricted_mutation_depth}"),
        (weighted_mutation, "Weighted"),
        (include_extreme_individual, "EI_Inc"),
        (include_original_sites, "OI_Inc"),
    )
    labels.extend(text for enabled, text in optional_labels if enabled)

    labels.append(f"Num_Elites_{elite_pop}")
    return '_'.join(labels)

In [26]:
def output_results(results,timestamp,run_detail_string):
    """Write the aggregated results to a CSV under ``./Data_Output`` and report the path.

    The file name combines the notebook-level ``financial_year`` (with ``/``
    removed), the timestamp and the run-description string.

    Args:
        results: DataFrame to export (written without its index).
        timestamp: string embedded in the file name after ``AT_``.
        run_detail_string: run description appended to the file name.

    Returns:
        None. The output directory is created if it does not exist.
    """
    file_parts = ["./Data_Output/activities_output_grouped", financial_year.replace('/', ''), f"AT_{timestamp}"]
    file_parts.append(run_detail_string)
    file_name = '_'.join(file_parts) + ".csv"

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    results.to_csv(file_name, index=False)

    print(f"File output: {file_name}")

In [27]:
def output_map(m,timestamp,solution_number,run_detail_string):
    """Save a map object as an HTML file under ``./Data_Output``.

    The file name combines the notebook-level ``financial_year`` (with ``/``
    removed), the timestamp, the solution number and the run description.

    Args:
        m: object exposing ``save(path)``.
        timestamp: string embedded in the file name after ``AT_``.
        solution_number: identifier embedded as ``Solution_<solution_number>``.
        run_detail_string: run description appended to the file name.

    The output directory is created if it does not exist.
    """
    file_parts = ["./Data_Output/map", financial_year.replace('/', ''), f"AT_{timestamp}"]
    file_parts.append(f"Solution_{solution_number}")
    file_parts.append(run_detail_string)
    file_name = '_'.join(file_parts) + ".html"

    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    m.save(file_name)
    

Then let's run our algorithm and evolve our solutions.

In [28]:
 # https://hubble-live-assets.s3.eu-west-1.amazonaws.com/bapm/file_asset/file/64/LNU_doc_Nov_2018.pdf
 # LNUs over 600 advised to have dedicated tier 3 resource so ideally under
 # LNUs over 400 IC days dedicated resident Tier 2 resource 
 # so surmising that the idealised tier 2 should have between 600 and 400 IC days
# Each scenario is a four-item list:
#   [title, site objectives, proposed additions, proposed restrictions]
#   title        - "<number>: <description>"; the run loop parses the number before the ':'
#   objectives   - {site LSOA code: {activity level: {'min' or 'max': limit}}}
#   additions    - site LSOA codes proposed as additions
#   restrictions - site LSOA codes to remove ('E01006570', Alder Hey, appears in every scenario)
scenarios = [
    ['1: Full run Alder Hey removed', {}, [], ['E01006570']],
    # Burnley
    ['2: Burnley tier 2 IC Days Limit', {'E01024897': {'NICU': {'max': 600}}}, [], ['E01006570']],
    # Preston
    ['3: Preston tier 2 IC Days Limit', {'E01025300': {'NICU': {'max': 600}}}, [], ['E01006570']],
    # Burnley
    ['4: Burnley tier 1 ICU Limit', {'E01024897': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Preston
    ['5: Preston tier 1 ICU Limit', {'E01025300': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Blackburn added
    ['6: Blackburn added as tier 3', {'E01012632': {'NICU': {'min': 1000}}}, ['E01012632'], ['E01006570']],
    ['7: Blackburn added as tier 2', {'E01012632': {'NICU': {'max': 600}}}, ['E01012632'], ['E01006570']],
    # Blackburn added; Alder Hey and Burnley removed
    ['8: Blackburn added as tier 3 Burnley removed',
     {'E01012632': {'NICU': {'min': 1000}}}, ['E01012632'], ['E01006570', 'E01024897']],
    # Blackburn added; Alder Hey and Preston removed
    ['9: Blackburn added as tier 3 Preston Removed',
     {'E01012632': {'NICU': {'min': 1000}}}, ['E01012632'], ['E01006570', 'E01025300']],
    # Blackburn added; Burnley and Preston capped
    ['10: Blackburn added as tier 3, Preston and Burnley restricted to tier 1',
     {'E01012632': {'NICU': {'min': 2000}},   # Blackburn
      'E01024897': {'NICU': {'max': 400}},    # Burnley
      'E01025300': {'NICU': {'max': 400}}},   # Preston
     ['E01012632'], ['E01006570']],
    # Tameside
    ['11: Tameside restricted to tier 1', {'E01005944': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Bolton
    ['12: Bolton restricted to tier 1', {'E01004880': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Wigan
    ['13: Wigan (Royal Albert) restricted to tier 1', {'E01006370': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # St Marys
    ['14: St Marys (MFT) restricted to tier 1', {'E01005062': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Oldham
    ['15: Oldham restricted to tier 1', {'E01005354': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Leighton
    ['16: Leighton restricted to tier 1', {'E01018480': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Arrowe Park
    ['17: Arrowe Park (Wirral) restricted to tier 1', {'E01007251': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # COC
    ['18: COC restricted to tier 1', {'E01018377': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Alder Hey and Arrowe Park removed
    ['19: Arrowe Park (Wirral) removed', {}, [], ['E01006570', 'E01007251']],
    # Alder Hey and COC removed
    ['20: COC removed', {}, [], ['E01006570', 'E01018377']],
    # Warrington
    ['21: Warrington restricted to tier 1', {'E01012457': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Whiston
    ['22: Whiston restricted to tier 1', {'E01006499': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Alder Hey and Warrington removed
    ['23: Warrington removed', {}, [], ['E01006570', 'E01012457']],
    # Alder Hey and Whiston removed
    ['24: Whiston removed', {}, [], ['E01006570', 'E01006499']],
]


In [29]:
# Assigning to `scenarios` here replaces whatever list the name was bound to before this cell runs.
# Each scenario is a four-item list:
#   [title, site objectives, proposed additions, proposed restrictions]
#   title        - "<number>: <description>"; the run loop parses the number before the ':'
#   objectives   - {site LSOA code: {activity level: {'min' or 'max': limit}}}
#   additions    - site LSOA codes proposed as additions
#   restrictions - site LSOA codes to remove ('E01006570', Alder Hey, appears in every scenario)
scenarios = [
    # Arrowe Park
    ['25: Arrowe Park (Wirral) restricted to tier 2', {'E01007251': {'NICU': {'max': 600}}}, [], ['E01006570']],
    # Bolton
    ['26: Bolton restricted to tier 2', {'E01004880': {'NICU': {'max': 600}}}, [], ['E01006570']],
    # Oldham
    ['27: Oldham restricted to tier 2', {'E01005354': {'NICU': {'max': 600}}}, [], ['E01006570']],
    # Wythenshaw
    ['28: Wythenshaw restricted to tier 1', {'E01005070': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # North Manchester
    ['29: North Manchester restricted to tier 1', {'E01005164': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Stepping Hill
    ['30: Stepping Hill restricted to tier 1', {'E01005801': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Disabled: '31: Rochdale restricted to tier 1' - its draft objective was
    # {'E01007251': {'NICU': {'max': 400}}}, i.e. the same code scenario 25 uses for Arrowe Park.
    # Blackpool
    ['32: Blackpool restricted to tier 1', {'E01012722': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Lancaster
    ['33: Lancaster restricted to tier 1', {'E01033071': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Alder Hey and Barrow removed
    ['34: Barrow removed', {}, [], ['E01006570', 'E01019155']],
    # Alder Hey and Blackpool removed
    ['35: Blackpool removed', {}, [], ['E01006570', 'E01012722']],
    # Alder Hey and Lancaster removed
    ['36: Lancaster removed', {}, [], ['E01006570', 'E01033071']],
    # Alder Hey and Leighton removed
    ['37: Leighton removed', {}, [], ['E01006570', 'E01018480']],
    # Alder Hey and Macclesfield removed
    ['38: Macclesfield removed', {}, [], ['E01006570', 'E01018616']],
    # Southport and Ormskirk
    ['39: Southport and Ormskirk restricted to Level 1', {'E01025488': {'NICU': {'max': 400}}}, [], ['E01006570']],
    # Alder Hey and Southport and Ormskirk removed
    ['40: Southport and Ormskirk removed', {}, [], ['E01006570', 'E01025488']],
]


In [31]:
# ---------------------------------------------------------------------------
# Experiment driver: for every financial year in `periods`, and for every
# combination of the boolean switches (NSGA version, include original sites,
# weighted mutation), run each entry of `scenarios` through `main()` and
# export the per-activity assignments, the evolution log and a run summary.
# Many names used below (toolbox, creator, main, data_prep, site_codes,
# travel_times_dict, startrange, endrange, ...) are defined in other cells.
# ---------------------------------------------------------------------------

# Number of boolean switches expanded by itertools.product below (2**num_switches runs per year)
num_switches = 3
periods = ['19/20','20/21','21/22','22/23','23/24']
mutation_limits = [20]
pop_num = 100
# Placeholder; rebound per scenario from the scenario's fourth element
restricted_sites = [] 
include_extreme_individual = False
# Rebound per switch combination inside the loop below
include_original_sites = True
elite_pop = int(pop_num * 0.1)
proportion_weighted= 0.2
base_penalty = 0.1 # for objective feasibility
# Placeholder; rebound per entry of activity_focus_list (or None when that list is empty)
activity_focus = []
# activity_focus_list = ['NICU',['HDU', 'SCBU']]
activity_focus_list = []
objective_stagnation_threshold = 20
# Rebound per switch combination inside the loop below
weighted_mutation = False

restricted_mutation_depth = mutation_limits[0]

# the maximum number of generations to run the evolution for
max_number_generations = 1000
# number of generations that the pareto front is stagnant before stopping
stagnation_limit = 120 # stagnation before stopping
# the number of generations to wait with a stagnant pareto front before increasing the mutation rate
stagnation_threshold = 10 # to increase mutation rate after x generations

start = datetime.now()
print(f'Start time: {start}')

# Column order of the per-run summary written to Run_Details_Log_*.csv
log_columns = ['Start Time', 'Financial Year', 'Mutation Depth', 'Include Original Sites', 'NSGA Version', 
               'Scenario', 'Proposed Additions', 'Current Site List', 
               'Index of Best Individual', 'Fitness of Best Individual']

for year in periods:
    # Rebinds the notebook-level `financial_year` read by the output helpers
    financial_year = year
    start_date, end_date = get_fin_year_dates(financial_year)
    
    # check the dataset covers the period
    if start_date >= startrange and end_date <= endrange:
        # Activities falling inside this financial year; solution columns are appended to this frame
        activities_with_solutions = activities.loc[(activities['CC_Activity_Date'] >= start_date) & (activities['CC_Activity_Date'] <= end_date)].copy().reset_index(drop=True)
        
        # data_prep
        # NOTE(review): data_prep receives the full `activities` frame plus the date range,
        # not the filtered copy above - presumably it filters internally; confirm.
        filtered_activities, num_homes, num_sites, most_frequent_sites, home_lsoas, home_activities, home_populations = data_prep(activities, start_date, end_date, site_codes, int_to_activity)
        
        # One gene per home: an individual is num_homes draws of toolbox.random_site
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.random_site, num_homes)
        
        toolbox.register("population", init_population, prop_weighted = proportion_weighted)
        
        # Every True/False combination of the three switches, in the order
        # (nsga3, include_original_sites, weighted_mutation)
        for switch_tuple in itertools.product([False, True], repeat=num_switches):
            nsga3 = switch_tuple[0] 
            include_original_sites = switch_tuple[1] 
            weighted_mutation = switch_tuple[2] 
            nearby_sites = {}
            num_elites = elite_pop

            # Fresh summary table per switch combination
            run_details_df = pd.DataFrame(columns=log_columns)
        
            
            # `restricted_mutation` is not assigned in this cell; it is read from the notebook globals
            print(
                f"Run as {financial_year} using "
                f"{'NSGA3' if nsga3 else 'NSGA2'}"
                f"{' and EI included' if include_extreme_individual else ''}"
                f"{' and OI included' if include_original_sites else ''}"
                f"{f' and mutation distance limit {restricted_mutation_depth}' if restricted_mutation else ''}"
                f"{f' and {elite_pop} Elites'}"
                )
            
            # One timestamp and detail string shared by every file written for this switch combination
            now = datetime.now()
            timestamp = now.strftime("%Y%m%d_%H%M%S")
            deets = generate_detail_string(nsga3=nsga3, restricted_mutation=restricted_mutation,
                                           restricted_mutation_depth=restricted_mutation_depth,
                                           include_extreme_individual=include_extreme_individual,
                                           include_original_sites=include_original_sites)

            # `objectives` is a whole scenario entry: [title, objectives, additions, restrictions].
            # The enumerate counter `solution` is unused; the number is parsed from the title instead.
            for solution, objectives in enumerate(scenarios, 1):
                # solution_number = solution
                scenario = objectives [0]
                solution_number_str, _ = scenario.split(':', 1) 
                solution_number = int(solution_number_str)
                activity_limits = objectives [1]
                proposed_additions = objectives [2]
                restricted_sites = objectives [3]

                
                # Check and register the appropriate mutation function based on the conditions
                if restricted_mutation:
                    # NOTE(review): `nearby_sites` is passed here but only rebuilt further down, after
                    # registration; if register captures the argument at call time, the mutation
                    # function sees {} (or the previous scenario's mapping) - confirm.
                    toolbox.register("mutate", restricted_mutNearbyInt, indpb=individual_mutation_prob_amt, nearby_passed_sites=nearby_sites)
                    print("    Registered restricted mutation function")
                elif weighted_mutation:
                    toolbox.register("mutate", weighted_mutation_function, indpb=individual_mutation_prob_amt
                                    , cumulative_probs = cumulative_probs, home_lsoas=home_lsoas
                                    ,objective_stagnation_threshold = objective_stagnation_threshold, activity_limits=activity_limits 
                                    )
                    print("    Registered weighted mutation function")
                else:
                    # NOTE(review): `up` is derived from num_sites (from data_prep), not from the
                    # scenario-specific current_site_list built below - confirm the two agree.
                    toolbox.register("mutate", restricted_mutUniformInt, low=0, up=num_sites-1, indpb=individual_mutation_prob
                                    ,objective_stagnation_threshold = objective_stagnation_threshold, activity_limits=activity_limits )
                    
                # toolbox.decorate("mate",   history.decorator)
                # toolbox.decorate("mutate", history.decorator)  

                # Run once per activity focus, or exactly once with focus None when the list is empty
                for activity in activity_focus_list if activity_focus_list else [None]:

                    activity_focus = activity
                
                    # NOTE(review): `current_site_list` is read on the next line before it is assigned
                    # on the line after. On the first pass it must already exist from earlier kernel
                    # state (otherwise NameError); on later passes it is the previous scenario's list.
                    # Confirm which list the membership test is meant to use.
                    restricted_site_indices = {site_codes.index(code) for code in restricted_sites if code in current_site_list}
                    current_site_list = prepare_site_list(site_codes, proposed_additions, restricted_sites)
                    nearby_sites = {home: get_nearby_sites(home, len(current_site_list)) for home in home_lsoas}

                    # NOTE(review): this overwrites the notebook-level depth and nothing in this cell
                    # resets it, so later non-weighted runs log this value as 'Mutation Depth'.
                    if weighted_mutation:
                        restricted_mutation_depth = len(current_site_list)

                    # `history` is created but not used further in this cell (decorators above are commented out)
                    history = tools.History()
                    # Notebook-level frame that export_log() writes out
                    logs_df = create_logs_df()
                    print(f"    Solution number: {solution_number}")
                    print(f"        Current site list: {current_site_list}")
                    print(f"        Scenario: {scenario}")
                    print(f"        Proposed Additions: {proposed_additions}")
                    print(f"        Site Objectives: {activity_limits}")
                    print(f"        Activity Focus: {activity_focus}")
                    
                    # main() takes no arguments; presumably it reads the configuration set above
                    # from the notebook globals - verify against its definition.
                    pop, log, hof, paretofront, best = main()
                    best_index = pop.index(best)

                    print(f"        Index of the best individual: {best_index}")
                    print(f"        Fitness: {pop[best_index].fitness.values}")

                    log_entry = {
                                'Start Time': start.strftime("%Y-%m-%d %H:%M:%S"),
                                'Financial Year': financial_year,
                                'Mutation Depth': restricted_mutation_depth,
                                'Include Original Sites': include_original_sites,
                                'NSGA Version': 'NSGA3' if nsga3 else 'NSGA2',
                                'Scenario': scenario,
                                'Proposed Additions': proposed_additions,
                                'Current Site List': current_site_list,
                                'Index of Best Individual': best_index,
                                'Fitness of Best Individual': pop[best_index].fitness.values
                                }
                    
                    # Append this run's summary row
                    log_entry_df = pd.DataFrame([log_entry]) 
                    run_details_df = pd.concat([run_details_df, log_entry_df], ignore_index=True)
                    
                    # m = plot_assignments_folium(pop[best_index])
                    # output_map(m,timestamp,solution_number,deets)
                    # display(m)
                    
                    # Translate the best individual into (home, site) pairs and add the
                    # solution_<n>, solution_<n>_travel_time and solution_<n>_unit columns
                    home_to_site_mapping = create_solution_list(best, home_lsoas, current_site_list)
                    activities_with_solutions = add_solution(activities_with_solutions, home_to_site_mapping, solution_number, activity_focus, travel_times_dict)
                    export_log(solution_number, timestamp)
                
            # One activities file per switch combination, holding the columns of every scenario run so far
            # for this year (activities_with_solutions is only re-created per financial year)
            export_solutions(activities_with_solutions, financial_year ,timestamp, deets)
            # results = aggregate_results(activities_with_solutions)
            # output_results(results, timestamp, deets)

            run_details_df.to_csv(f"./Data_Output/Run_Details_Log_{deets}_{timestamp}.csv", index=False)

            gc.collect()
    else:
        print(f"No data present for {start_date} to {end_date}")    
    
    gc.collect()  

end = datetime.now()
duration = end - start
print(f'End time: {end}')
print(f'Total duration: {duration}')



Start time: 2024-03-27 12:45:35.496842
Run as 19/20 using NSGA2 and 10 Elites
    Solution number: 25
        Current site list: ['E01007251', 'E01018377', 'E01018480', 'E01006512', 'E01018616', 'E01025488', 'E01012457', 'E01006499', 'E01005062', 'E01005164', 'E01005070', 'E01006370', 'E01004880', 'E01005354', 'E01005801', 'E01005944', 'E01019155', 'E01033071', 'E01025300', 'E01024897', 'E01012722']
        Scenario: 25: Arrowe Park (Wirral) restricted to tier 2
        Proposed Additions: []
        Site Objectives: {'E01007251': {'NICU': {'max': 600}}}
        Activity Focus: None
        Added 20 weighted individuals and 80 random individuals
        Generation: 1000, Generations Since Improvement: 0   
        Index of the best individual: 0
        Fitness: (0.20144059005939893, 0.872455205057652, 0.5653333333333334, 1.3880655226209049, 0.14365853658536584, 0.7567142857142857, 2.580276030747729, 0.162, 0.2687725894751238)
    Solution number: 26
        Current site list: ['E01007