### Extracting "Brooklyn" borough data from Socrata using SQL query

In [1]:
# Import libraries
from pandas.core import api
import datetime
from sodapy import Socrata
import pandas as pd
import folium
import numpy as np
import seaborn as sns

In [2]:
import datetime
import pandas as pd
from sodapy import Socrata

# Initialize the Socrata client
client = Socrata("data.cityofnewyork.us", None)

# Define the current date and one year ago
today = datetime.date.today()
one_year_ago = today - datetime.timedelta(days=365)

# Set the query parameters for Brooklyn collisions
query_params = (
    f"borough='BROOKLYN' AND "
    f"(number_of_persons_injured>0 OR number_of_persons_killed>0) AND "
    f"crash_date>='{one_year_ago}'"
)

# Query the data for Brooklyn collisions
results = client.get("h9gi-nx95", where=query_params, limit=10000)

# Create a DataFrame and preprocess it
df = pd.DataFrame.from_records(results)
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)

# Drop rows with missing latitude or longitude
df = df.dropna(subset=["latitude", "longitude"])

# Define bounding box for Brooklyn (adjust as necessary)
df = df[(df['latitude'] > 40.5774) & (df['latitude'] < 40.7399)]

# Query ambulance stations in Brooklyn
query_params = "factype='AMBULANCE STATION' AND boro='BROOKLYN'"
ambulance_stations = client.get("ji82-xba5", where=query_params)

# Convert to DataFrame and preprocess
ambulance_stations_df = pd.DataFrame.from_records(ambulance_stations)
ambulance_stations_df['latitude'] = ambulance_stations_df['latitude'].astype(float)
ambulance_stations_df['longitude'] = ambulance_stations_df['longitude'].astype(float)

# Query hospitals in Brooklyn
query_params = "factype='HOSPITAL' AND boro='BROOKLYN'"
hospitals = client.get("ji82-xba5", where=query_params)

# Convert to DataFrame and preprocess
hospitals_df = pd.DataFrame.from_records(hospitals)
hospitals_df['latitude'] = hospitals_df['latitude'].astype(float)
hospitals_df['longitude'] = hospitals_df['longitude'].astype(float)




### Viewing dataframe

In [3]:
print(hospitals_df.columns)


Index(['uid', 'facname', 'addressnum', 'streetname', 'address', 'city', 'boro',
       'borocode', 'zipcode', 'latitude', 'longitude', 'xcoord', 'ycoord',
       'bin', 'bbl', 'cd', 'council', 'ct2010', 'ct2020', 'nta2010', 'nta2020',
       'facgroup', 'facsubgrp', 'factype', 'capacity', 'optype', 'opname',
       'opabbrev', 'overlevel', 'overagency', 'overabbrev', 'datasource',
       'facdomain', 'schooldist', 'policeprct', 'servarea', 'geometry'],
      dtype='object')


In [4]:
print(hospitals_df.head(10))

                                uid                                  facname  \
0  057440531385e338d9e1cb6f50e27b1b        BROOKDALE HOSPITAL MEDICAL CENTER   
1  0ec1c511cbc4a01f5de2c07233ed1b1f          UNIVERSITY HOSPITAL OF BROOKLYN   
2  2fe180969947782aba008d184d74807e                     MOUNT SINAI BROOKLYN   
3  4621b7758fb1daa64f243cb0784c2cf0                         CALVARY HOSPITAL   
4  47005f3b7115b10d6755307ba1f36c8d             KINGS COUNTY HOSPITAL CENTER   
5  67cdabbcfe6113f4639d8c3c7f1807a8            NYU LANGONE HOSPITAL-BROOKLYN   
6  727c880fdc602a54126580501c2305a5                    SOUTH BROOKLYN HEALTH   
7  737404c8395a96d36ed729a9b9ec7251  WOODHULL MEDICAL & MENTAL HEALTH CENTER   
8  82fd6376320f4c013c9e002705aae035    MAIMONIDES MIDWOOD COMMUNITY HOSPITAL   
9  a772f225e69700a2e5a125fc582399da                MAIMONIDES MEDICAL CENTER   

  addressnum       streetname              address      city      boro  \
0          1  BROOKDALE PLAZA    1 BROOKDALE 

### Identify the nearest hospital for each collision in Brooklyn

In [5]:
from geopy.distance import geodesic
import folium
import seaborn as sns
import numpy as np

# Function to calculate the nearest hospital for each collision
def find_nearest_facility(collision, facilities):
    # Check for empty facilities DataFrame
    if facilities.empty:
        return None, float('inf')  # Placeholder if no facilities are available
    
    collision_location = (collision['latitude'], collision['longitude'])
    try:
        # Calculate distances
        distances = facilities.apply(
            lambda x: geodesic(collision_location, (x['latitude'], x['longitude'])).meters, axis=1
        )
        nearest_index = distances.idxmin()
        # Ensure nearest_index is valid
        return facilities.loc[nearest_index, 'facname'], distances[nearest_index]
    except Exception as e:
        print(f"Error in find_nearest_facility function:")
        print(f"Collision Data: {collision}")
        print(f"Facilities Data: {facilities}")
        raise e

# Debugging: Ensure 'facname' column exists in hospitals_df
if 'facname' not in hospitals_df.columns:
    print("Error: 'facname' column is missing from hospitals_df!")
    raise ValueError("'facname' column is missing.")

# Debugging: Strip whitespace from 'facname' to ensure consistency
hospitals_df['facname'] = hospitals_df['facname'].str.strip()

# Assign unique "Set2" colors for each hospital
set2_colors = sns.color_palette("Set2", len(hospitals_df)).as_hex()
hospital_color_dict = {
    hospitals_df.iloc[i]['facname']: set2_colors[i]
    for i in range(len(hospitals_df))
}

# Debugging: Check if NYU Langone Hospital-Brooklyn exists in hospitals_df and hospital_color_dict
print("Hospital names in hospitals_df:")
print(hospitals_df['facname'].unique())
print("Hospital colors dictionary keys:")
print(hospital_color_dict.keys())
if "NYU LANGONE HOSPITAL-BROOKLYN" not in hospital_color_dict:
    print("Error: NYU Langone Hospital-Brooklyn is missing from hospital_color_dict!")
    raise KeyError("NYU Langone Hospital-Brooklyn is not found in hospital_color_dict.")

# Debugging: Check for empty collisions DataFrame
if df.empty:
    print("Collisions DataFrame (df) is empty. Ensure there is valid collision data.")
else:
    try:
        # Calculate the nearest hospital for each collision
        df['nearest_hospital'], df['distance_to_hospital'] = zip(*df.apply(
            lambda x: find_nearest_facility(x, hospitals_df), axis=1
        ))
    except ValueError as e:
        print("Debugging: Some rows returned an incorrect number of values.")
        print(f"Error: {e}")
        raise

# Debugging: Check nearest hospitals assigned to collisions
print("Nearest hospital assignments:")
print(df[['nearest_hospital', 'distance_to_hospital']].head())

# Create a Folium map centered at Brooklyn
map_center = [df['latitude'].mean(), df['longitude'].mean()]
collision_map = folium.Map(location=map_center, zoom_start=12)

# Add hospital markers with color in the popup
for _, row in hospitals_df.iterrows():
    hospital_name = row['facname']
    hospital_color = hospital_color_dict[hospital_name]
    folium.Marker(
        [row['latitude'], row['longitude']],
        popup=(
            f"<b>{hospital_name}</b><br>"
            f"Associated Collision Color: <span style='color:{hospital_color};'>{hospital_color}</span>"
        ),
        icon=folium.Icon(color='blue', icon='hospital', prefix='fa')
    ).add_to(collision_map)


# Add collision markers, using "Set2" colors based on nearest hospital
for _, row in df.iterrows():
    nearest_hospital = row['nearest_hospital']
    collision_color = hospital_color_dict[nearest_hospital]  # Assign colorblind color based on nearest hospital
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,  # Adjust the size of the dot
        color=collision_color,  # Color based on nearest hospital
        fill=True,
        fill_color=collision_color,  # Fill color matches outline
        fill_opacity=0.6
    ).add_to(collision_map)

# Save the map to an HTML file
collision_map.save("brooklyn_collision_map.html")
print("Collision map saved as 'brooklyn_collision_map.html'")


Hospital names in hospitals_df:
['BROOKDALE HOSPITAL MEDICAL CENTER' 'UNIVERSITY HOSPITAL OF BROOKLYN'
 'MOUNT SINAI BROOKLYN' 'CALVARY HOSPITAL' 'KINGS COUNTY HOSPITAL CENTER'
 'NYU LANGONE HOSPITAL-BROOKLYN' 'SOUTH BROOKLYN HEALTH'
 'WOODHULL MEDICAL & MENTAL HEALTH CENTER'
 'MAIMONIDES MIDWOOD COMMUNITY HOSPITAL' 'MAIMONIDES MEDICAL CENTER'
 'BROOKLYN HOSPITAL CENTER - DOWNTOWN CAMPUS'
 'NYU LANGONE HOSPITAL - JOSEPH S. AND DIANE H. STEINBERG AMBULATORY CARE CENTER'
 'INTERFAITH MEDICAL CENTER'
 'NEWYORK-PRESBYTERIAN BROOKLYN METHODIST HOSPITAL'
 'WYCKOFF HEIGHTS MEDICAL CENTER' 'KINGSBROOK JEWISH MEDICAL VILLAGE']
Hospital colors dictionary keys:
dict_keys(['BROOKDALE HOSPITAL MEDICAL CENTER', 'UNIVERSITY HOSPITAL OF BROOKLYN', 'MOUNT SINAI BROOKLYN', 'CALVARY HOSPITAL', 'KINGS COUNTY HOSPITAL CENTER', 'NYU LANGONE HOSPITAL-BROOKLYN', 'SOUTH BROOKLYN HEALTH', 'WOODHULL MEDICAL & MENTAL HEALTH CENTER', 'MAIMONIDES MIDWOOD COMMUNITY HOSPITAL', 'MAIMONIDES MEDICAL CENTER', 'BROOKLYN H

### Determining the location in Brooklyn that would minimise response time.

* Haversine Distance Calculation: computes the great-circle distance between two latitude/longitude points.

* Cost Function: calculates the total cost for a given facility location.<br>

    The cost is defined as the sum of the distances from each collision site to its nearest facility, including the new location.

* Using Hill-Climbing Algorithm: explores the neighborhood of a given starting location (can be hospital) to find a local minimum of the cost function.<br>

    The neighborhood is defined by moving north, south, east, and west by a specified step size.


In [6]:
# Import libraries 
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from math import radians, sin, cos, sqrt, atan2

In [7]:
from geopy.distance import geodesic
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
from joblib import Parallel, delayed

# Function to calculate Haversine distance (faster approach than geopy)
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Radius of Earth in meters
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dlambda = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# Function to calculate total cost for a given facility location
def calculate_cost(collisions, facilities, new_location):
    total_cost = 0
    new_lat, new_lon = new_location
    for _, collision in collisions.iterrows():
        collision_lat, collision_lon = collision['latitude'], collision['longitude']
        min_distance = float('inf')
        # Calculate distance to all existing and new facilities
        for _, facility in facilities.iterrows():
            facility_lat, facility_lon = facility['latitude'], facility['longitude']
            distance = haversine(collision_lat, collision_lon, facility_lat, facility_lon)
            min_distance = min(min_distance, distance)
        # Include distance to the new location
        distance_to_new = haversine(collision_lat, collision_lon, new_lat, new_lon)
        min_distance = min(min_distance, distance_to_new)
        total_cost += min_distance
    return total_cost

# Function to perform hill-climbing
def hill_climbing(collisions, facilities, starting_point, step_size=1000, max_iterations=50):
    current_location = starting_point
    current_cost = calculate_cost(collisions, facilities, current_location)
    for _ in range(max_iterations):
        neighbors = [
            (current_location[0] + step_size / 111000, current_location[1]),  # North
            (current_location[0] - step_size / 111000, current_location[1]),  # South
            (current_location[0], current_location[1] + step_size / (111000 * cos(radians(current_location[0])))),  # East
            (current_location[0], current_location[1] - step_size / (111000 * cos(radians(current_location[0]))))   # West
        ]
        best_neighbor = None
        best_neighbor_cost = current_cost
        for neighbor in neighbors:
            neighbor_cost = calculate_cost(collisions, facilities, neighbor)
            if neighbor_cost < best_neighbor_cost:
                best_neighbor = neighbor
                best_neighbor_cost = neighbor_cost
        if best_neighbor_cost < current_cost:
            current_location = best_neighbor
            current_cost = best_neighbor_cost
        else:
            break  # Local optimum reached
    return current_location, current_cost

# Optimization function to find the best location for a new facility
def optimize_new_facility(collisions, facilities):
    best_location = None
    best_cost = float('inf')
    
    results = Parallel(n_jobs=-1)(
        delayed(hill_climbing)(
            collisions, facilities, (facility['latitude'], facility['longitude'])
        ) for _, facility in facilities.iterrows()
    )
    
    for location, cost in results:
        if cost < best_cost:
            best_location = location
            best_cost = cost
    return best_location, best_cost

# Example Usage:
# 1. Filter the collision data for Brooklyn (we will adjust the query for Brooklyn).
# 2. Ensure that the facilities (e.g., hospitals) are also from Brooklyn.

# Filter collisions DataFrame for Brooklyn (coordinates: approx 40.5774 to 40.7399 latitude and -74.0473 to -73.9105 longitude)
df_brooklyn = df[(df['latitude'] >= 40.5774) & (df['latitude'] <= 40.7399) &
                 (df['longitude'] >= -74.0473) & (df['longitude'] <= -73.9105)]

# Filter the hospitals DataFrame for Brooklyn
hospitals_brooklyn = hospitals_df[(hospitals_df['latitude'] >= 40.5774) & (hospitals_df['latitude'] <= 40.7399) &
                                  (hospitals_df['longitude'] >= -74.0473) & (hospitals_df['longitude'] <= -73.9105)]

# Ensure collision and facilities data are valid
if df_brooklyn.empty or hospitals_brooklyn.empty:
    print("No collision or hospital data found for Brooklyn. Please check the dataset.")
else:
    # Run the optimization to find the best location for a new facility in Brooklyn
    best_location, best_cost = optimize_new_facility(df_brooklyn, hospitals_brooklyn)

    print(f"Best New Location: {best_location}")
    print(f"Total Cost at Best Location: {best_cost:.2f} meters")


Best New Location: (40.610507813063975, -73.9973487341904)
Total Cost at Best Location: 10221524.77 meters


### Incorporate best location (new facility) into map

In [8]:
from geopy.distance import geodesic
import folium
import seaborn as sns
import numpy as np
from joblib import Parallel, delayed
from math import radians, sin, cos, sqrt, atan2

# Haversine function for faster distance calculations
def haversine(lat1, lon1, lat2, lon2):
    R = 6371000  # Earth's radius in meters
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi = radians(lat2 - lat1)
    dlambda = radians(lon2 - lon1)
    a = sin(dphi / 2) ** 2 + cos(phi1) * cos(phi2) * sin(dlambda / 2) ** 2
    return R * 2 * atan2(sqrt(a), sqrt(1 - a))

# Hill-climbing optimization for new facility location
def hill_climbing(collisions, facilities, starting_point, step_size=1000, max_iterations=50):
    current_location = starting_point
    current_cost = calculate_cost(collisions, facilities, current_location)
    for _ in range(max_iterations):
        neighbors = [
            (current_location[0] + step_size / 111000, current_location[1]),  # North
            (current_location[0] - step_size / 111000, current_location[1]),  # South
            (current_location[0], current_location[1] + step_size / (111000 * cos(radians(current_location[0])))),  # East
            (current_location[0], current_location[1] - step_size / (111000 * cos(radians(current_location[0]))))   # West
        ]
        best_neighbor = None
        best_neighbor_cost = current_cost
        for neighbor in neighbors:
            neighbor_cost = calculate_cost(collisions, facilities, neighbor)
            if neighbor_cost < best_neighbor_cost:
                best_neighbor = neighbor
                best_neighbor_cost = neighbor_cost
        if best_neighbor_cost < current_cost:
            current_location = best_neighbor
            current_cost = best_neighbor_cost
        else:
            break  # Local optimum
    return current_location, current_cost

# Calculate cost (total distances)
def calculate_cost(collisions, facilities, new_location):
    total_cost = 0
    new_lat, new_lon = new_location
    for _, collision in collisions.iterrows():
        collision_lat, collision_lon = collision['latitude'], collision['longitude']
        min_distance = float('inf')
        for _, facility in facilities.iterrows():
            facility_lat, facility_lon = facility['latitude'], facility['longitude']
            min_distance = min(min_distance, haversine(collision_lat, collision_lon, facility_lat, facility_lon))
        total_cost += min(min_distance, haversine(collision_lat, collision_lon, new_lat, new_lon))
    return total_cost

# Optimize location for a new facility
def optimize_new_facility(collisions, facilities):
    results = Parallel(n_jobs=-1)(
        delayed(hill_climbing)(
            collisions, facilities, (facility['latitude'], facility['longitude'])
        ) for _, facility in facilities.iterrows()
    )
    best_location, best_cost = min(results, key=lambda x: x[1])
    return best_location, best_cost

# Nearest hospital function
def find_nearest_facility(collision, facilities):
    collision_location = (collision['latitude'], collision['longitude'])
    distances = facilities.apply(
        lambda x: geodesic(collision_location, (x['latitude'], x['longitude'])).meters, axis=1
    )
    nearest_index = distances.idxmin()
    return facilities.loc[nearest_index, 'facname'], distances[nearest_index]

# Brooklyn bounding box coordinates for filtering
brooklyn_lat_min = 40.5774
brooklyn_lat_max = 40.7399
brooklyn_lon_min = -74.0473
brooklyn_lon_max = -73.9105

# Filter collision data for Brooklyn (latitude, longitude constraints)
df_brooklyn = df[(df['latitude'] >= brooklyn_lat_min) & (df['latitude'] <= brooklyn_lat_max) &
                 (df['longitude'] >= brooklyn_lon_min) & (df['longitude'] <= brooklyn_lon_max)]

# Filter hospitals data for Brooklyn (latitude, longitude constraints)
hospitals_brooklyn = hospitals_df[(hospitals_df['latitude'] >= brooklyn_lat_min) & (hospitals_df['latitude'] <= brooklyn_lat_max) &
                                  (hospitals_df['longitude'] >= brooklyn_lon_min) & (hospitals_df['longitude'] <= brooklyn_lon_max)]

# Calculate nearest hospital for each collision in Brooklyn
df_brooklyn['nearest_hospital'], df_brooklyn['distance_to_hospital'] = zip(*df_brooklyn.apply(
    lambda x: find_nearest_facility(x, hospitals_brooklyn), axis=1
))

# Optimize for the best new location for a facility
best_location, best_cost = optimize_new_facility(df_brooklyn, hospitals_brooklyn)

# Create map for Brooklyn
map_center = [df_brooklyn['latitude'].mean(), df_brooklyn['longitude'].mean()]
collision_map = folium.Map(location=map_center, zoom_start=12)

# Assign unique "Set2" colors for each hospital
set2_colors = sns.color_palette("Set2", len(hospitals_brooklyn)).as_hex()
hospital_color_dict = {
    hospitals_brooklyn.iloc[i]['facname']: set2_colors[i]
    for i in range(len(hospitals_brooklyn))
}

# Add hospital markers
for _, row in hospitals_brooklyn.iterrows():
    hospital_name = row['facname']
    hospital_color = hospital_color_dict[hospital_name]
    folium.Marker(
        [row['latitude'], row['longitude']],
        popup=(
            f"<b>{hospital_name}</b><br>"
            f"Associated Collision Color: <span style='color:{hospital_color}'>{hospital_color}</span>"
        ),
        icon=folium.Icon(color='blue', icon='hospital', prefix='fa')
    ).add_to(collision_map)

# Add collision markers
for _, row in df_brooklyn.iterrows():
    nearest_hospital = row['nearest_hospital']
    collision_color = hospital_color_dict[nearest_hospital]
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=3,
        color=collision_color,
        fill=True,
        fill_color=collision_color,
        fill_opacity=0.6
    ).add_to(collision_map)

# Add marker for new facility
folium.Marker(
    location=best_location,
    popup=(
        f"<b>[SUGGESTED] New Facility Location</b><br>"
        f"Latitude: {best_location[0]:.6f}<br>"
        f"Longitude: {best_location[1]:.6f}<br>"
        f"Lowest Cost: {best_cost:.0f} m<br><br>"
       
    ),
    icon=folium.Icon(color='green', icon='plus', prefix='fa')
).add_to(collision_map)

# Save map to HTML file
collision_map.save("brooklyn_new_facility_map.html")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_brooklyn['nearest_hospital'], df_brooklyn['distance_to_hospital'] = zip(*df_brooklyn.apply(
