In [24]:
import pandas as pd
from sklearn.cluster import KMeans
from geopy.distance import geodesic
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [25]:
# Load the dataset
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# looks like a row index that was written into the CSV — confirm with the data owner.
dataset_path = 'df_like_real.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Patient ID,Pickup Location,Dropoff Location,Appointment Time,Medical Conditions
0,1,"(28.43340643775268, 77.13138722372226)","(28.462880966624688, 77.03950512512215)",18:00:00,Heart Disease
1,2,"(28.44928965130358, 77.0191069469172)","(28.53360133587291, 77.1860128834832)",16:00:00,Asthma
2,3,"(28.451872087050404, 77.12603335821352)","(28.457746410467998, 77.11396318545778)",16:00:00,Heart Disease
3,4,"(28.545497735582117, 76.95032705102054)","(28.462880966624688, 77.03950512512215)",13:30:00,Diabetes
4,5,"(28.564781627009793, 77.08259851795242)","(28.462880966624688, 77.03950512512215)",17:30:00,Diabetes


In [26]:
import numpy as np
# Convert appointment times to minutes since midnight
# (the column is parsed once and the parsed values are reused for hour and minute)
appointment_times = pd.to_datetime(df['Appointment Time'])
df['Appointment Time (minutes)'] = appointment_times.dt.hour * 60 + appointment_times.dt.minute
# One-hot encode medical conditions
encoder = OneHotEncoder(sparse_output=False)
encoded_conditions = encoder.fit_transform(df[['Medical Conditions']])
# Create DataFrame for encoded medical conditions.
# It reuses df's index so the column-wise concat below pairs rows one-to-one
# even if df does not carry the default 0..n-1 index.
encoded_conditions_df = pd.DataFrame(
    encoded_conditions,
    columns=encoder.get_feature_names_out(['Medical Conditions']),
    index=df.index,
)
# Combine with the original DataFrame.
# One-hot columns left behind by an earlier run of this cell are dropped first,
# so re-executing the cell does not produce duplicate columns.
df = pd.concat(
    [df.drop(columns=encoded_conditions_df.columns, errors='ignore'), encoded_conditions_df],
    axis=1,
)
# Features for clustering: one column of minutes followed by the one-hot columns.
# NOTE(review): the features are not scaled. Minutes differ by tens or hundreds
# while the one-hot columns are 0/1, so Euclidean distance is dominated by
# appointment time — confirm this weighting is intended.
features = np.hstack((df[['Appointment Time (minutes)']].values, encoded_conditions))

### Step 2: Apply K-means Clustering
# Choose the number of clusters
num_clusters = 10
# Apply K-means (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(features)
# Add cluster labels to DataFrame
df['Cluster'] = kmeans.labels_
# Show the first rows with their cluster labels
df.head()


Unnamed: 0,Patient ID,Pickup Location,Dropoff Location,Appointment Time,Medical Conditions,Appointment Time (minutes),Medical Conditions_Asthma,Medical Conditions_Diabetes,Medical Conditions_Heart Disease,Cluster
0,1,"(28.43340643775268, 77.13138722372226)","(28.462880966624688, 77.03950512512215)",18:00:00,Heart Disease,1080,0.0,0.0,1.0,5
1,2,"(28.44928965130358, 77.0191069469172)","(28.53360133587291, 77.1860128834832)",16:00:00,Asthma,960,1.0,0.0,0.0,3
2,3,"(28.451872087050404, 77.12603335821352)","(28.457746410467998, 77.11396318545778)",16:00:00,Heart Disease,960,0.0,0.0,1.0,3
3,4,"(28.545497735582117, 76.95032705102054)","(28.462880966624688, 77.03950512512215)",13:30:00,Diabetes,810,0.0,1.0,0.0,1
4,5,"(28.564781627009793, 77.08259851795242)","(28.462880966624688, 77.03950512512215)",17:30:00,Diabetes,1050,0.0,1.0,0.0,5


In [27]:
import ast

# Convert 'Pickup Location' and 'Dropoff Location' from strings to tuples.
# Only string values are parsed: ast.literal_eval rejects values that are already
# tuples, so the guard keeps this cell safe to run more than once.
for location_col in ('Pickup Location', 'Dropoff Location'):
    df[location_col] = df[location_col].apply(
        lambda value: ast.literal_eval(value) if isinstance(value, str) else value
    )

In [28]:
import folium
# Plot the results
# One marker colour per cluster label; the palette is indexed modulo its length
# below, so raising num_clusters above len(colors) reuses colours instead of
# raising IndexError.
colors = ['blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue']
# Create a map centered on Gurugram
gurugram_center = [28.4595, 77.0266]  # Approximate center of Gurugram
m = folium.Map(location=gurugram_center, zoom_start=12)

# Add markers for pickup and dropoff locations:
# pickups are coloured by cluster, dropoffs are always red.
for _, row in df.iterrows():
    cluster_color = colors[row['Cluster'] % len(colors)]
    folium.Marker(
        location=row['Pickup Location'],
        popup=f"Pickup: {row['Patient ID']}",
        icon=folium.Icon(color=cluster_color),
    ).add_to(m)
    folium.Marker(
        location=row['Dropoff Location'],
        popup=f"Dropoff: {row['Patient ID']}",
        icon=folium.Icon(color='red'),
    ).add_to(m)

# Save map to an HTML file
m.save('gurugram_map_with_onehot_cluster.html')

In [29]:
# Function to calculate travel time based on coordinates (assuming constant speed)
def calculate_travel_time(coord1, coord2, speed=30):
    """Estimate the travel time in minutes between two coordinates.

    The geodesic distance in kilometres is divided by ``speed`` and the
    result is scaled from hours to minutes, so ``speed`` is in km/h.

    Args:
        coord1: Start point, passed straight to ``geodesic``
            (presumably a (latitude, longitude) pair — confirm against geopy).
        coord2: End point, same form as ``coord1``.
        speed: Constant travel speed in km/h. Defaults to 30.

    Returns:
        Estimated travel time in minutes.
    """
    kilometers = geodesic(coord1, coord2).kilometers
    hours = kilometers / speed
    return hours * 60  # hours -> minutes

# Ensure coordinates are in tuple format
def parse_coordinates(coord_str):
    """Turn a string such as ``"(28.4, 77.1)"`` into a tuple of floats.

    Leading and trailing parentheses are stripped, the remainder is split
    on commas, and every piece is converted with ``float``.

    Args:
        coord_str: Comma-separated numbers, optionally wrapped in parentheses.

    Returns:
        A tuple with one float per comma-separated piece.

    Raises:
        ValueError: If any piece cannot be converted to ``float``.
    """
    inner = coord_str.strip("()")
    return tuple(float(piece) for piece in inner.split(","))

# Apply the parsing function if needed.
# Each value is checked on its own, per column, so the result does not depend on
# the row labelled 0 existing or on both columns being in the same state.
for location_col in ('Pickup Location', 'Dropoff Location'):
    df[location_col] = df[location_col].apply(
        lambda value: parse_coordinates(value) if isinstance(value, str) else value
    )

# Calculate travel time (minutes) from each patient's own pickup to their own dropoff
df['Travel Time'] = [
    calculate_travel_time(pickup, dropoff)
    for pickup, dropoff in zip(df['Pickup Location'], df['Dropoff Location'])
]

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,Patient ID,Pickup Location,Dropoff Location,Appointment Time,Medical Conditions,Appointment Time (minutes),Medical Conditions_Asthma,Medical Conditions_Diabetes,Medical Conditions_Heart Disease,Cluster,Travel Time
0,1,"(28.43340643775268, 77.13138722372226)","(28.462880966624688, 77.03950512512215)",18:00:00,Heart Disease,1080,0.0,0.0,1.0,5,19.148992
1,2,"(28.44928965130358, 77.0191069469172)","(28.53360133587291, 77.1860128834832)",16:00:00,Asthma,960,1.0,0.0,0.0,3,37.649709
2,3,"(28.451872087050404, 77.12603335821352)","(28.457746410467998, 77.11396318545778)",16:00:00,Heart Disease,960,0.0,0.0,1.0,3,2.699255
3,4,"(28.545497735582117, 76.95032705102054)","(28.462880966624688, 77.03950512512215)",13:30:00,Diabetes,810,0.0,1.0,0.0,1,25.302881
4,5,"(28.564781627009793, 77.08259851795242)","(28.462880966624688, 77.03950512512215)",17:30:00,Diabetes,1050,0.0,1.0,0.0,5,24.111153


In [32]:
# Define a function to calculate similarity score based on appointment times and medical conditions
def similarity_score(patient1, patient2):
    """Score how far apart two patients are; a lower score means more alike.

    The score is the absolute difference of the two
    'Appointment Time (minutes)' values, plus a flat 100 when the
    'Medical Conditions' values differ.

    Args:
        patient1: Mapping with 'Appointment Time (minutes)' and
            'Medical Conditions' keys.
        patient2: Mapping with the same keys.

    Returns:
        The time difference, increased by 100 for different conditions.
    """
    minutes_apart = abs(patient1['Appointment Time (minutes)'] - patient2['Appointment Time (minutes)'])
    if patient1['Medical Conditions'] == patient2['Medical Conditions']:
        return minutes_apart
    return minutes_apart + 100  # Higher score for different conditions

# Group patients within each cluster.
# Greedy grouping: the first unassigned patient of a cluster seeds a new group,
# every remaining patient whose similarity_score against the seed is below the
# threshold joins it, and the rest are carried over to seed later groups.
RIDE_SCORE_THRESHOLD = 200  # Example threshold (lower score = more alike)

ride_groups = []
for cluster_id in df['Cluster'].unique():
    cluster_df = df[df['Cluster'] == cluster_id]
    patients = cluster_df.to_dict('records')

    # Create ride groups
    while patients:
        patient = patients.pop(0)
        # Start a fresh group for every seed patient. Keeping one list for the
        # whole cluster would merge all of its patients into a single ride and
        # make the threshold meaningless.
        current_group = [patient]
        remaining_patients = []
        for other_patient in patients:
            score = similarity_score(patient, other_patient)
            if score < RIDE_SCORE_THRESHOLD:
                current_group.append(other_patient)
            else:
                remaining_patients.append(other_patient)
        patients = remaining_patients
        # Record the group as soon as it is complete, one entry per ride
        ride_groups.append(current_group)

# Output ride groups and total travel time.
# The total is the sum of each member's own 'Travel Time' value.
for group_number, members in enumerate(ride_groups, start=1):
    print(f"Ride Group {group_number}:")
    total_minutes = 0
    for member in members:
        total_minutes += member['Travel Time']
        print(f"  Patient ID: {member['Patient ID']}")
    print(f"  Total Travel Time: {total_minutes} minutes\n")

Ride Group 1:
  Patient ID: 1
  Patient ID: 5
  Patient ID: 11
  Patient ID: 56
  Patient ID: 77
  Patient ID: 79
  Patient ID: 85
  Total Travel Time: 139.34996573492498 minutes

Ride Group 2:
  Patient ID: 2
  Patient ID: 3
  Patient ID: 6
  Patient ID: 20
  Patient ID: 22
  Patient ID: 31
  Patient ID: 39
  Patient ID: 44
  Patient ID: 66
  Patient ID: 69
  Patient ID: 82
  Patient ID: 87
  Patient ID: 91
  Patient ID: 94
  Patient ID: 95
  Total Travel Time: 363.9206725532358 minutes

Ride Group 3:
  Patient ID: 4
  Patient ID: 10
  Patient ID: 13
  Patient ID: 14
  Patient ID: 17
  Patient ID: 21
  Patient ID: 24
  Patient ID: 26
  Patient ID: 28
  Patient ID: 30
  Patient ID: 32
  Patient ID: 40
  Patient ID: 41
  Patient ID: 48
  Patient ID: 71
  Patient ID: 72
  Patient ID: 86
  Patient ID: 88
  Patient ID: 92
  Patient ID: 97
  Patient ID: 100
  Total Travel Time: 517.6657869175868 minutes

Ride Group 4:
  Patient ID: 7
  Patient ID: 9
  Patient ID: 12
  Patient ID: 16
  Patie