<a href="https://colab.research.google.com/github/AdrianaCaetano/MasterProject/blob/main/rideshare_workflow_all_days.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rideshare Project - Workflow - All Days
## Create schedules for MTWRFS, MWF, TR, M, T, W, R, F, S, round-trip
## KEEPING PASSENGERS IN THE POOL

Fall/2022

## Check Google Colab GPU Connection

In [None]:
# Check which GPU (if any) this Colab runtime has been assigned.
# Re-run this cell at any time to refresh the answer.

# IPython shell capture: run nvidia-smi and keep its output lines
gpu_info = !nvidia-smi
# Join the captured lines back into one printable block of text
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the output is treated as "no GPU attached"
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Not connected to a GPU


## Install all necessary libraries

In [None]:
# Trying to solve the problem with plotting OSMnx with a different version of matplotlib
#!pip uninstall matplotlib 

# Then, install the stable version
#!pip install -U matplotlib 

# Restart runtime when done

In [None]:
# Install osmnx, networkx and geopandas, plus the geocoding / zip code
# helpers used further down (pgeocode, geopy, pyzipcode).
# NOTE(review): no versions are pinned, so a fresh runtime may pull newer
# releases than the ones this notebook was written against.

!pip install osmnx
!pip install networkx
!pip install geopandas
!pip install pgeocode
!pip install geopy
!pip install pyzipcode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting osmnx
  Downloading osmnx-1.2.2-py2.py3-none-any.whl (92 kB)
[K     |████████████████████████████████| 92 kB 176 kB/s 
[?25hCollecting networkx>=2.8
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 51.7 MB/s 
[?25hCollecting Rtree>=1.0
  Downloading Rtree-1.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 54.8 MB/s 
Collecting geopandas>=0.11
  Downloading geopandas-0.12.1-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 51.0 MB/s 
[?25hCollecting matplotlib>=3.5
  Downloading matplotlib-3.6.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 36.6 MB/s 
[?25hCollecting pandas>=1.4
  Downloading pandas-1.5.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyzipcode
  Downloading pyzipcode-3.0.1.tar.gz (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 8.8 MB/s 
[?25hBuilding wheels for collected packages: pyzipcode
  Building wheel for pyzipcode (setup.py) ... [?25l[?25hdone
  Created wheel for pyzipcode: filename=pyzipcode-3.0.1-py3-none-any.whl size=1932413 sha256=e72edefa40810

In [None]:
from datetime import date, datetime, time, timedelta

from google.colab import files

import pandas as pd
import geopandas as gpd
import osmnx as ox
import networkx as nx
from geopy.geocoders import Nominatim
from pyzipcode import ZipCodeDatabase


# Render matplotlib figures inside the notebook
%matplotlib inline
# Bare expression in the middle of a cell: evaluated but not displayed
ox.__version__
# Cache OSMnx downloads and echo its log messages to the console
ox.config(use_cache=True, log_console=True)

# Initialize objects shared by the helper functions below:
# geolocator is used for reverse geocoding, zcdb for zip-code radius searches
geolocator = Nominatim(user_agent="carpool")
zcdb = ZipCodeDatabase() 

# Retrieve the drive network graph within the limits of the bounding box
# NOTE(review): both paths below live under /content/drive — assumes Google
# Drive is already mounted; no mount call is visible in this part of the notebook
graph_path = '/content/drive/MyDrive/Colab_Notebooks/Rideshare/OpenStreetMaps/box.graphml'

# Load the graph (module-level G is the base network used for routing)
G = ox.load_graphml(graph_path)

# Make CA zip code information available (zip code boundaries shapefile)
file_path = "/content/drive/MyDrive/Colab_Notebooks/Rideshare/CA_zip_boundaries_shapefiles/ZCTA2010.shp"

# Read file using gpd.read_file(); get_route() reads the 'ZCTA', 'LATITUDE'
# and 'LONGITUDE' columns of this GeoDataFrame
ca_zip_gdf = gpd.read_file(file_path)




# Helper functions

Save df as csv file

In [None]:
#Save students df as a csv file


# Save into Google Drive
def upload_csv_to_gdrive(df, file_name, directory='/content/drive/MyDrive/Colab_Notebooks/Rideshare/data/'):
    ''' Write the dataframe content to Google Drive as a csv file (index column omitted)
    Parameters: df: dataframe to save
                file_name: name of the csv file to create
                directory: folder that receives the file; defaults to the
                           project data folder on Google Drive
    Return: None (prints a confirmation with today's date)
    '''
    import os  # local import: only needed to build the destination path

    # os.path.join works whether or not the folder ends with a slash
    file_path = os.path.join(directory, file_name)
    df.to_csv(file_path, index=False)

    print('Uploaded to Google Drive on', date.today())
    

# Download locally
def download_as_csv(df, file_name):
    ''' Save the dataframe as a csv file named file_name (index column omitted),
    then pass that file to google.colab's files.download
    Return: None (prints a confirmation with today's date)
    '''

    # Write the csv first; files.download is given the same file name
    df.to_csv(file_name, index=False)
    files.download(file_name)

    today = date.today()
    print('Downloaded on', today)

limit_radius()

In [None]:
# Limit the radius from locations to campus
def limit_radius(df, radius):
    '''Keep only the locations within a given radius from campus
       PARAMETER: df: dataframe of student locations with 'distance',
                      'stud_count' and 'postal_code' columns
                  radius: float, same unit as the 'distance' column
                          (miles according to the original notes)
       RETURN: (student_count, dif_locations) for the rows that remain
       NOTE: df is modified in place: it is sorted by descending distance
             and every row with distance > radius is dropped.
    '''

    # Farthest locations first
    df.sort_values(by=['distance'], ascending=False, inplace=True)

    # Labels of the rows lying beyond the radius, removed from the caller's frame
    beyond_radius = df.loc[df['distance'] > radius].index
    df.drop(beyond_radius, inplace=True)

    # dif_locations counts the remaining rows
    # NOTE(review): this equals the number of distinct zip codes only if each
    # row holds a different postal_code -- confirm against the input data
    remaining_students = df['stud_count'].sum()
    remaining_rows = len(df['postal_code'])

    return remaining_students, remaining_rows


shortest_route to/from campus

In [None]:
# Check Euclidean Distance between two students
def check_distance(stud_A, stud_B):
    ''' Estimate the straight-line (Euclidean) distance between two students,
    each located through the geocoding of its 'postal_code'
    Return: distance in miles (distance in degrees multiplied by 69.2)'''

    def lon_lat(student):
        # Geocode the student's zip code and read its coordinates
        # NOTE(review): assumes the GeoDataFrame returned by geocode_to_gdf
        # exposes 'longitude' and 'latitude' columns -- confirm
        located = ox.geocoder.geocode_to_gdf(student['postal_code'])
        return located['longitude'], located['latitude']

    x_A, y_A = lon_lat(stud_A)
    x_B, y_B = lon_lat(stud_B)

    # Euclidean distance in degrees, then the fixed degrees-to-miles factor
    degrees_apart = ox.distance.euclidean_dist_vec(y_A, x_A, y_B, x_B)
    return degrees_apart * 69.2


In [None]:
# Outbound

# Get the shortest path from origin to CSUSM
def shortest_route_to_csusm(graph, origin_lat, origin_lng):
    ''' Compute the shortest routes from an origin point to CSUSM
    Parameters: graph: the base network to find the route
                origin_lat: the y(latitude) coordinate
                origin_lng: the x(longitude) coordinate
    Return: the shortest route by distance
            the shortest route by travel time
            travel time as a string ('00:00:00' when networkx finds no path)
            total distance of the route in miles (0 when networkx finds no path)
    '''
    # CSUSM coordinates: x(longitude), y(latitude)
    campus_lng, campus_lat = -117.1587, 33.1298

    # Snap both end points to their nearest nodes in the graph
    destination_node = ox.distance.nearest_nodes(graph, X=campus_lng, Y=campus_lat)
    origin_node = ox.distance.nearest_nodes(graph, X=origin_lng, Y=origin_lat)

    # One route minimising length, one minimising travel time
    route_by_length = ox.shortest_path(graph, origin_node, destination_node, weight='length')
    route_by_time = ox.shortest_path(graph, origin_node, destination_node, weight='travel_time')

    # Travel time in seconds comes from networkx ("nx"), not osmnx ("ox")
    try:
        seconds = nx.shortest_path_length(graph, origin_node, destination_node, weight='travel_time')
    except nx.NetworkXNoPath:
        travel_time_str = '00:00:00'
    else:
        # timedelta renders as "HOURS:MINUTES:SECONDS"
        travel_time_str = str(timedelta(seconds=seconds))

    # Route length in meters, converted to kilometers and then to miles
    try:
        meters = nx.shortest_path_length(graph, origin_node, destination_node, weight='length')
    except nx.NetworkXNoPath:
        distance_in_miles = 0
    else:
        distance_in_miles = (meters / 1000) * 0.62137

    return route_by_length, route_by_time, travel_time_str, distance_in_miles

In [None]:
# Return 

# Get the shortest path from CSUSM to destination
def shortest_route_from_csusm(graph, destination_lat, destination_lng):
    ''' Compute the shortest routes from CSUSM to a destination point
    Parameters: graph: the base network to find the route
                destination_lat: the y(latitude) coordinate
                destination_lng: the x(longitude) coordinate
    Return: the shortest route by distance
            the shortest route by travel time
            travel time as a string ('00:00:00' when networkx finds no path)
            total distance of the route in miles (0 when networkx finds no path)
    '''
    # CSUSM coordinates: x(longitude), y(latitude)
    campus_lng, campus_lat = -117.1587, 33.1298

    # Snap both end points to their nearest nodes in the graph
    origin_node = ox.distance.nearest_nodes(graph, X=campus_lng, Y=campus_lat)
    destination_node = ox.distance.nearest_nodes(graph, X=destination_lng, Y=destination_lat)

    # One route minimising length, one minimising travel time
    route_by_length = ox.shortest_path(graph, origin_node, destination_node, weight='length')
    route_by_time = ox.shortest_path(graph, origin_node, destination_node, weight='travel_time')

    # Travel time in seconds comes from networkx ("nx"), not osmnx ("ox")
    try:
        seconds = nx.shortest_path_length(graph, origin_node, destination_node, weight='travel_time')
    except nx.NetworkXNoPath:
        travel_time_str = '00:00:00'
    else:
        # timedelta renders as "HOURS:MINUTES:SECONDS"
        travel_time_str = str(timedelta(seconds=seconds))

    # Route length in meters, converted to kilometers and then to miles
    try:
        meters = nx.shortest_path_length(graph, origin_node, destination_node, weight='length')
    except nx.NetworkXNoPath:
        distance_in_miles = 0
    else:
        distance_in_miles = (meters / 1000) * 0.62137

    return route_by_length, route_by_time, travel_time_str, distance_in_miles

In [None]:
# convert string into list (helper function for the next function)
def convert_to_list(string):
    ''' Receives a list in a string format and converts it back to a list
    Parameter: string: text such as '[1, 2, 3]' (the brackets are optional)
    Return: list of the comma-separated items as strings; the spacing around
            each item is kept as found. An empty string or '[]' gives [].
    '''
    text = string

    # Strip the enclosing brackets BEFORE splitting, so that a single-element
    # string such as '[5]' loses both of them
    if text.startswith('['):
        text = text[1:]
    if text.endswith(']'):
        text = text[:-1]

    # Nothing left between the brackets: empty list, not ['']
    if not text.strip():
        return []

    return text.split(',')


def find_zips_on_route(graph, route):
    ''' Retrieve coordinates from nodes on the route, then find zip codes for each coord
        Parameters: graph: network whose nodes carry 'y' (latitude) and 'x' (longitude)
                    route: list of node ids, or the same list in string format
        Return: list of the unique zip codes on the route, in route order
                (empty list when route is None)
    '''
    # No route to inspect
    # NOTE(review): guards against callers passing None when no path was found
    if route is None:
        return []

    # A route stored as text is converted back to a list first
    route_lst = convert_to_list(route) if isinstance(route, str) else route

    # (lat, lng) of every node, in route order
    path_coordinates = []
    for node in route_lst:
        node_data = graph.nodes[int(node)]  # make sure the node is a number
        path_coordinates.append((node_data['y'], node_data['x']))

    # Zip codes in route order (we need to preserve the order)
    path_zip_codes = []
    for coord in path_coordinates:
        try:
            location = geolocator.reverse(coord)
            path_zip_codes.append(location.raw['address']['postcode'])
        except Exception:
            # Best effort: skip points with no result or no postcode.
            # Exception (not a bare except) lets KeyboardInterrupt stop the loop.
            continue

    # keep only unique values, first occurrence wins
    return list(dict.fromkeys(path_zip_codes))


Check schedule constraint

In [None]:
# Arrival and Departure Schedule for weekdays

def check_schedule(stud_A, stud_B):
    ''' Check if stud_A and stud_B have the same arrival and departure time
    on every day from Monday to Saturday.
    Return: true or false '''

    daily_checks = (check_schedule_M, check_schedule_T, check_schedule_W,
                    check_schedule_R, check_schedule_F, check_schedule_S)

    # all() stops at the first day that differs
    return all(same_day(stud_A, stud_B) for same_day in daily_checks)


def check_schedule_MWF(stud_A, stud_B):
    ''' Check if stud_A and stud_B have the same arrival and departure time
    on Mondays, Wednesdays, and Fridays.
    Return: true or false '''

    daily_checks = (check_schedule_M, check_schedule_W, check_schedule_F)

    # all() stops at the first day that differs
    return all(same_day(stud_A, stud_B) for same_day in daily_checks)


def check_schedule_TR(stud_A, stud_B):
    ''' Check if stud_A and stud_B have the same arrival and departure time
    on Tuesdays and Thursdays.
    Return: true or false '''

    daily_checks = (check_schedule_T, check_schedule_R)

    # all() stops at the first day that differs
    return all(same_day(stud_A, stud_B) for same_day in daily_checks)


def check_schedule_S(stud_A, stud_B):
    ''' Compare the Saturday arrival ('S_arr') and departure ('S_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('S_arr', 'S_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True


def check_schedule_M(stud_A, stud_B):
    ''' Compare the Monday arrival ('M_arr') and departure ('M_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('M_arr', 'M_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True


def check_schedule_T(stud_A, stud_B):
    ''' Compare the Tuesday arrival ('T_arr') and departure ('T_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('T_arr', 'T_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True


def check_schedule_W(stud_A, stud_B):
    ''' Compare the Wednesday arrival ('W_arr') and departure ('W_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('W_arr', 'W_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True


def check_schedule_R(stud_A, stud_B):
    ''' Compare the Thursday arrival ('R_arr') and departure ('R_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('R_arr', 'R_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True


def check_schedule_F(stud_A, stud_B):
    ''' Compare the Friday arrival ('F_arr') and departure ('F_dep') of two students.
    Return: True when both times match, False otherwise '''

    for column in ('F_arr', 'F_dep'):
        if not (stud_A[column] == stud_B[column]):
            return False
    return True

In [None]:
# Arrival Schedule for weekdays

def check_arrival(stud_A, stud_B):
    ''' Check if stud_A and stud_B arrival time for all weekdays are the same.
    Parameters: stud_A, stud_B: objects exposing M_arr, T_arr, W_arr, R_arr,
                F_arr and S_arr as attributes
    Return: True when the six arrival times match, False otherwise '''

    # Always a real boolean: a mismatch on any day gives False (never None)
    return bool(stud_A.M_arr == stud_B.M_arr
                and stud_A.T_arr == stud_B.T_arr
                and stud_A.W_arr == stud_B.W_arr
                and stud_A.R_arr == stud_B.R_arr
                and stud_A.F_arr == stud_B.F_arr
                and stud_A.S_arr == stud_B.S_arr)


def check_arrival_MWF(stud_A, stud_B):
    ''' Check if stud_A and stud_B arrival time for Mondays, Wednesdays,
    and Fridays are the same.
    Parameters: stud_A, stud_B: objects exposing M_arr, W_arr and F_arr
                as attributes
    Return: True when the three arrival times match, False otherwise '''

    # Always a real boolean: a mismatch on any day gives False (never None)
    return bool(stud_A.M_arr == stud_B.M_arr
                and stud_A.W_arr == stud_B.W_arr
                and stud_A.F_arr == stud_B.F_arr)


def check_arrival_TR(stud_A, stud_B):
    ''' Check if stud_A and stud_B arrival time for Tuesdays
    and Thursdays are the same.
    Parameters: stud_A, stud_B: objects exposing T_arr and R_arr as attributes
    Return: True when both arrival times match, False otherwise '''

    # Always a real boolean: a mismatch on either day gives False (never None)
    return bool(stud_A.T_arr == stud_B.T_arr
                and stud_A.R_arr == stud_B.R_arr)


def check_arrival_S(stud_A, stud_B):
    ''' Compare the Saturday arrival time (S_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.S_arr == stud_B.S_arr)


def check_arrival_M(stud_A, stud_B):
    ''' Compare the Monday arrival time (M_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.M_arr == stud_B.M_arr)


def check_arrival_T(stud_A, stud_B):
    ''' Compare the Tuesday arrival time (T_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.T_arr == stud_B.T_arr)


def check_arrival_W(stud_A, stud_B):
    ''' Compare the Wednesday arrival time (W_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.W_arr == stud_B.W_arr)


def check_arrival_R(stud_A, stud_B):
    ''' Compare the Thursday arrival time (R_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.R_arr == stud_B.R_arr)


def check_arrival_F(stud_A, stud_B):
    ''' Compare the Friday arrival time (F_arr attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.F_arr == stud_B.F_arr)

In [None]:
# Departure Schedule for weekdays

def check_departure(stud_A, stud_B):
    ''' Check if stud_A and stud_B departure time for all weekdays are the same.
    Parameters: stud_A, stud_B: objects exposing M_dep, T_dep, W_dep, R_dep,
                F_dep and S_dep as attributes
    Return: True when the six departure times match, False otherwise '''

    # Always a real boolean: a mismatch on any day gives False (never None)
    return bool(stud_A.M_dep == stud_B.M_dep
                and stud_A.T_dep == stud_B.T_dep
                and stud_A.W_dep == stud_B.W_dep
                and stud_A.R_dep == stud_B.R_dep
                and stud_A.F_dep == stud_B.F_dep
                and stud_A.S_dep == stud_B.S_dep)


def check_departure_MWF(stud_A, stud_B):
    ''' Check if stud_A and stud_B departure time for Mondays, Wednesdays,
    and Fridays are the same.
    Parameters: stud_A, stud_B: objects exposing M_dep, W_dep and F_dep
                as attributes
    Return: True when the three departure times match, False otherwise '''

    # Always a real boolean: a mismatch on any day gives False (never None)
    return bool(stud_A.M_dep == stud_B.M_dep
                and stud_A.W_dep == stud_B.W_dep
                and stud_A.F_dep == stud_B.F_dep)


def check_departure_TR(stud_A, stud_B):
    ''' Check if stud_A and stud_B departure time for Tuesdays
    and Thursdays are the same.
    Parameters: stud_A, stud_B: objects exposing T_dep and R_dep as attributes
    Return: True when both departure times match, False otherwise '''

    # Always a real boolean: a mismatch on either day gives False (never None)
    return bool(stud_A.T_dep == stud_B.T_dep
                and stud_A.R_dep == stud_B.R_dep)


def check_departure_S(stud_A, stud_B):
    ''' Compare the Saturday departure time (S_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.S_dep == stud_B.S_dep)


def check_departure_M(stud_A, stud_B):
    ''' Compare the Monday departure time (M_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.M_dep == stud_B.M_dep)


def check_departure_T(stud_A, stud_B):
    ''' Compare the Tuesday departure time (T_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.T_dep == stud_B.T_dep)


def check_departure_W(stud_A, stud_B):
    ''' Compare the Wednesday departure time (W_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.W_dep == stud_B.W_dep)


def check_departure_R(stud_A, stud_B):
    ''' Compare the Thursday departure time (R_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.R_dep == stud_B.R_dep)


def check_departure_F(stud_A, stud_B):
    ''' Compare the Friday departure time (F_dep attribute) of two students.
    Return: True when the times match, False otherwise '''

    return bool(stud_A.F_dep == stud_B.F_dep)

Check compatibility

In [None]:
def check_same_gender(stud_A, stud_B):
    ''' Compare the 'Gender' value of student A and student B.
    Return: True when they are equal, False otherwise '''

    return bool(stud_A['Gender'] == stud_B['Gender'])


def check_same_college(stud_A, stud_B):
    ''' Compare the 'College' value of student A and student B.
    Return: True when they are equal, False otherwise '''

    return bool(stud_A['College'] == stud_B['College'])


def check_same_age_group(stud_A, stud_B):
    ''' Compare the 'Under_25' value of student A and student B,
    i.e. whether they fall in the same age group.
    Return: True when they are equal, False otherwise '''

    return bool(stud_A['Under_25'] == stud_B['Under_25'])


def check_same_level(stud_A, stud_B):
    ''' Compare the 'Undergrad' value of student A and student B.
    Undergrad == True if ['Senior', 'Junior','Sophomore','Freshman']
    Undergrad == False if ['Graduate', 'Post-Bacc']
    Return: True when they are equal, False otherwise '''

    return bool(stud_A['Undergrad'] == stud_B['Undergrad'])


def check_smoker(stud_A, stud_B):
    ''' Compare the 'Smoker' value of student A and student B
    (both smokers or both non-smokers).
    Return: True when they are equal, False otherwise '''

    return bool(stud_A['Smoker'] == stud_B['Smoker'])

In [None]:
def comp_score(stud_A, stud_B, factor=2):
    ''' Compute the compatibility score between two ride-mates
    Parameters: stud_A and stud_B = potential ride-mates,
                factor = weight for preferences
        Base value:    1     when both students share a feature,
                      -1     when they differ
        Weight:        1     when a student has no preference for the feature,
                   'factor'  when the student flagged a preference for it
    Return score: for age, gender, level and smoking the base value is
                  multiplied by both students' weights; the college base
                  value is added unweighted
    '''

    # A factor of 1 or less becomes 2, a factor above 5 becomes 5
    if factor <= 1:
        factor = 2      # Score range = [-17, 17]
    if factor > 5:
        factor = 5      # Score range = [-101, 101]

    def base_value(same):
        # +1 for a shared characteristic, -1 otherwise
        return 1 if same else -1

    # Base values, in the order: age, gender, level, smoker, college
    base_age = base_value(check_same_age_group(stud_A, stud_B))
    base_gender = base_value(check_same_gender(stud_A, stud_B))
    base_level = base_value(check_same_level(stud_A, stud_B))
    base_smoker = base_value(check_smoker(stud_A, stud_B))
    base_college = base_value(check_same_college(stud_A, stud_B))

    # Preference weights of each student, aligned with the four weighted features
    preference_columns = ('Pref_Age', 'Pref_Gender', 'Pref_Status', 'Pref_NSm')
    weights_A = [factor if stud_A[column] else 1 for column in preference_columns]
    weights_B = [factor if stud_B[column] else 1 for column in preference_columns]
    bases = (base_age, base_gender, base_level, base_smoker)

    # Weighted value of each feature: weight_A * weight_B * base
    w_age, w_gender, w_status, w_NSmoker = [
        weight_A * weight_B * base
        for weight_A, weight_B, base in zip(weights_A, weights_B, bases)
    ]

    # final score = sum(weighted values) + base_college
    return w_age + w_gender + w_status + w_NSmoker + base_college


Check role

In [None]:
def check_driver(stud):
    ''' Tell whether the student can drive: 'Role' is 'D' (driver) or 'E' (either).
    Return: True or False '''

    role = stud['Role']
    # Any other role means the student is a passenger
    return bool(role == 'D' or role == 'E')

In [None]:
def check_passenger(stud):
    ''' Tell whether the student can ride: 'Role' is 'P' (passenger) or 'E' (either).
    Return: True or False '''

    role = stud['Role']
    # Any other role means the student is a driver
    return bool(role == 'P' or role == 'E')

# Workflow Functions

In [None]:
def select_driver(df):
    ''' Select a random driver from the pool df
    Parameter: df: pool of students dataframe with a 'Role' column
    Return: driver's index and driver dict (column name -> value)
    Raise: ValueError when no student in the pool has role 'D' or 'E'
    '''

    # Same eligibility rule as check_driver(): role 'D' (driver) or 'E' (either).
    # Filtering first avoids sampling forever when the pool has no driver left.
    eligible = df[df['Role'].isin(['D', 'E'])]
    if eligible.empty:
        raise ValueError('No student with role D or E left in the pool')

    # Unseeded draw: every eligible student is equally likely
    picked = eligible.sample(1).to_dict(orient='index')

    # The one-row dict is keyed by the driver's index label
    d_ind = next(iter(picked))

    return d_ind, picked[d_ind]


def remove_from_pool(remove_lst, df):
    ''' Remove students from pool
    Parameters: remove_lst: iterable of indices of students to be removed from df
                df: pool of students dataframe, modified in place
    Return: None. Indices that are not in the pool are ignored.
    '''
    labels = list(remove_lst)
    if not labels:
        return

    # One drop for the whole list; errors='ignore' skips missing labels,
    # which the previous per-label try/except KeyError did one by one
    df.drop(index=labels, errors='ignore', inplace=True)


def create_carpool_dict(driver):
    ''' Start a carpool record from the driver's information
        Return: dict with the driver's ID, the result of check_driver_schedule()
                for the driver, the vacant seats (driver's 'Seats') and an
                empty 'candidates' dict to be filled later'''

    carpool = {}
    carpool['driver'] = driver['ID']
    # MODIF: added driver schedule into dict
    carpool['driver_schedule'] = check_driver_schedule(driver)
    carpool['vacant_seats'] = driver['Seats']
    carpool['candidates'] = {}

    return carpool


def _zips_from_text(text):
    ''' Collect the digits of text, in order, into consecutive 5-digit zip codes.
        A trailing group shorter than 5 digits is discarded. '''
    digits = ''.join(char for char in text if char.isdigit())
    return [digits[start:start + 5] for start in range(0, len(digits) - 4, 5)]


def get_route(origin, routes_df):
    ''' Get driver's route from the origin zip code
        Parameters: origin: zip code of the driver (compared as a string)
                    routes_df: dataframe of known routes with 'postal_code' and
                               'zips_route' columns; a newly computed route is
                               appended to it in place
        Return a list with the zip codes that the route crosses
        Raise: IndexError when the route is unknown and the zip code is not in
               the CA zip code dataset (ca_zip_gdf)
    '''
    origin_zip = str(origin)

    # Look the route up among the ones already known
    known = routes_df.loc[routes_df['postal_code'] == origin_zip, ['zips_route']]
    route_lst = known.iloc[0, 0] if len(known) > 0 else []

    # only one element in list: unwrap it
    if len(route_lst) == 1:
        route_lst = route_lst[0]

    # the list is stored as a string: rebuild the list of 5-digit zip codes
    if isinstance(route_lst, str):
        route_lst = _zips_from_text(route_lst)

    if len(route_lst) > 0:
        return route_lst

    # Unknown route: find the shortest path to campus
    print('Route from', origin, 'not found, please wait...')
    start_time = datetime.now()  # store the execution start time

    # Get zip code's coordinates in CA zip code dataset
    zip_coord = ca_zip_gdf.loc[ca_zip_gdf['ZCTA'] == origin_zip, ['LATITUDE', 'LONGITUDE']]
    if len(zip_coord) == 0:
        raise IndexError('Zip code %s not found in the CA zip code dataset' % origin_zip)
    lat = zip_coord.iloc[0, 0]
    lng = zip_coord.iloc[0, 1]

    # compute shortest routes by length/time and travel time/dist
    short_route_by_dist, short_route_by_time, travel_time, travel_dist = shortest_route_to_csusm(G, lat, lng)

    # Fix string format for travel_time: drop the fractional seconds and
    # left-pad to "HH:MM:SS"
    travel_time = travel_time.split('.')[0].zfill(8)

    # Save new route into routes dataframe
    # NOTE(review): the new row label is the current row count -- assumes
    # routes_df has a 0..n-1 index; confirm
    index = len(routes_df.index)
    # stored as a string so that the lookup at the top finds it next time
    routes_df.at[index, 'postal_code'] = origin_zip
    routes_df.at[index, 'latitude'] = lat
    routes_df.at[index, 'longitude'] = lng
    routes_df.at[index, 's_dist_route'] = short_route_by_dist
    routes_df.at[index, 's_time_route'] = short_route_by_time
    routes_df.at[index, 'travel_time'] = travel_time
    routes_df.at[index, 'travel_dist'] = travel_dist

    # find all zip codes on the shortest route by time
    zips_on_route = find_zips_on_route(G, short_route_by_time)

    # save values into df
    routes_df.at[index, 'zips_route'] = zips_on_route

    # total_seconds() keeps the fractional part that the %.3f format expects
    elapsed_time = datetime.now() - start_time
    print("Done in %.3f seconds" % elapsed_time.total_seconds())

    return zips_on_route


def find_zips_around(location, radius=5):
    ''' Using pyzipcode to search in its ZipCodeDatabase (the notebook-level
    `zcdb` handle), find all zip codes around this location within this radius.

    Parameters:
        location: zip code used as the centre of the search
        radius: search radius in miles (default 5)

    Return: a list of zip codes. When the lookup fails for any reason the
    list holds only `location` itself, so the caller can still search there.'''
    # source: https://stackoverflow.com/questions/35047031/could-i-use-python-to-retrieve-a-number-of-zip-code-within-a-radius
    try:
        # ('ZIP', radius in miles)
        return [z.zip for z in zcdb.get_zipcodes_around_radius(location, radius)]
    except Exception:
        # Best-effort fallback, kept from the original. `Exception` (instead of
        # a bare except) lets KeyboardInterrupt / SystemExit propagate.
        return [location]


def find_candidates(df, zips_in_radius):
    ''' Pick the students of the pool whose zip code is one of `zips_in_radius`.

    Side effect: the `Zip_code` column of `df` is rewritten in place as strings.

    Return: candidates dictionary keyed by the dataframe index, one dict of
    column values per student; students whose Role is 'D' (driver only) are
    left out.'''

    # zip codes must be strings for the membership test below
    df['Zip_code'] = df['Zip_code'].apply(str)

    # every student living inside the radius, as {index: {column: value}}
    in_radius = df["Zip_code"].isin(zips_in_radius)
    nearby = df[in_radius].to_dict(orient='index')

    # keep everybody except the students that are only drivers
    return {idx: student for idx, student in nearby.items() if student['Role'] != 'D'}
    

# MODIF: include function
def check_driver_schedule(stud):
    '''Build the string of weekday letters (subset of 'MTWRFS', in that order)
    for the days this student goes to campus.

    `stud` is any mapping exposing '<day>_arr' and '<day>_dep' entries.'''

    # a day counts as a campus day when arrival and departure are not the same
    return ''.join(
        day for day in 'MTWRFS'
        if stud[day + '_arr'] != stud[day + '_dep']
    )


def contains_all(str, lst):
    '''Return 1 when every item of `lst` is found in `str`, otherwise 0.'''
    # NOTE: the first parameter shadows the builtin `str` inside this function;
    # the name is kept because it is part of the signature.
    return int(all(item in str for item in lst))

def compare_schedules(candidates, driver):
    ''' Compare candidate's and driver's schedule

    Every candidate goes through two independent checks:
      1. multi-day round trips: whole week, else MWF, else TR
         (at most one of the three lists gets the candidate);
      2. single-day round trips: the first matching day in the order
         M, T, W, R, F, S (at most one day list gets the candidate).
    MWF / TR and the single days are only tried for the days the driver goes
    to campus (see check_driver_schedule).

    Side effect: candidates that end up in no list at all are deleted from the
    `candidates` dict.

    Return: a dict with lists with the indices of candidates for all weekdays
    round-trip carpool ('week'), MWF round-trip carpool, TR round-trip carpool,
    and one list for each individual weekday ('M', 'T', 'W', 'R', 'F', 'S')'''

    cand_dict = {key: [] for key in ('week', 'MWF', 'TR', 'M', 'T', 'W', 'R', 'F', 'S')}
    remove_lst = []

    # single-day checks, in the order they are tried
    single_day_checks = (
        ('M', check_schedule_M),
        ('T', check_schedule_T),
        ('W', check_schedule_W),
        ('R', check_schedule_R),
        ('F', check_schedule_F),
        ('S', check_schedule_S),
    )

    # MODIF: only check for the days driver goes to school
    driver_schedule = check_driver_schedule(driver)

    for cand, cand_info in candidates.items():
        # --- multi-day round trips -------------------------------------
        # FIX: the previous test `('M' and 'W' and 'F' in driver_schedule)`
        # only evaluated `'F' in driver_schedule` (and the TR one only 'R'),
        # because `and` returns its last operand. contains_all() checks that
        # the driver really goes on every one of the days.
        if check_schedule(driver, cand_info):
            multi_day = 'week'
        elif contains_all(driver_schedule, ['M', 'W', 'F']) and check_schedule_MWF(driver, cand_info):
            multi_day = 'MWF'
        elif contains_all(driver_schedule, ['T', 'R']) and check_schedule_TR(driver, cand_info):
            multi_day = 'TR'
        else:
            multi_day = None

        if multi_day is not None:
            cand_dict[multi_day].append(cand)

        # --- single-day round trips ------------------------------------
        for day, day_matches in single_day_checks:
            if day in driver_schedule and day_matches(driver, cand_info):
                cand_dict[day].append(cand)
                break
        else:
            # No single day matched. Only discard the candidate when no
            # multi-day list holds it either: deleting a listed candidate
            # would leave a dangling index for the callers that look it up
            # in `candidates` afterwards.
            if multi_day is None:
                remove_lst.append(cand)

    # remove candidates which schedule does not match
    for cand in remove_lst:
        del candidates[cand]

    return cand_dict


def compute_comp_score(driver, candidates, cand_lst):
    ''' Score every candidate of `cand_lst` against the driver with comp_score.

    Side effect: the score is stored under the 'score' key of the candidate's
    entry in `candidates`.

    Return: list of candidates sorted from highest to lowest score (candidates
    with equal scores keep their order of appearance in `cand_lst`)'''

    score_of = {}
    for cand in cand_lst:
        candidates[cand]['score'] = comp_score(driver, candidates[cand])
        score_of[cand] = candidates[cand]['score']

    # Highest score first
    return sorted(score_of, key=score_of.get, reverse=True)


def add_candidate_to(carpool, cand_lst, cand_dict, schedule, location):
    ''' Add candidates to carpool with ID, score, and meeting point for specified schedule

    Parameters:
        carpool: carpool dict; its 'candidates' dict and 'vacant_seats' counter
                 are updated in place
        cand_lst: candidate indices in order of preference; candidates that
                  already have a seat in this carpool are dropped from it in place
        cand_dict: maps a candidate index to a dict with at least 'ID' and 'score'
        schedule: schedule label stored with every candidate added
        location: meeting point stored with every candidate added

    Return: a list with indices of candidates that got a seat in this call'''

    seated = carpool['candidates']

    # remove candidates that are already in the carpool
    # FIX: the list used to be edited with remove() while it was being
    # iterated, which skips the element following each removal; a candidate
    # already seated could then be added again, overwriting their entry and
    # using up a seat. Rebuilding the list (still in place) avoids that.
    cand_lst[:] = [cand for cand in cand_lst if cand not in seated]

    remove_lst = []
    for cand in cand_lst:
        if carpool['vacant_seats'] <= 0:
            break
        seated[cand] = {
            'ID': cand_dict[cand]['ID'],
            'score': cand_dict[cand]['score'],
            'schedule': schedule,
            'meeting_point': location,
        }
        remove_lst.append(cand)
        carpool['vacant_seats'] -= 1

    return remove_lst

# MODIF: add carpool_id to parameters
def save_carpool_to(df, carpool_dict, carpool_id): 
    ''' Write the carpool dict as one new row at the end of the carpool dataframe.

    The row is labelled with the current number of rows of `df`. It holds the
    carpool id, the driver and the driver's schedule, direction 'RT', the
    creation timestamp, and for the n-th candidate of the carpool the columns
    candidate_n, score_n and schedule_n. The meeting point of a candidate is
    not saved.'''

    row = len(df.index)

    # carpool-level columns
    carpool_fields = (
        ('carpool_id', carpool_id),  # MODIF: not index +1, but carpool_id
        ('driver_id', carpool_dict['driver']),
        ('driver_schedule', carpool_dict['driver_schedule']),  # MODIF: add driver schedule to df
        ('direction', 'RT'),
        ('create_date', datetime.now()),
    )
    for column, value in carpool_fields:
        df.at[row, column] = value

    # one group of numbered columns per candidate, counted from 1
    candidate_fields = (('candidate_', 'ID'), ('score_', 'score'), ('schedule_', 'schedule'))
    for n, cand in enumerate(carpool_dict['candidates'].values(), start=1):
        for prefix, key in candidate_fields:
            df.at[row, prefix + str(n)] = cand[key]


In [None]:
from pandas.core.dtypes.common import classes_and_not_datetimelike

def create_carpool():
    '''Create one carpool per driver of the pool and print matching statistics.

    Works on the notebook-level dataframes `stud_pool_df` (student pool),
    `routes_df` (passed to get_route) and `carpool_df` (receives one row per
    carpool through save_carpool_to).

    For every driver (Role 'D' or 'E') picked by select_driver:
      * the driver is taken out of the pool with remove_from_pool;
      * the zip codes of the driver's route are visited one by one, starting
        with the driver's own zip code, while the carpool has vacant seats;
      * at each stop, students living within 5 miles whose schedule matches
        are scored and seated, broadest schedule first.
    Passengers are NOT removed from the pool, so they can be matched with
    other drivers; the id of every carpool they join is appended to their
    `carpools_lst` cell.

    Return: tuple (total_carpools, unmatched, riders_drivers, elapsed_time)
        total_carpools: carpools with at least one passenger
        unmatched: passengers without any carpool + drivers without passenger
        riders_drivers: initial pool size minus `unmatched`
        elapsed_time: timedelta spent building the carpools'''
    start_time = datetime.now() # store the execution start time

    # Create carpool while there are available drivers
    total_students = stud_pool_df.shape[0]
    total_drivers = _count_remaining_drivers(stud_pool_df)

    # schedules from most to least matching days
    schedules = ['week', 'MWF', 'TR', 'M', 'T', 'W', 'R', 'F', 'S']

    carpool_id = 0
    # MODIF: do NOT check len(pool_df) to allow all passengers to have a chance with other drivers
    while total_drivers > 0:
        carpool_id += 1

        # Select driver and remove them from the pool
        d_ind, driver = select_driver(stud_pool_df)
        remove_from_pool([d_ind], stud_pool_df)

        # indices of the passengers that get a seat in this carpool
        remove_lst = []

        # Create the carpool with this driver
        carpool_dict = create_carpool_dict(driver)

        # Get driver's route to school
        # FIX: work on a copy. The list is consumed with pop() below, and
        # get_route may hand back the very list it keeps in routes_df, so
        # popping from the original would empty the stored route for the
        # next driver coming from the same zip code.
        route = get_route(driver['Zip_code'], routes_df)
        route_lst = list(route) if route else []

        # make sure the search starts at the driver's own zip code
        if route_lst and driver['Zip_code'] != route_lst[0]:
            route_lst.insert(0, driver['Zip_code'])

        # Find candidates
        while carpool_dict['vacant_seats'] > 0 and len(route_lst) > 0:
            # Get driver's location
            location = route_lst.pop(0)

            if location is None:
                print("Cannot find this location")
                if len(route_lst) > 0:
                    # get another location on this route
                    location = route_lst.pop(0)
                    print("New location:", location)
                else:
                    # the route is exhausted: the loop condition ends the search
                    print('Start over')
                    continue

            # Broaden the search to a radius around this location
            zips_around = find_zips_around(location, 5) # <------------------------- EXPERIMENT RADIUS

            # Find candidates around this location
            candidates = find_candidates(stud_pool_df, zips_around)

            # Compare schedules and find possible matches per schedule
            candidates_dict = compare_schedules(candidates, driver)

            # A candidate listed under a broader schedule is dropped from the
            # narrower ones it covers
            _drop_overlapping_schedules(candidates_dict)

            # Compute compatibility scores and get a sorted list from highest to lowest score for each schedule
            for schedule in schedules:
                candidates_dict[schedule] = compute_comp_score(driver, candidates, candidates_dict[schedule])

            # Add candidates into carpool from most matching days if carpool still has vacant seats
            for schedule in schedules:
                if len(candidates_dict[schedule]) > 0 and carpool_dict['vacant_seats'] > 0:
                    remove_lst.extend(
                        add_candidate_to(carpool_dict, candidates_dict[schedule], candidates, schedule, location)
                    )

        # MODIF: do NOT remove candidates from pool_df; use remove_lst to
        # record this carpool in the carpools_lst of every passenger seated
        for cand in remove_lst:
            # cand is the index from stud_pool_df
            stud_pool_df.at[cand, 'carpools_lst'].append(carpool_id)

        # Save carpool into dataframe
        save_carpool_to(carpool_df, carpool_dict, carpool_id) # MODIF: add carpool_id to parameters

        # update total drivers counting all 'E' and 'D' that are remaining
        total_drivers = _count_remaining_drivers(stud_pool_df)

    # How long did it take to process the whole dataset?
    elapsed_time = datetime.now() - start_time

    print('carpool_df shape', carpool_df.shape)
    print('stud_pool_df shape', stud_pool_df.shape)

    # Count students
    alone_drivers = carpool_df['candidate_1'].isnull().sum()
    # MODIF: all passengers are still in the pool, so count the empty
    # carpools_lst cells to know how many passengers did not find a ride
    unmatched_passengers = sum(1 for carpools in stud_pool_df['carpools_lst'] if len(carpools) == 0)
    unmatched = unmatched_passengers + alone_drivers
    total_carpools = carpool_df.shape[0] - alone_drivers
    riders_drivers = total_students - unmatched

    # total_seconds() keeps the fraction of a second (and any whole days)
    # that the `.seconds` attribute leaves out
    print("%d carpools created in %.2f seconds" % (carpool_df.shape[0], elapsed_time.total_seconds()))
    print("Total students in the pool:", total_students)
    print('Alone drivers:', alone_drivers)
    print('Unmatched passengers:', unmatched_passengers)
    print('Unmatched passengers + alone drivers: %d (%.1f %%)' % (unmatched, _percent(unmatched, total_students)))
    print('Matched passengers + drivers: %d (%.1f %%)' % (riders_drivers, _percent(riders_drivers, total_students)))
    print('Carpools with at least one passenger: %d'% total_carpools)

    return total_carpools, unmatched, riders_drivers, elapsed_time


def _count_remaining_drivers(pool_df):
    '''Number of students in the pool whose Role is 'D' or 'E'.'''
    return pool_df['Role'].isin(['D', 'E']).sum()


def _drop_overlapping_schedules(candidates_dict):
    '''Remove, in place, from each narrower schedule list the candidates that
    are already listed under a broader schedule covering it.'''
    covered_by = (
        ('MWF', ('week',)),
        ('TR', ('week',)),
        ('M', ('week', 'MWF')),
        ('W', ('week', 'MWF')),
        ('F', ('week', 'MWF')),
        ('T', ('week', 'TR')),
        ('R', ('week', 'TR')),
        ('S', ('week',)),
    )
    for narrow, broader in covered_by:
        already_listed = set()
        for key in broader:
            already_listed.update(candidates_dict[key])
        candidates_dict[narrow][:] = [
            cand for cand in candidates_dict[narrow] if cand not in already_listed
        ]


def _percent(part, whole):
    '''Share of `part` in `whole` as a percentage; 0.0 when `whole` is 0.'''
    return (part / whole) * 100 if whole else 0.0

# Putting it all together

### Carpool dict to save results




In [None]:
# Start with an empty dict that will collect the results of the run
results_dict = {}
results_dict

{}

### Create pool, get routes

In [None]:
# Read data into datastructures 
stud_pool_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Rideshare/data/Spring_2022_CA_students_pref.csv')
routes_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/Rideshare/data/routes_from_zip.csv')

# Create a carpool datastructure to save the carpools
df_columns= ['carpool_id', 'driver_id', 'driver_schedule', # MODIF: add driver schedule
             'candidate_1', 'score_1', 'schedule_1', 
             'candidate_2', 'score_2', 'schedule_2', 
             'candidate_3', 'score_3', 'schedule_3', 
            ]
carpool_df = pd.DataFrame(columns= df_columns)

# make zip a string
stud_pool_df['Zip'] = stud_pool_df['Zip'].astype(str) 
routes_df['postal_code'] = routes_df['postal_code'].astype(str)

print("Total students:", len(stud_pool_df))

# School location
csusm_lat = 33.1298
csusm_lng = -117.1587
csusm_zip = 92096

# Limit the number of students by radius distance from campus
radius = 50
print('Radius around CSUSM:', radius)

zip_list_radius = [z.zip for z in zcdb.get_zipcodes_around_radius(csusm_zip, radius)]
print('Locations within', radius, 'miles:', len(zip_list_radius))

# .copy() makes the filtered pool an independent dataframe: later cells rename
# its columns and add a column, which on a plain slice of the full table
# triggers pandas' SettingWithCopyWarning.
stud_pool_df = stud_pool_df[stud_pool_df['Zip'].isin(zip_list_radius)].copy()
print("Students within", radius, 'miles:',stud_pool_df.shape[0])


Total students: 9953
Radius around CSUSM: 50
Locations within 50 miles: 261
Students within 50 miles: 7893


In [None]:
carpool_df

Unnamed: 0,carpool_id,driver_id,driver_schedule,candidate_1,score_1,schedule_1,candidate_2,score_2,schedule_2,candidate_3,score_3,schedule_3


In [None]:
stud_pool_df.columns

Index(['ID', 'Zip', 'Role', 'Seats', 'Gender', 'Under_25', 'Undergrad',
       'Smoker', 'College', 'Pref_Gender', 'Pref_Age', 'Pref_Status',
       'Pref_NSm', 'M_arr', 'M_dep', 'T_arr', 'T_dep', 'W_arr', 'W_dep',
       'R_arr', 'R_dep', 'F_arr', 'F_dep', 'S_arr', 'S_dep'],
      dtype='object')

In [None]:
# Get students from file 
# stud_pool_df = pd.read_csv('/content/candidates100.csv')

In [None]:
# rename column Zip to Zip_code and drop the leftover index column
# ('Unnamed: 0') if the CSV carried one. errors='ignore' keeps this cell from
# raising KeyError when that column is absent, e.g. when the cell is run again.
stud_pool_df = (
    stud_pool_df
    .rename(columns={'Zip': 'Zip_code'})
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
stud_pool_df.columns

Index(['ID', 'Zip_code', 'Role', 'Seats', 'Gender', 'Under_25', 'Undergrad',
       'Smoker', 'College', 'Pref_Gender', 'Pref_Age', 'Pref_Status',
       'Pref_NSm', 'M_arr', 'M_dep', 'T_arr', 'T_dep', 'W_arr', 'W_dep',
       'R_arr', 'R_dep', 'F_arr', 'F_dep', 'S_arr', 'S_dep'],
      dtype='object')

In [None]:
print('Number of students:', stud_pool_df.shape[0])
print('Different locations:', stud_pool_df['Zip_code'].nunique())

Number of students: 7893
Different locations: 171


In [None]:
routes_df.columns

Index(['postal_code', 'latitude', 'longitude', 's_dist_route', 's_time_route',
       'travel_time', 'travel_dist', 'zips_route'],
      dtype='object')

In [None]:
    import numpy as np
    # MODIF: new column holding, for every student, the list of carpools the
    # student belongs to. Each row gets its own (initially empty) list object.
    stud_pool_df["carpools_lst"] = [[] for _ in range(len(stud_pool_df))]
    stud_pool_df.columns

Index(['ID', 'Zip_code', 'Role', 'Seats', 'Gender', 'Under_25', 'Undergrad',
       'Smoker', 'College', 'Pref_Gender', 'Pref_Age', 'Pref_Status',
       'Pref_NSm', 'M_arr', 'M_dep', 'T_arr', 'T_dep', 'W_arr', 'W_dep',
       'R_arr', 'R_dep', 'F_arr', 'F_dep', 'S_arr', 'S_dep', 'carpools_lst'],
      dtype='object')

### Limit the number of students with a random sample

In [None]:
n_stud = 5000
# Never ask for more rows than the pool holds: DataFrame.sample raises
# ValueError in that case (sampling without replacement).
stud_pool_df = stud_pool_df.sample(min(n_stud, len(stud_pool_df)), random_state=42) # enforce same samples for testing
dif_locations = stud_pool_df['Zip_code'].nunique()
print('Number of students:', stud_pool_df.shape[0])
print('Different locations:', dif_locations)

Number of students: 5000
Different locations: 154


In [None]:
# save into csv file to upload into database
download_as_csv(stud_pool_df, 'all_candidates_within_50_miles.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded on 2022-12-07


### Create the carpools

In [None]:
total_carpools, unmatched, riders_drivers, elapsed_time = create_carpool()



In [None]:
# Summary of the run; 'time' is the seconds component of the elapsed timedelta
results_dict = dict(
    total_carpools=total_carpools,
    unmatched=unmatched,
    matched=riders_drivers,
    time=elapsed_time.seconds,
)

print()
results_dict




{'total_carpools': 3924, 'unmatched': 999, 'matched': 6894, 'time': 420}

In [None]:
carpool_df

Unnamed: 0,carpool_id,driver_id,driver_schedule,candidate_1,score_1,schedule_1,candidate_2,score_2,schedule_2,candidate_3,score_3,schedule_3,direction,create_date
0,1,830,MTWRF,3899,3,MWF,,,,,,,RT,2022-12-05 04:02:45.629270
1,2,1947,M,1383,4,M,,,,,,,RT,2022-12-05 04:02:45.721157
2,3,6354,TW,9371,4,T,9067,3,T,,,,RT,2022-12-05 04:02:45.808800
3,4,142,WR,17,7,W,9253,-1,W,,,,RT,2022-12-05 04:02:45.865396
4,5,2492,MT,1645,7,T,,,,,,,RT,2022-12-05 04:02:46.005469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,146,4551,F,4464,5,week,,,,,,,RT,2022-12-05 04:02:58.137213
146,147,5275,TR,8473,4,week,208,0,week,8211,5,T,RT,2022-12-05 04:02:58.190143
147,148,8350,TR,,,,,,,,,,RT,2022-12-05 04:02:58.459708
148,149,1334,TR,854,6,TR,2977,5,T,,,,RT,2022-12-05 04:02:58.496205


In [None]:
carpool_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 0 to 149
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   carpool_id       150 non-null    object        
 1   driver_id        150 non-null    object        
 2   driver_schedule  150 non-null    object        
 3   candidate_1      108 non-null    object        
 4   score_1          108 non-null    object        
 5   schedule_1       108 non-null    object        
 6   candidate_2      73 non-null     object        
 7   score_2          73 non-null     object        
 8   schedule_2       73 non-null     object        
 9   candidate_3      43 non-null     object        
 10  score_3          43 non-null     object        
 11  schedule_3       43 non-null     object        
 12  direction        150 non-null    object        
 13  create_date      150 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(13)
memory us

In [None]:
alone_drivers = carpool_df['candidate_1'].isnull().sum()
alone_drivers

42

In [None]:
# Save carpool into csv
# download_as_csv(carpool_df, 'carpool_100.csv')

In [None]:
stud_pool_df

Unnamed: 0,ID,Zip_code,Role,Seats,Gender,Under_25,Undergrad,Smoker,College,Pref_Gender,...,T_dep,W_arr,W_dep,R_arr,R_dep,F_arr,F_dep,S_arr,S_dep,carpools_lst
2280,885,92057,P,0,F,1,1,0,CSTEM,0,...,17:15:00,11:30:00,14:20:00,10:30:00,17:15:00,00:00:00,00:00:00,00:00:00,00:00:00,[12]
2147,2539,92024,P,0,F,1,1,0,CHABSS,0,...,10:15:00,11:30:00,17:15:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,[]
5179,2410,92027,P,0,F,1,1,0,CEHHS,0,...,00:00:00,09:00:00,11:50:00,09:00:00,14:15:00,09:30:00,12:20:00,00:00:00,00:00:00,[]
4979,1084,92113,P,0,F,1,1,0,CEHHS,0,...,17:15:00,00:00:00,00:00:00,07:30:00,20:20:00,00:00:00,00:00:00,00:00:00,00:00:00,[]
3992,8382,92131,P,0,M,1,1,0,COBA,0,...,00:00:00,09:30:00,11:20:00,17:30:00,18:45:00,09:30:00,11:20:00,00:00:00,00:00:00,[]
1072,6844,92116,P,0,F,0,0,0,CEHHS,0,...,16:50:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,[]
1519,4429,92119,P,0,F,0,0,0,CEHHS,0,...,00:00:00,08:00:00,10:50:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,[]
5142,10202,92129,P,0,M,1,1,0,CHABSS,0,...,11:45:00,09:30:00,10:20:00,10:30:00,14:50:00,09:30:00,10:20:00,00:00:00,00:00:00,[]
3909,5728,92058,P,0,F,0,1,0,CHABSS,0,...,00:00:00,11:30:00,14:20:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,00:00:00,"[12, 19, 35]"
5668,8456,92584,P,0,F,1,1,0,CSTEM,0,...,16:50:00,14:30:00,17:15:00,09:00:00,13:50:00,00:00:00,00:00:00,00:00:00,00:00:00,[8]
