In [5]:
import pandas as pd
import numpy as np
from s2sphere import CellId, LatLng
import json

In [7]:
df1 = pd.read_parquet('gojek/demand/201910.parquet')
df2 = pd.read_parquet('gojek/driver_log/201910.parquet')              
mymap_path = 'map/region_map.pdf'
region_names_path = 'map/Regions_With_Names.json'

In [8]:
new_df1=df1[['booking_time','booking_pickup_latitude','booking_pickup_longitude','booking_destination_latitude','booking_destination_longitude','status']]
new_df2 = df2

In [9]:
# group into 15 min intervals
grouped1 = new_df1.groupby(pd.Grouper(key='booking_time', freq='15Min'))
for key, group in grouped1:
    #print(f"Group {key}:")
    #print(group)
    break
    
    
grouped2 = new_df2.groupby(pd.Grouper(key='timestamp', freq='15Min'))
for key, group in grouped2:
    #print(f"Group {key}:")
    break

In [10]:
def is_point_inside_area(point, area_coordinates):
    # check if given point coordinates are within a certain area
    lon, lat = point
    top_left = area_coordinates[0]
    bottom_right = area_coordinates[2]

    if (
        lon >= top_left[0] and
        lon <= bottom_right[0] and
        lat >= top_left[1] and
        lat <= bottom_right[1]
    ):
        return True
    else:
        return False
    
def getTT(region):
    return round((((region-100)//50) * -5.2222 ) + 147.11),round((((region%50)-2) * 5.2083)+20.458 )

def coordinate2region(coordinates, data):
    for d in data:
        for area in d['geometry']['coordinates'][0]:
            if is_point_inside_area([coordinates[1], coordinates[0]], area):
                return d['properties']['id']
            
def sid2coordinates(sid):
    cell = CellId(sid)
    center = cell.to_lat_lng()
    latitude = center.lat().degrees
    longitude = center.lng().degrees
    return (latitude, longitude)

def sid2coordinates_new(sid):
    sid = int(sid)
    cell = CellId(sid)
    center = cell.to_lat_lng()
    latitude = center.lat().degrees
    longitude = center.lng().degrees
    return (latitude, longitude)

def normalize_array(myarray,new_min,new_max):
    # Calculate the scaling factor
    min_value = myarray.min()
    max_value = myarray.max()
    scaling_factor = (new_max - new_min) / (max_value - min_value)

    # Normalize the data to the new range
    normalized_array = (myarray - min_value) * scaling_factor + new_min
    return normalized_array

## IMAGE

In [7]:
# Heatmap dimensions
mydf = new_df1
width, height = 267, 163
region_size = 5

In [None]:
# create/populate/save heatmap
mymap_path = 'map/region_map.pdf'
region_names_path = 'map/Regions_With_Names.json'

with open(region_names_path, 'r') as json_file:
    data = json.load(json_file)
    
    # create and populate heatmap for every 15 mins
    name = 0
    for key, group in grouped1:
        # Create empty arrays for each channel
        red_channel = np.full((height, width),0, dtype=np.uint8)
        green_channel = np.full((height, width),0, dtype=np.uint8)
        blue_channel = np.full((height, width),0, dtype=np.uint8)
        
        for index in range(len(group)):
            
            row = group.iloc[index]

            # populate red
            coordinates = (row['booking_pickup_latitude'], row['booking_pickup_longitude'])
            rid = coordinate2region(coordinates, data)
            if rid is not None:
                t,tt = getTT(int(rid))

                top_left_x = t - region_size // 2
                top_left_y = tt - region_size // 2

                for y in range(top_left_y, top_left_y + region_size):
                    for x in range(top_left_x, top_left_x + region_size):
                        red_channel[x, y] += 1
            else:
                pass
            
            # populate green
            coordinates = (row['booking_destination_latitude'], row['booking_destination_longitude'])
            rid = coordinate2region(coordinates, data)
            if rid is not None:
                t,tt = getTT(int(rid))

                top_left_x = t - region_size // 2
                top_left_y = tt - region_size // 2

                for y in range(top_left_y, top_left_y + region_size):
                    for x in range(top_left_x, top_left_x + region_size):
                        green_channel[x, y] += 1
            else:
                pass
        
        blue_group = grouped2.get_group(key)
        blue_group = blue_group[blue_group['driver_status'] == 1]
        
        for index in range(len(blue_group)):
            row = blue_group.iloc[index]
            
            # populate blue
            coordinates = sid2coordinates(int(row['s2id']))
            rid = coordinate2region(coordinates, data)
            if rid is not None:
                t,tt = getTT(int(rid))

                top_left_x = t - region_size // 2
                top_left_y = tt - region_size // 2

                for y in range(top_left_y, top_left_y + region_size):
                    for x in range(top_left_x, top_left_x + region_size):
                        blue_channel[x, y] += 1
            else:
                pass
        
        # combine channels and save heatmap
        # Create a blank RGB image
        heatmap = np.zeros((height, width, 3), dtype=np.uint8)

        # Assign the red, green, and blue channels
        red_channel = normalize_array(red_channel,8,255)
        green_channel = normalize_array(green_channel,48,255)
        blue_channel = normalize_array(blue_channel,107,255)
        heatmap[:, :, 0] = red_channel
        heatmap[:, :, 1] = green_channel
        heatmap[:, :, 2] = blue_channel

        # Convert the NumPy array to a PIL Image
        heatmap_image = Image.fromarray(heatmap)
        name+=1
        heatmap_image.save(f"myfiles/images/{str(name)}.png")
        
        if name % 20 == 0:
            print(name/2920)
        #break # remove this break. for testing only.

## CSV

In [8]:
select_regions = [429,430] # select your regions

In [9]:
mymap_path = 'map/region_map.pdf'
region_names_path = 'map/Regions_With_Names.json'

In [10]:
def related_regions(rid):
    # returns a list of regions surrounding the input region. Excludes input region
    related_list = str(set([rid-1,rid+1,rid-50,rid+50,rid-49,rid+49,rid-51,rid+51]))
    related = []
    with open(f'{folder}/Regions_With_Names.json', 'r') as json_file:
        data = json.load(json_file)
        for d in data:
            currentid = d['properties']['id']
            if currentid in related_list:
                related.append(currentid)
    return related

In [11]:
new_df2 = new_df2[new_df2['driver_status'] == 1]
grouped2 = new_df2.groupby(pd.Grouper(key='timestamp', freq='15Min'))
for key, group in grouped2:
    print(f"Group {key}:")
    print(group)
    break
print(len(grouped2))

Group 2019-10-01 00:00:00:
       driver_id           timestamp          s2id  driver_status
11     700653815 2019-10-01 00:00:00  3.592208e+18              1
16     700429849 2019-10-01 00:00:00  3.592202e+18              1
19     700276871 2019-10-01 00:00:00  3.592251e+18              1
25     700136282 2019-10-01 00:00:00  3.592208e+18              1
29     700261045 2019-10-01 00:00:00  3.592203e+18              1
...          ...                 ...           ...            ...
38585  700121418 2019-10-01 00:14:00  3.592251e+18              1
38586  700285980 2019-10-01 00:14:00  3.592206e+18              1
38590  700383682 2019-10-01 00:14:00  3.592213e+18              1
38594  700316017 2019-10-01 00:14:00  3.592202e+18              1
38595  700358707 2019-10-01 00:14:00  3.592212e+18              1

[9931 rows x 4 columns]
2976


In [6]:
mymap_path = 'map/region_map.pdf'
region_names_path = 'map/Regions_With_Names.json'

with open(region_names_path, 'r') as json_file:
    data = json.load(json_file)
    for region in select_regions:
        print(f'{region} start')
        count = 0

        related = related_regions(region)
        col_names = [str(region),str(region)+"_end","weekday","dayofweek","hour","minute","day","holiday"
            ,"freetaxi",related[0],related[1],related[2],related[3],related[4],related[5],related[6],related[7]
            ,related[0]+"_end",related[1]+"_end",related[2]+"_end",related[3]+"_end"
             ,related[4]+"_end",related[5]+"_end",related[6]+"_end",related[7]+"_end"]
        processed_df = pd.DataFrame(None, columns=col_names)
        
        for key, group in grouped1:
            temp_list = [0] * len(processed_df.columns) # input all data into this list, then append to processed_df
            for index, row in group.iterrows():
                pickup_coordinates = (row['booking_pickup_latitude'], row['booking_pickup_longitude'])
                pickup_rid = coordinate2region(pickup_coordinates, data)
                if pickup_rid is not None:
                    if int(pickup_rid) == region:
                        temp_list[0] += 1
                    elif pickup_rid in related:
                        temp_index = related.index(pickup_rid)
                        temp_list[temp_index+9] += 1

                end_coordinates = (row['booking_destination_latitude'], row['booking_destination_longitude'])
                end_rid = coordinate2region(end_coordinates, data)
                if end_rid is not None:
                    if int(end_rid) == region:
                        temp_list[1] += 1
                    elif end_rid in related:
                        temp_index = related.index(end_rid)
                        temp_list[temp_index+9+8] += 1

            blue_group = grouped2.get_group(key)
            numfreetaxis = 0
            freetaxi_coordinates = blue_group['s2id'].apply(sid2coordinates_new)
            for lat, lon in freetaxi_coordinates:
                freetaxi_rid = coordinate2region((lat,lon), data)
                if freetaxi_rid is not None:
                    if int(freetaxi_rid) == region:
                        numfreetaxis+=1
            temp_list[8] = numfreetaxis

            if key.dayofweek in [5,6]:
                temp_list[2] = 0
            elif key.dayofweek in [0,1,2,3,4]:
                temp_list[2] = 1
            temp_list[3] = key.dayofweek
            temp_list[4] = key.hour
            temp_list[5] = key.minute
            temp_list[6] = key.day
            # holiday
            if key.day in [27,28]: # insert holiday dates for the month here
                temp_list[7] = 1
            else:
                temp_list[7] = 0
            processed_df.loc[key] = temp_list
            count += 1
            if count % 50 == 0:
                print(count/2976)
        print(f'{region} done')
        processed_df.to_csv(f'myfiles/csv/{region}.csv')

NameError: name 'select_regions' is not defined

## Micro

In [11]:
# RUN THIS CELL ONLY IF '{region}_locations.json' has not yet been created


mymap_path = f'map/region_map.pdf'
region_names_path = f'map/Regions_With_Names.json'
#select_regions = ['428', '610', '929']
select_regions = ['610']

def sid2coordinates(sid):
    sid = int(sid)  # Convert to integer
    cell = CellId(sid)
    center = cell.to_lat_lng()
    latitude = center.lat().degrees
    longitude = center.lng().degrees
    return (latitude, longitude)

with open(region_names_path, 'r') as json_file:
    data = json.load(json_file)

for region in select_regions:
    print(f'{region} start')
    count = 0
    myset = set()

    for _, group in grouped2:
        group = group.dropna(subset=['s2id'])
        group['coords'] = group['s2id'].apply(sid2coordinates_new)
        for _, row in group.iterrows():
            rid = coordinate2region(row['coords'], data)
            if rid != None:
                if rid == region:
                    myset.add(row['s2id'])
        
        count += 1
        if count % 50 == 0:
            print(len(myset))
            print(count / len(grouped2))

            # Convert set to a list
            mylist = list(myset)

            # Specify the file path
            output_file_path = f"locations/{region}_locations.json"

            # Write the list to the file in JSON format
            with open(output_file_path, 'w') as file:
                json.dump(mylist, file)

            print(f"Set contents saved to {output_file_path}")
            break

610 start


KeyboardInterrupt: 

In [12]:
mymap_path = f'map/region_map.pdf'
region_names_path = f'map/Regions_With_Names.json'
select_regions = ['428', '610', '929']
with open("locations/428_locations.json", 'r') as json_file:
    data_428 = json.load(json_file) 
with open("locations/610_locations.json", 'r') as json_file:
    data_610 = json.load(json_file) 
with open("locations/929_locations.json", 'r') as json_file:
    data_929 = json.load(json_file) 

mylocations = {select_regions[0]: data_428, select_regions[1]: data_610, select_regions[2]: data_929}

In [None]:
with open(region_names_path, 'r') as json_file:
    data = json.load(json_file) 
    
for region in select_regions:
    print(f'{region} start')
    count = 0
    
    col_names = mylocations[region]
    processed_df = pd.DataFrame(None, columns=col_names)
    temp_list = [-15] * len(processed_df.columns) # add data to temp_list then append to processed df
    
    for key, group in grouped2:
        temp_list = [x + 15 for x in temp_list]
        group = group.dropna(subset=['s2id'])
        group['coords'] = group['s2id'].apply(sid2coordinates_new)
        for _, row in group.iterrows():
            rid = coordinate2region(row['coords'], data)
            if rid != None:
                if rid == region:
                    if row['s2id'] in mylocations[region]:
                        myindex = mylocations[region].index(row['s2id']) # get index to append to temp_list
                        if row['driver_status'] == 1: # vacant
                            temp_list[myindex] = 0 # time elapsed since last 
        processed_df.loc[key] = temp_list
        count += 1
        if count % 50 == 0:
            print(count / len(grouped2))
            
    processed_df.to_csv(f'micro/{region}.csv')