### Data Exploration ###

Datasets: Charlottesville City, Albemarle County


In [18]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random

In [19]:
filename_cville = "usa_va_charlottesville_city_adult_activity_location_assignment_week.csv"
df_cville = pd.read_csv(filename_cville)
df_cville.head()

Unnamed: 0,hid,pid,activity_number,activity_type,start_time,duration,lid,longitude,latitude,travel_mode
0,2208253,5586585,0,1,0,27900,1018209,-78.488467,38.043025,-1
1,2208253,5586585,2,2,28800,17100,82246,-78.41492,38.15753,-1
2,2208253,5586585,4,4,46800,1200,86726,-78.49879,38.13916,-1
3,2208253,5586585,6,2,48600,15300,82246,-78.41492,38.15753,-1
4,2208253,5586585,8,1,64800,36000,1018209,-78.488467,38.043025,-1


In [20]:
filename_albe = "usa_va_albemarle_adult_activity_location_assignment_week.csv"
df_albe = pd.read_csv(filename_albe)
df_albe.head()

Unnamed: 0,hid,pid,activity_number,activity_type,start_time,duration,lid,longitude,latitude,travel_mode
0,13480,30923,0,1,0,25200,1018178,-78.435006,38.0923,-1
1,13480,30923,2,2,27000,7680,117201,-78.44051,38.02277,-1
2,13480,30923,4,2,37200,6000,117201,-78.44051,38.02277,-1
3,13480,30923,6,4,44700,1800,118560,-78.51548,38.06175,-1
4,13480,30923,8,2,47400,17400,117201,-78.44051,38.02277,-1


### Pipeline

In [21]:
def read_file(county_name: str):
    if county_name!= "albemarle" and county_name!="charlottesville_city":
        print("Invalid County Name")
    else:
        filename = f"usa_va_{county_name}_adult_activity_location_assignment_week.csv"
        df = pd.read_csv(filename)
        return df

In [15]:
df_full = read_file("charlottesville_city")
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103750 entries, 0 to 1103749
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   hid              1103750 non-null  int64  
 1   pid              1103750 non-null  int64  
 2   activity_number  1103750 non-null  int64  
 3   activity_type    1103750 non-null  int64  
 4   start_time       1103750 non-null  int64  
 5   duration         1103750 non-null  int64  
 6   lid              1103750 non-null  int64  
 7   longitude        1103750 non-null  float64
 8   latitude         1103750 non-null  float64
 9   travel_mode      1103750 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 84.2 MB


In [22]:
def clean_data(df_day: pd.DataFrame):

    #drop meaningless columns
    
    HOME_SHIFT=1000000000

    home_activities = df_day[df_day["activity_type"]==1]["lid"]
    df_day.loc[home_activities.index, "lid"] += HOME_SHIFT
    
    return df_day

In [23]:
def pid_hour_breakdown(df: pd.DataFrame, day: int, start_hour: int, end_hour: int):
    
    HR = 3600
    
    hour_start = day*24*HR + start_hour*HR
    hour_end = day*24*HR + end_hour*HR
    
    df["end_time"] = df["start_time"]+df["duration"]
    
    df = df[(df["start_time"]<hour_end) & (df["end_time"]>hour_start)].copy()
    
    df["combined_loc"] = df.apply(lambda x: (int(x["lid"]), (x["start_time"], x["end_time"])), axis=1)
    
    # Might want to change the format of combined_locations
    df_pid = (df.groupby("pid")["combined_loc"].apply(list)).apply(
        lambda x: {(i-day*24*HR)//HR: [loc for loc in x if (loc[1][0]<=i and loc[1][1]>=i+HR)] for i in range(hour_start, hour_end, HR)})
    
    return df_pid.to_frame()

In [24]:
def get_location_directions(df: pd.DataFrame):
    return df[["lid","longitude", "latitude"]].groupby("lid").mean().to_dict('index')

def find_potential_facilities(df: pd.DataFrame):
    return set(df[df["activity_type"]!=1].lid)

In [25]:
def random_filter_location(df:pd.DataFrame, random_state=42):
    
    df["condensed"] = df["combined_loc"].apply(lambda x: [(h, l[0]) for h, loc in x.items() for l in loc])
    df["condensed_len"] = df["condensed"].apply(lambda x: len(x))

    # remove clients without any location-hour assignments
    df = df.drop(list(df[df["condensed_len"]==0].index))

    #df_selected = df["condensed"].apply(lambda x: x[random.randint(0, len(x)-1)])

    df["selected"] = df["condensed"].apply(lambda x: x[random.randint(0, len(x)-1)]).to_frame()
    df["hr"] = df["selected"].apply(lambda x: x[0])

    df["pid"] = df.index

    df["pid_loc"] = df[["pid","selected"]].apply(lambda x: (x["pid"], x["selected"][1]), axis = 1)

    df_selected = df.groupby("hr")["pid_loc"].apply(list)

    return df_selected.to_dict({})

In [42]:
def random_filter_spread(df:pd.DataFrame, spread = 7, random_state=42):
    
    random.seed(random_state)
    
    drop_pids = set(pid for pid in df.pid if random.randint(1, spread) == 1)
    print(len(drop_pids))
    
    df_sparse = df.drop(df[df["pid"].isin(drop_pids)].index, axis = 0)
    
    print(df_sparse)
    
    return df_sparse

In [40]:
def get_data(county_name: str, day:int, start_hour: int, end_hour: int):

    df_full = read_file(county_name)
    df_clean = clean_data(df_full)

    # Must be separate since some of the activity locations are recorded as home visitations (but are not potential facility locations)
    potential_facilities = find_potential_facilities(df_clean)
    location_directory = get_location_directions(df_clean)

    df_pid = pid_hour_breakdown(df_clean, day, start_hour, end_hour)
    pid_assignment = random_filter(df_pid)

    return potential_facilities, location_directory, pid_assignment

potential_facilities, location_directory, pid_assignment = get_data("charlottesville_city", 5, 6, 20)

print(len(potential_facilities), len(location_directory), len(pid_assignment))

NameError: name 'random_filter' is not defined

In [44]:
county_name = "charlottesville_city"
day = 5
start_hour = 6
end_hour = 20

df_full = read_file(county_name)
df_clean = clean_data(df_full)

potential_facilities = find_potential_facilities(df_clean)
location_directory = get_location_directions(df_clean)

df_sparse = random_filter_spread(df_clean)
df_pid = pid_hour_breakdown(df_sparse, day, start_hour, end_hour)
pid_assignment = random_filter_location(df_pid)


32682
             hid      pid  activity_number  activity_type  start_time  \
2159     2208287  5586663                0              1           0   
2160     2208287  5586663                2              4       43500   
2161     2208287  5586663                4              1       50700   
2162     2208287  5586663                5              1      100800   
2163     2208287  5586663                7              4      158400   
...          ...      ...              ...            ...         ...   
1103660  2226626  5627699               29              1      446400   
1103661  2226626  5627699               31              3      472500   
1103662  2226626  5627699               33              3      475260   
1103663  2226626  5627699               35              1      477480   
1103664  2226626  5627699               36              1      532800   

         duration         lid  longitude   latitude  travel_mode  
2159        43200  1001308403 -78.482770  38.04445

In [46]:
print({key: len(val) for key, val in pid_assignment.items()})

{6: 40, 7: 33, 8: 36, 9: 35, 10: 45, 11: 30, 12: 28, 13: 31, 14: 30, 15: 28, 16: 30, 17: 36, 18: 29, 19: 43}


## Algorithm

In [15]:
import geopy.distance
from tqdm import tqdm

def calculate_reachable(potential_facilities, location_directory, m: float):
    
    potential_facilities_list = list(potential_facilities)
    reachable_dict = {}
        
    for p in tqdm(range(len(potential_facilities_list))):
        for q in range(p, len(potential_facilities_list)):
            lid1, lid2 = potential_facilities_list[p], potential_facilities_list[q]
            coord1 = (location_directory[lid1]["latitude"], location_directions[lid1]["longitude"])
            coord2 = (location_directory[lid2]["latitude"], location_directions[lid2]["longitude"])
            dist = geopy.distance.great_circle(coord1, coord2).km
            
            if dist <= m:
                if lid1 not in reachable_dict.keys():
                    reachable_dict[lid1] = [lid2]
                else:
                    reachable_dict[lid1].append(lid2)
                
                if lid2 not in reachable_dict.keys():
                    reachable_dict[lid2] = [lid1]
                else:
                    reachable_dict[lid2].append(lid1)
    
    return reachable_dict

In [20]:
def get_data(county_name: str, day:int, start_hour: int, end_hour: int):

    df_full = read_file(county_name)
    df_clean = clean_data(df_full)

    # Must be separate since some of the activity locations are recorded as home visitations (but are not potential facility locations)
    potential_facilities = find_potential_facilities(df_clean)
    location_directory = get_location_directions(df_clean)

    df_pid = pid_hour_breakdown(df_clean, day, start_hour, end_hour)
    pid_assignment = random_filter(df_pid)

    return potential_facilities, location_directory, pid_assignment

potential_facilities, location_directory, pid_assignment = get_data("charlottesville_city", 5, 6, 20)

print(len(potential_facilities), len(location_directory), len(pid_assignment))

9728 19723 14


In [16]:
import time
start = time.time()
reachable_dict = calculate_reachable(potential_facilities, location_directory, 8)
end = time.time()

print(end-start)

100%|██████████████████████████████████████████████████████████████████████████████| 9728/9728 [28:09<00:00,  5.76it/s]

1689.5902636051178





In [17]:
print(len(reachable_dict.keys()))

9728


In [18]:
print(max([len(val) for val in reachable_dict.values()]))

4011


In [22]:
import json

with open("reachable.json", "w") as f:
    json.dump(reachable_dict, f)