### Data Exploration ###

Datasets: Charlottesville City, Albemarle County


In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random

In [3]:
# Charlottesville City activity table; preview the first rows to check the schema.
filename_cville = "usa_va_charlottesville_city_adult_activity_location_assignment_week.csv"
df_cville = pd.read_csv(filepath_or_buffer=filename_cville)
df_cville.head(n=5)

Unnamed: 0,hid,pid,activity_number,activity_type,start_time,duration,lid,longitude,latitude,travel_mode
0,2208253,5586585,0,1,0,27900,1018209,-78.488467,38.043025,-1
1,2208253,5586585,2,2,28800,17100,82246,-78.41492,38.15753,-1
2,2208253,5586585,4,4,46800,1200,86726,-78.49879,38.13916,-1
3,2208253,5586585,6,2,48600,15300,82246,-78.41492,38.15753,-1
4,2208253,5586585,8,1,64800,36000,1018209,-78.488467,38.043025,-1


In [4]:
# Albemarle County activity table; preview the first rows to check the schema.
filename_albe = "usa_va_albemarle_adult_activity_location_assignment_week.csv"
df_albe = pd.read_csv(filepath_or_buffer=filename_albe)
df_albe.head(n=5)

Unnamed: 0,hid,pid,activity_number,activity_type,start_time,duration,lid,longitude,latitude,travel_mode
0,13480,30923,0,1,0,25200,1018178,-78.435006,38.0923,-1
1,13480,30923,2,2,27000,7680,117201,-78.44051,38.02277,-1
2,13480,30923,4,2,37200,6000,117201,-78.44051,38.02277,-1
3,13480,30923,6,4,44700,1800,118560,-78.51548,38.06175,-1
4,13480,30923,8,2,47400,17400,117201,-78.44051,38.02277,-1


### Pipeline

In [2]:
def read_file(county_name: str):
    """Load the activity/location assignment CSV for one supported county.

    Args:
        county_name: Either "albemarle" or "charlottesville_city"; it is
            substituted into the CSV file name.

    Returns:
        A DataFrame with the contents of
        ``usa_va_{county_name}_adult_activity_location_assignment_week.csv``,
        resolved relative to the current working directory.

    Raises:
        ValueError: If ``county_name`` is not one of the supported counties.
    """
    valid_counties = ("albemarle", "charlottesville_city")
    if county_name not in valid_counties:
        # Fail loudly here: returning None would only surface later as an
        # unrelated error on the first use of the "frame".
        raise ValueError(
            f"Invalid County Name: {county_name!r} (expected one of {valid_counties})"
        )

    filename = f"usa_va_{county_name}_adult_activity_location_assignment_week.csv"
    return pd.read_csv(filename)

In [15]:
# Load Charlottesville through the pipeline reader and inspect dtypes / null counts.
df_full = read_file(county_name="charlottesville_city")
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103750 entries, 0 to 1103749
Data columns (total 10 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   hid              1103750 non-null  int64  
 1   pid              1103750 non-null  int64  
 2   activity_number  1103750 non-null  int64  
 3   activity_type    1103750 non-null  int64  
 4   start_time       1103750 non-null  int64  
 5   duration         1103750 non-null  int64  
 6   lid              1103750 non-null  int64  
 7   longitude        1103750 non-null  float64
 8   latitude         1103750 non-null  float64
 9   travel_mode      1103750 non-null  int64  
dtypes: float64(2), int64(8)
memory usage: 84.2 MB


In [3]:
def clean_data(df_day: pd.DataFrame):
    """Return a copy of the activity table with home-activity ``lid`` values shifted.

    Rows whose ``activity_type`` equals 1 get ``HOME_SHIFT`` added to their
    ``lid``. Presumably this gives a location used as a home an id distinct
    from the same location used for other activities (assumption based on the
    naming here and in the caller; confirm against the dataset documentation).

    Args:
        df_day: Activity table with at least ``activity_type`` and ``lid``
            columns. It is not modified.

    Returns:
        A new DataFrame with the same rows and columns as ``df_day`` and the
        shifted ``lid`` values.
    """
    HOME_SHIFT = 1000000000

    # Work on a copy so re-running a cell on the same frame cannot shift twice.
    df_shifted = df_day.copy()

    # A boolean mask touches exactly the matching rows, even if index labels repeat.
    is_home = df_shifted["activity_type"] == 1
    df_shifted.loc[is_home, "lid"] += HOME_SHIFT

    return df_shifted

In [14]:
def pid_hour_breakdown(df: pd.DataFrame, day: int, start_hour: int, end_hour: int):
    """List, per person and per hour slot, the activities covering that whole hour.

    Times are treated as seconds, with day ``day`` starting at
    ``day * 24 * 3600``. Only activities overlapping the window
    ``[start_hour, end_hour)`` of that day are considered.

    Args:
        df: Activity table with ``pid``, ``lid``, ``start_time`` and
            ``duration`` columns. It is not modified.
        day: Zero-based day offset used to locate the window.
        start_hour: First hour of the window (inclusive).
        end_hour: End hour of the window (exclusive).

    Returns:
        A DataFrame indexed by ``pid`` with one column, ``combined_loc``.
        Each cell is a dict mapping every hour ``start_hour .. end_hour - 1``
        to a list of ``(lid, (start_time, end_time))`` tuples for the
        activities that start at or before the hour begins and end at or
        after it finishes. Only pids with at least one activity overlapping
        the window appear.
    """
    HR = 3600

    day_offset = day * 24 * HR
    hour_start = day_offset + start_hour * HR
    hour_end = day_offset + end_hour * HR

    # Keep end_time local rather than writing a new column into the caller's frame.
    end_time = df["start_time"] + df["duration"]
    in_window = (df["start_time"] < hour_end) & (end_time > hour_start)

    df_window = df.loc[in_window, ["pid", "lid", "start_time"]].copy()
    df_window["end_time"] = end_time[in_window].to_numpy()

    # Build the tuples column-wise: row-wise apply creates one Series per row
    # and upcasts integer times to float when the frame also has float columns.
    df_window["combined_loc"] = [
        (int(lid), (start, end))
        for lid, start, end in zip(
            df_window["lid"], df_window["start_time"], df_window["end_time"]
        )
    ]

    def split_by_hour(visits):
        # An activity counts for a slot only if it spans the full hour.
        return {
            (slot - day_offset) // HR: [
                visit for visit in visits
                if visit[1][0] <= slot and visit[1][1] >= slot + HR
            ]
            for slot in range(hour_start, hour_end, HR)
        }

    # Might want to change the format of combined_locations
    df_pid = df_window.groupby("pid")["combined_loc"].apply(list).apply(split_by_hour)

    return df_pid.to_frame()

In [5]:
def get_location_directions(df: pd.DataFrame):
    """Map each ``lid`` to the mean of its recorded coordinates.

    Returns a dict of the form ``{lid: {"longitude": ..., "latitude": ...}}``,
    where each value is the average over all rows sharing that ``lid``.
    """
    mean_coords = df.groupby("lid")[["longitude", "latitude"]].mean()
    return mean_coords.to_dict(orient="index")

def find_potential_facilities(df: pd.DataFrame):
    """Return the set of ``lid`` values seen on rows whose ``activity_type`` is not 1."""
    not_home = df["activity_type"].ne(1)
    facility_lids = df.loc[not_home, "lid"]
    return set(facility_lids)

In [6]:
def random_filter(df: pd.DataFrame, random_state=42):
    """Pick one random ``(hour, lid)`` pair per person.

    Args:
        df: Frame with a ``combined_loc`` column whose cells are dicts mapping
            an hour to a list of ``(lid, ...)`` tuples; the index labels become
            the keys of the result. It is not modified.
        random_state: Seed for the private random generator, so the same
            input and seed give the same assignment. ``None`` seeds from the
            operating system and is therefore not reproducible.

    Returns:
        A dict mapping each index label (``pid``) to one ``(hour, lid)`` tuple
        drawn uniformly from that person's candidates. People with no
        candidate in any hour are omitted.
    """
    # A dedicated generator makes the seed effective without touching the
    # global `random` state.
    rng = random.Random(random_state)

    pid_assignment = {}
    for pid, hourly_visits in df["combined_loc"].items():
        candidates = [
            (hour, visit[0])
            for hour, visits in hourly_visits.items()
            for visit in visits
        ]
        if candidates:
            pid_assignment[pid] = rng.choice(candidates)

    return pid_assignment

In [16]:
def get_data(county_name: str, day:int, start_hour: int, end_hour: int):
    """Run the full pipeline for one county and one time window.

    Reads and cleans the county's activity table, then returns a tuple of
    ``(potential_facilities, location_directory, pid_assignment)`` as
    produced by ``find_potential_facilities``, ``get_location_directions``
    and ``random_filter`` respectively.
    """
    activities = clean_data(read_file(county_name))

    # The facility set and the location directory are kept apart on purpose:
    # some activity locations are recorded as home visitations, and those
    # are not potential facility locations.
    facilities = find_potential_facilities(activities)
    directory = get_location_directions(activities)

    hourly_by_pid = pid_hour_breakdown(activities, day, start_hour, end_hour)
    assignment = random_filter(hourly_by_pid)

    return facilities, directory, assignment

# Run the pipeline for Charlottesville: day index 5, hours 6 through 20.
potential_facilities, location_directory, pid_assignment = get_data(
    county_name="charlottesville_city", day=5, start_hour=6, end_hour=20
)

# Sanity check: sizes of the three outputs.
print(*map(len, (potential_facilities, location_directory, pid_assignment)))

9728 19723 33156
