In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import glob
import csv
import pandas as pd
import re

In [310]:
def extract_geodata(data,colname,new_colname_longitude,new_colname_latitude):
    """
    Parse the coordinate strings of one geometry column into per-row lists of floats.

    The same routine serves both the origin and the destination geometry, so the
    column to read and the two columns to create are passed in by name.

    data: the Uber Movement DataFrame; it is modified in place (two columns are added)
    colname: name of the column holding the geometry strings,
             e.g. 'Origin Geometry' / 'Destination Geometry'
    new_colname_longitude: name of the new column that receives each row's list of longitudes
    new_colname_latitude: name of the new column that receives each row's list of latitudes

    Returns the very same DataFrame object that was passed in.
    """
    all_longitudes = [] # One list of longitudes per row
    all_latitudes = [] # One list of latitudes per row

    for cell in data[colname]:
        # Break the cell apart at every '|', '[' or ','
        tokens = re.split(r'[|[|,|]',cell)
        # The first two tokens carry no coordinate. From index 2 onwards the tokens
        # repeat in groups of three: longitude, latitude (still carrying its closing
        # ']' characters), and a filler token. The final group has no filler.
        longitude_tokens = tokens[2::3]
        latitude_tokens = tokens[3::3]

        row_longitudes = []
        row_latitudes = []
        # zip stops at the last complete (longitude, latitude) pair
        for longitude_token, latitude_token in zip(longitude_tokens, latitude_tokens):
            row_longitudes.append(float(longitude_token))
            # Keep only the text before the first ']'
            row_latitudes.append(float(latitude_token.split(']')[0]))

        all_longitudes.append(row_longitudes)
        all_latitudes.append(row_latitudes)

    # Attach the parsed coordinates as two new columns
    data[new_colname_longitude] = all_longitudes
    data[new_colname_latitude] = all_latitudes

    # Example: the first longitude of the first row is data[new_colname_longitude][0][0]
    return data

def extract_time_data(data, colname=None):
    """
    Split the comma-separated time description of every row into three new columns.

    data: the Uber Movement DataFrame; it is modified in place
    colname: name of the column holding the 'month, weekday, time' string.
             When omitted, the 7th column of `data` (position 6) is used. This keeps the
             positional convention of the earlier version, which relied on a notebook
             global `colnames` that is not defined in any visible cell.
             NOTE(review): presumably this is the Uber 'Date Range' column - confirm.

    The first three comma-separated pieces of each cell are stored, untrimmed, in the
    new columns 'Months', 'Weekdays' and 'Times'.

    Returns the same DataFrame object that was passed in.
    Raises IndexError if a cell has fewer than three comma-separated pieces.
    """
    if colname is None:
        # Resolve the column from the frame itself instead of from hidden kernel state
        colname = data.columns[6]

    months = []
    weekdays = []
    times = []
    for line in data[colname]:
        pieces = line.split(',')
        months.append(pieces[0])
        weekdays.append(pieces[1])
        times.append(pieces[2])

    data['Months'] = months
    data['Weekdays'] = weekdays
    data['Times'] = times
    return data

def measure_geo_distance(pt1_longitude,pt1_latitude,pt2_longtide,pt2_latitude):
    """
    Return the straight-line (Euclidean) distance between two points.

    The result is expressed in the same units as the inputs; no geodesic
    correction is applied.
    """
    delta_longitude = pt1_longitude - pt2_longtide
    delta_latitude = pt1_latitude - pt2_latitude
    squared_sum = delta_longitude**2 + delta_latitude**2
    return squared_sum**0.5

def find_centroid(data_uber, data_regions):
    """
    Find the region centroid that is, on average, closest to the origin zone of a dataset.

    data_uber: Uber Movement DataFrame that already carries the list-valued columns
               'Origin Longitude' and 'Origin Latitude'
    data_regions: DataFrame with one centroid per row in the columns 'Longitude' and 'Latitude'

    Returns (min_distance, closest_centroid):
        min_distance: the smallest average distance between the origin coordinates and a centroid
        closest_centroid: the 0-based row position (from np.argmin) of that centroid
                          in data_regions

    Raises ValueError if data_regions has no rows, IndexError if data_uber has no rows,
    and ZeroDivisionError if the origin has no coordinates.
    """
    # The origin is the same in every row, so the first row is enough. Use positional
    # access: label-based [1] picked the second row, failed on single-row frames and
    # returned several rows when the index held duplicate labels.
    origin_longitudes = data_uber['Origin Longitude'].iloc[0]
    origin_latitudes = data_uber['Origin Latitude'].iloc[0]

    # Average distance from all origin coordinates to each centroid, in row order
    avg_distance_origins_centroids = []
    for region_longitude, region_latitude in zip(data_regions["Longitude"], data_regions["Latitude"]):
        distances = [
            measure_geo_distance(longitude, latitude, region_longitude, region_latitude)
            for longitude, latitude in zip(origin_longitudes, origin_latitudes)
        ]
        avg_distance_origins_centroids.append(sum(distances)/len(distances))

    # The smallest average distance and the row position of the centroid it belongs to
    min_distance = min(avg_distance_origins_centroids)
    closest_centroid = np.argmin(avg_distance_origins_centroids)
    return min_distance,closest_centroid

def find_dest_zones_in_region(data_uber,data_regions,closest_centroid,threshold = 1):
    """
    Find the destination zones (rows) whose coordinates fall inside the selected region.

    data_uber: one Uber Movement DataFrame that already carries the list-valued columns
               'Destination Longitude' and 'Destination Latitude'
    data_regions: DataFrame of centroids with the columns 'Latitude', 'Longitude' and 'Radius'
    closest_centroid: 0-based row position of the selected centroid in data_regions
    threshold: minimum fraction of a zone's coordinates that must lie within the region
               radius for the zone to be selected (the default 1 requires all of them)

    Returns the list of 0-based row positions in data_uber of the selected zones.
    Rows without any destination coordinate are never selected.
    """
    # Geodata of the selected centroid
    region_latitude = data_regions["Latitude"].iloc[closest_centroid]
    region_longitude = data_regions["Longitude"].iloc[closest_centroid]
    region_radius = data_regions["Radius"].iloc[closest_centroid]

    in_region_destination_list = []

    # Walk the rows by position. Label lookup with a running counter broke on frames
    # whose index is not 0..n-1, such as several datasets concatenated together.
    rows = zip(data_uber['Destination Longitude'], data_uber['Destination Latitude'])
    for i, (dest_longitudes, dest_latitudes) in enumerate(rows):
        num_coords = len(dest_longitudes)
        if num_coords == 0:
            # No coordinates: the in-region fraction is undefined, so skip the zone
            continue

        # Count the coordinates that lie within the region radius
        in_region_count = sum(
            1
            for longitude, latitude in zip(dest_longitudes, dest_latitudes)
            if measure_geo_distance(longitude, latitude, region_longitude, region_latitude) <= region_radius
        )

        # Keep the zone when a large enough share of its coordinates is inside
        if in_region_count/num_coords >= threshold:
            in_region_destination_list.append(i)
    return in_region_destination_list

def compute_variability(data,destination_list):
    """
    Compute the trip-duration variability, mean / (upper bound - lower bound), per destination.

    data: an Uber Movement DataFrame with the columns 'Mean Travel Time (Seconds)',
          'Range - Lower Bound Travel Time (Seconds)' and
          'Range - Upper Bound Travel Time (Seconds)'
    destination_list: 0-based row positions of the destinations that fall into the
                      target region

    Returns a list with one variability value per entry of destination_list, in the same order.
    """
    means = data['Mean Travel Time (Seconds)']
    lower_bounds = data['Range - Lower Bound Travel Time (Seconds)']
    upper_bounds = data['Range - Upper Bound Travel Time (Seconds)']

    # Iterate over the argument; the earlier version read a notebook global named
    # in_region_destination_list and silently ignored destination_list.
    return [
        means.iloc[i] / (upper_bounds.iloc[i] - lower_bounds.iloc[i])
        for i in destination_list
    ]

def find_avg_variability(data,destination_list):
    """
    Average the variabilities of the trips to the destinations of interest.

    data: an Uber Movement DataFrame, passed through to compute_variability
    destination_list: indices of the destinations that fall into the target region

    Returns the arithmetic mean of the values produced by compute_variability, or
    NaN when destination_list selects no destination (previously a ZeroDivisionError).
    """
    variability_list = compute_variability(data,destination_list)
    if len(variability_list) == 0:
        # Nothing to average: report "not a number" instead of dividing by zero
        return float('nan')
    return (sum(variability_list)/len(variability_list))

In [327]:
def process_pipeline(data_uber, data_region):
    """
    Run the full extraction pipeline on one Uber dataset and summarise it.

    data_uber: one Uber Movement DataFrame with the 'Origin Geometry' and
               'Destination Geometry' columns; the extraction helpers add their
               columns to this frame in place
    data_region: DataFrame of region centroids with the columns 'Longitude',
                 'Latitude' and 'Radius'

    Returns (Weekday, Time, centroid_longitude, centroid_latitude,
             num_destinations_in_region, avg_var).
    """
    # Extract the origin coordinates
    data_after_extraction = extract_geodata(data_uber,'Origin Geometry','Origin Longitude','Origin Latitude')
    # Extract the destination coordinates
    data_after_extraction = extract_geodata(data_after_extraction,'Destination Geometry','Destination Longitude','Destination Latitude')
    # Extract the time information
    data_after_extraction = extract_time_data(data_after_extraction)

    # Find the centroid closest to the origin zone. Use the data_region argument
    # throughout; the earlier version read a notebook global named data_regions and
    # ignored the parameter.
    min_distance, closest_centroid = find_centroid(data_after_extraction, data_region)
    # closest_centroid is a 0-based row position, hence .iloc
    centroid_longitude = data_region["Longitude"].iloc[closest_centroid]
    centroid_latitude = data_region["Latitude"].iloc[closest_centroid]
    # min_distance is the average distance from the origin coordinates to that centroid;
    # warn when it exceeds the region radius
    if min_distance > data_region["Radius"].iloc[closest_centroid]:
        print("The origin is outside of the selected region!")

    # Find the indices and the number of destinations in the selected region
    in_region_destination_list = find_dest_zones_in_region(data_after_extraction, data_region, closest_centroid)
    num_destinations_in_region = len(in_region_destination_list)

    # Average variability of the trips to the destinations of interest
    avg_var = find_avg_variability(data_after_extraction, in_region_destination_list)

    # Time information with all spaces removed. Read the first row by position, which
    # also works for single-row frames.
    # NOTE(review): assumes every row of one dataset shares the same time info - confirm.
    Weekday = data_after_extraction['Weekdays'].iloc[0].replace(' ', '')
    Time = data_after_extraction['Times'].iloc[0].replace(' ', '')
    return Weekday,Time,centroid_longitude,centroid_latitude,num_destinations_in_region,avg_var

In [314]:
# Load a single Uber Movement extract into a DataFrame
data_uber = pd.read_csv(filepath_or_buffer="DC_Uber_Data/Uber_DC_R1_Sun_7_9.csv")

In [315]:
# Load the table of pre-defined regions
data_regions = pd.read_csv(filepath_or_buffer='DC_Cab_Pickup/CoordinateWrenches.csv')

In [328]:
# Run the pipeline on the loaded data. The regions frame is named data_regions;
# the earlier call passed data_region, a name no visible cell defines.
[Weekday,Time,centroid_longitude,centroid_latitude,num_destinations_in_region,avg_var] = process_pipeline(data_uber, data_regions)

In [136]:
def read_all_Uber_data():
    """
    Load every Uber CSV matching "DC_Uber_Data/Uber_DC*" and stack them into one DataFrame.

    Files are read in sorted path order so the row order is reproducible between runs
    (glob itself returns paths in arbitrary order). Each file keeps its own row labels,
    so the merged index may contain duplicate labels.

    Returns the merged DataFrame.
    Raises IndexError when no file matches the pattern (same exception type as before,
    now with an explanatory message).
    """
    data_file_paths = sorted(glob.glob("DC_Uber_Data/Uber_DC*"))
    if not data_file_paths:
        raise IndexError('no data files matched "DC_Uber_Data/Uber_DC*"')

    # Read each file once, then concatenate in a single call rather than
    # re-copying the growing frame on every iteration
    all_data = [pd.read_csv(fname) for fname in data_file_paths]
    data_merge = pd.concat(all_data)
    return data_merge




In [6]:
# Load every Uber CSV under DC_Uber_Data for Washington DC and merge them into one DataFrame
data = read_all_Uber_data()
# Saving the merged frame is currently disabled. A later cell reads
# "DC_Data/Uber_All.csv", so that file must already exist on disk; re-enable the
# line below to regenerate it.
# data.to_csv("DC_Data/Uber_All.csv")

In [65]:
# Reload the merged Uber data from disk
data = pd.read_csv("DC_Data/Uber_All.csv")
# Drop the leftover index column. The axis is given through the columns= keyword
# because passing it positionally (drop('Unnamed: 0', 1)) is rejected by pandas >= 2.0.
data = data.drop(columns='Unnamed: 0')

In [7]:
# Show the dimensions of the data as (number of rows, number of columns)
data.shape

(1030, 10)

In [81]:
# Write the current frame to disk (the row index is written as the first column)
data.to_csv(path_or_buf="DC_Data/Uber_All_processed.csv")