In [1]:
import numpy as np
import cv2
%matplotlib inline
import glob
import csv
import pandas as pd
import re
import datetime

In [8]:
def measure_geo_distance(pt1_longitude,pt1_latitude,pt2_longtide,pt2_latitude):
    """
    Return the Euclidean (straight-line) distance between two points.

    pt1_longitude, pt1_latitude: coordinates of the first point
    pt2_longtide, pt2_latitude: coordinates of the second point

    The coordinates are treated as plain planar values, so the result is
    expressed in the same unit as the inputs. Only arithmetic operators are
    used, so anything supporting -, + and ** is accepted.
    """
    # Offset along each axis
    delta_longitude = pt1_longitude - pt2_longtide
    delta_latitude = pt1_latitude - pt2_latitude
    # Pythagoras: square root of the sum of the squared offsets
    squared_sum = delta_longitude**2 + delta_latitude**2
    return squared_sum**0.5

def filter_by_origin(data_pickup,data_regions):
    """
    Keep only the trips whose pickup point lies inside one of the regions and
    tag each kept trip with the region it belongs to.

    data_pickup: DataFrame of trips; the columns 'Pickup_Block_Longitude' and
        'Pickup_Block_Latitude' are read.
    data_regions: DataFrame of regions; the columns 'Longitude', 'Latitude'
        and 'Radius' are read. A pickup is inside a region when its distance
        to the region centre is <= the region radius.

    Returns a new DataFrame (the input is not modified) with an extra integer
    column 'Region' holding the 0-based position of the matching row of
    data_regions. Rows matching no region are removed, rows containing any
    missing value are dropped, and the index is reset to 0..n-1.
    When a pickup falls inside several regions, the region that comes last in
    data_regions wins.
    """
    # Work on positional arrays: rows are matched by position, so the result
    # does not depend on the index labels of either frame.
    pickup_longitude = data_pickup['Pickup_Block_Longitude'].to_numpy(dtype=float)
    pickup_latitude = data_pickup['Pickup_Block_Latitude'].to_numpy(dtype=float)
    region_longitude = data_regions['Longitude'].to_numpy(dtype=float)
    region_latitude = data_regions['Latitude'].to_numpy(dtype=float)
    region_radius = data_regions['Radius'].to_numpy(dtype=float)

    # -1 marks "not inside any region"
    in_which_region = np.full(len(data_pickup), -1, dtype=np.int64)

    # One vectorized pass per region. Later regions overwrite earlier ones,
    # so the last matching region wins.
    for j in range(len(data_regions)):
        distance = measure_geo_distance(pickup_longitude,pickup_latitude,region_longitude[j],region_latitude[j])
        # Comparisons against NaN are False, so rows with missing coordinates stay at -1
        in_which_region[distance <= region_radius[j]] = j

    # Create a copy so the caller's frame is left untouched
    data_pickup_filtered = data_pickup.copy()
    data_pickup_filtered['Region'] = in_which_region
    # Keep only the rows that received a real region index
    data_pickup_filtered = data_pickup_filtered[data_pickup_filtered.Region != -1]
    # Drop incomplete rows and reset the index
    data_pickup_filtered = data_pickup_filtered.dropna(how='any').reset_index(drop=True)
    return data_pickup_filtered

def filter_by_dest(data_pickup,data_regions,max_radius_multiple = 1.25):
    """
    Keep only the trips whose destination (drop-off point) stays close to the
    region the trip started in.

    data_pickup: DataFrame of trips already tagged with a 'Region' column
        (the 0-based position of a row of data_regions); the columns
        'Dropoff_Block_Longitude' and 'Dropoff_Block_Latitude' are read.
    data_regions: DataFrame of regions; the columns 'Longitude', 'Latitude'
        and 'Radius' are read.
    max_radius_multiple: a trip is kept when the distance between its
        drop-off point and the centre of its region is
        <= Radius * max_radius_multiple (default 1.25).

    Returns a new DataFrame with the same columns as data_pickup, restricted
    to the kept trips, with rows containing any missing value dropped and the
    index reset to 0..n-1. The input frame is not modified.
    """
    # Region of each trip, used as a positional lookup into data_regions
    region_id = data_pickup['Region'].to_numpy(dtype=np.int64)
    center_longitude = data_regions['Longitude'].to_numpy(dtype=float)[region_id]
    center_latitude = data_regions['Latitude'].to_numpy(dtype=float)[region_id]
    allowed_radius = data_regions['Radius'].to_numpy(dtype=float)[region_id]*max_radius_multiple

    # The destination is the drop-off point. Measuring the pickup point here
    # would let every trip that passed the origin filter through again.
    dropoff_longitude = data_pickup['Dropoff_Block_Longitude'].to_numpy(dtype=float)
    dropoff_latitude = data_pickup['Dropoff_Block_Latitude'].to_numpy(dtype=float)
    distance = measure_geo_distance(dropoff_longitude,dropoff_latitude,center_longitude,center_latitude)

    # Positional boolean mask; comparisons against NaN are False, so trips
    # with missing drop-off coordinates are removed.
    dest_in_region = distance <= allowed_radius

    # Remove trips that end out of the region
    data_pickup_kept = data_pickup[dest_in_region]
    # Drop incomplete rows and reset the index
    data_pickup_kept = data_pickup_kept.dropna(how='any').reset_index(drop=True)

    return data_pickup_kept

def date_to_weekday(data_pickup, year=2016):
    """
    Add a 'Weekday' column derived from the month-name and day-of-month columns.

    data_pickup: DataFrame of trips; the columns 'Month_of_Pickupdatetime_Tr'
        (English month name, e.g. 'January') and 'Day_of_Pickupdatetime_Tr'
        (day of the month, converted with int()) are read.
    year: calendar year the dates belong to (default 2016).

    The 'Weekday' column is added to data_pickup in place and the same frame
    is returned. Weekdays are numbered 1 (Monday) to 7 (Sunday).

    Raises ValueError when a month name is not recognised or when the
    (year, month, day) combination is not a valid calendar date.
    """
    month_names = ('January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December')
    month_numbers = {name: number for number, name in enumerate(month_names, start=1)}

    # Each distinct (month, day) pair is converted only once
    weekday_by_date = {}
    weekday_list = []

    # Iterate over plain lists so rows are visited by position, independent
    # of the index labels of the frame.
    months = data_pickup['Month_of_Pickupdatetime_Tr'].tolist()
    days = data_pickup['Day_of_Pickupdatetime_Tr'].tolist()
    for month_string, raw_day in zip(months, days):
        day = int(raw_day)
        date_key = (month_string, day)
        if date_key not in weekday_by_date:
            if month_string not in month_numbers:
                # Fail loudly: carrying on would reuse the month of a previous row
                raise ValueError("Month name is wrong! Got %r" % (month_string,))
            date_input = datetime.datetime(year, month_numbers[month_string], day)
            # datetime.weekday() runs from 0 (Monday) to 6 (Sunday); shift to 1..7
            weekday_by_date[date_key] = date_input.weekday() + 1
        weekday_list.append(weekday_by_date[date_key])

    data_pickup['Weekday'] = weekday_list
    return data_pickup

def compute_avg_traffic(data_pickup):
    """
    Aggregate the demand per calendar hour, then average it per
    (Weekday, Hour, Region).

    data_pickup: DataFrame of trips holding the four geodata columns, the
        columns 'Day_of_Pickupdatetime_Tr', 'Hour_of_Pickupdatetime_Tr',
        'Month_of_Pickupdatetime_Tr', 'Weekday' and 'Region', plus the demand
        column. NOTE(review): the final renaming assumes exactly one demand
        column remains besides the keys -- confirm against the input files.

    Returns a new DataFrame with the columns
    ['Weekday', 'Hour', 'Region', 'Avg_Traffic']. The input is not modified.
    """
    # Remove the geodata columns, which are not needed for the aggregation.
    # `columns=` is used because pandas 2.0 no longer accepts the axis positionally.
    df = data_pickup.drop(columns=['Dropoff_Block_Latitude','Dropoff_Block_Longitude','Pickup_Block_Latitude','Pickup_Block_Longitude'])

    # Total demand of every individual (day, hour, month) slot per weekday and region
    df = df.groupby(["Day_of_Pickupdatetime_Tr","Hour_of_Pickupdatetime_Tr","Month_of_Pickupdatetime_Tr","Weekday","Region"], as_index=False).sum()

    # The concrete day and month are no longer needed once the slots are summed
    df = df.drop(columns=['Day_of_Pickupdatetime_Tr','Month_of_Pickupdatetime_Tr'])

    # Average the per-slot totals by weekday, hour and region
    df = df.groupby(['Weekday','Hour_of_Pickupdatetime_Tr','Region'], as_index=False).mean()
    # The group keys come first, in groupby order, followed by the averaged demand
    df.columns = ['Weekday', 'Hour','Region','Avg_Traffic']
    return df

In [6]:
def process_pipeline(data_pickup, data_region):
    """
    Run the whole processing pipeline on the pickup dataset.

    data_pickup: DataFrame of raw trips
    data_region: DataFrame describing the regions

    Steps, in order: filter_by_origin, filter_by_dest, date_to_weekday and
    compute_avg_traffic. Returns the DataFrame produced by
    compute_avg_traffic; data_pickup itself is not modified because the
    pipeline works on a copy.
    """
    # Create a new copy
    df = data_pickup.copy()

    # Filter the trips by origin - whether the origin falls in one of the regions.
    # The regions come from the `data_region` argument, not from a notebook global.
    df = filter_by_origin(df,data_region)

    # Filter the trips by destination - whether this is an intra-zone trip
    df = filter_by_dest(df,data_region)

    # Convert date to weekday
    df = date_to_weekday(df)

    # Compute the average by weekday, hour, and region
    df = compute_avg_traffic(df)

    return df

In [None]:
# Load the three monthly pickup extracts (tab-separated files)
Data_Jan_Pickup = pd.read_csv('DC_Cab_Pickup/Jan_DC_Data.csv', sep='\t')
Data_Feb_Pickup = pd.read_csv('DC_Cab_Pickup/Feb_DC_Data.csv', sep='\t')
Data_Mar_Pickup = pd.read_csv('DC_Cab_Pickup/Mar_DC_Data.csv', sep='\t')

# Stack the months into one frame, discard rows with any missing value
# and renumber the index from 0
frames = [Data_Jan_Pickup, Data_Feb_Pickup, Data_Mar_Pickup]
data_pickup = pd.concat(frames).dropna(how='any').reset_index(drop=True)

# Load the table describing the regions
data_regions = pd.read_csv('DC_Cab_Pickup/CoordinateWrenches.csv')

In [9]:
# Run the full pipeline on the merged pickups and the region table
df = process_pipeline(data_pickup=data_pickup, data_region=data_regions)

# Write the resulting frame to disk as CSV
df.to_csv(path_or_buf="Pickup_Demands.csv")