In [1]:
import numpy as np
import cv2
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import glob
import csv
import pandas as pd
import re
import datetime

In [2]:
def measure_geo_distance(pt1_longitude,pt1_latitude,pt2_longtide,pt2_latitude):
    """
    This function compute the geo_distance between two points using Euclidean distance
    """
    distance = ((pt1_longitude-pt2_longtide)**2+(pt1_latitude-pt2_latitude)**2)**0.5
    return distance

def filter_by_origin(data_pickup,data_regions):
    """
    This function filter out the trips that are not originated from any of our selected region, and add a region tag
    data_pickup
    data_region
    """
    # Create a copy
    data_pickup_filtered = data_pickup.copy()

    # Initialize the list to record which region the origin is located
    in_which_region_list = []

    # Loop through each row
    for i in range(0, len(data_pickup)):
        pickup_latitude = data_pickup['Pickup_Block_Latitude'][i]
        pickup_longitude = data_pickup['Pickup_Block_Longitude'][i]
        in_which_region = -1 # Initialize with -1
        for j in range(0,len(data_regions)):
            distance = measure_geo_distance(pickup_longitude,pickup_latitude,data_regions['Longitude'][j],data_regions['Latitude'][j])
            if distance <= data_regions['Radius'][j]:
                in_which_region = j
        in_which_region_list.append(in_which_region)
    data_pickup_filtered['Region'] = in_which_region_list
    # Keep only those have real region indice
    data_pickup_filtered = data_pickup_filtered[data_pickup_filtered.Region != -1] 
    # Reset the indice
    data_pickup_filtered = data_pickup_filtered.dropna(how='any').reset_index(drop=True)
    return data_pickup_filtered

def filter_by_dest(data_pickup,data_regions,max_radius_multiple = 1.25):
    """
    This function filter out the trips that the destination is out of their corresponding regions by 1.25 of the radius
    data_pickup: pickup dataset
    data_regions: dataset that record our regions
    """
    # Create a copy
    data_pickup_dest_filter = data_pickup.copy()
    # Initialize the list to record whether the destination is within the same region
    in_region_list =  [] 

    for i in range(0,len(data_pickup_dest_filter)):
        in_region = 0 # Initialize with negative
        region_id = data_pickup_dest_filter['Region'][i]
        distance = measure_geo_distance(data_pickup_dest_filter['Pickup_Block_Longitude'][i],data_pickup_dest_filter['Pickup_Block_Latitude'][i],data_regions['Longitude'][region_id],data_regions['Latitude'][region_id])
        if distance <= data_regions['Radius'][region_id]*max_radius_multiple:
            in_region = 1
        in_region_list.append(in_region)
    # Add a column - for the convinience of removing rows
    data_pickup_dest_filter['Dest_in_region'] = in_region_list
    # Remove trips that are out of the region
    data_pickup = data_pickup[data_pickup_dest_filter.Dest_in_region == 1] 
    # Reset the indice
    data_pickup = data_pickup.dropna(how='any').reset_index(drop=True)

    return data_pickup

def date_to_weekday(data_pickup):
    """
    This function conver the date in the dataset to day of week
    data_pickup: the dataset of the pickup
    """
    # Initialize the list of weekday
    weekday_list = []
    # Initialize the recorders that can be used to speed up the function
    month_string_prev = '0'
    day_prev= '-1'
    weekday_prev = '-1'
    for i in range(0,len(data_pickup)):
        month_string = data_pickup['Month_of_Pickupdatetime_Tr'][i]
        day = int(data_pickup['Day_of_Pickupdatetime_Tr'][i])

        # We only compute the weekday again if either month or day has been changed
        if  month_string != month_string_prev or day != day_prev:
            year = 2016
            if month_string == 'January':
                month = 1
            elif month_string == 'February':
                month = 2
            elif month_string == 'March':
                month = 3
            else:
                print("Month name is wrong!")
                #return
            date_input = datetime.datetime(year,month,day)
            # Conver to weekday
            # Note: the weekday starts from 0 and end in 6
            weekday = date_input.weekday() +1
        else:
            # Simply use the previous weekday, since the date has not been changed
            weekday = weekday_prev

        # One cycle comes to the end: update the recorder
        month_string_prev = month_string
        day_prev = day
        weekday_prev = weekday

        # Add weekday to the lsit
        weekday_list.append(weekday)
    data_pickup['Weekday'] = weekday_list
    return data_pickup

def compute_avg_traffic(data_pickup):
    """
    This function aggregates the damand across the Hour, Day, Month, Weekday, and Region; then it compute the average by weekday,hour and region
    """
    # Subject to be deleted
    df = data_pickup_filtered
    # Remove unuseful geodata columns
    df=df.drop(['Dropoff_Block_Latitude','Dropoff_Block_Longitude','Pickup_Block_Latitude','Pickup_Block_Longitude'],1)

    # Group by to find sum
    df = df.groupby(["Day_of_Pickupdatetime_Tr","Hour_of_Pickupdatetime_Tr","Month_of_Pickupdatetime_Tr","Weekday","Region"], as_index=False).sum()

    # Drop the Day, Hour, Month
    df=df.drop(['Day_of_Pickupdatetime_Tr','Month_of_Pickupdatetime_Tr',],1)

    # Compute the average by Weekday and Region
    df = df.groupby(['Weekday','Hour_of_Pickupdatetime_Tr','Region'], as_index=False).mean()
    df.columns = ['Weekday', 'Hour','Region','Avg_Traffic']
    return df

In [3]:
# Read the Dataset
Data_Jan_Pickup=pd.read_csv('DC_Cab_Pickup/Jan_DC_Data.csv',sep = '\t')
Data_Feb_Pickup=pd.read_csv('DC_Cab_Pickup/Feb_DC_Data.csv',sep = '\t')
Data_Mar_Pickup=pd.read_csv('DC_Cab_Pickup/Mar_DC_Data.csv',sep = '\t')

# Read the Region Data Set
data_regions=pd.read_csv('DC_Cab_Pickup/CoordinateWrenches.csv')

In [19]:
data_pickup_jan = Data_Jan_Pickup[:1000]
data_pickup_feb = Data_Feb_Pickup[:10]
data_pickup_mar = Data_Mar_Pickup[:10]

In [33]:
frames = [data_pickup_jan, data_pickup_feb, data_pickup_mar]
data_pickup_copy = pd.concat(frames)
data_pickup_copy = data_pickup_copy.dropna(how='any').reset_index(drop=True)

In [6]:
data_pickup_copy

Unnamed: 0,Day_of_Pickupdatetime_Tr,Dropoff_Block_Latitude,Dropoff_Block_Longitude,Hour_of_Pickupdatetime_Tr,Month_of_Pickupdatetime_Tr,Pickup_Block_Latitude,Pickup_Block_Longitude,Count_of_Objectid
0,1,38.823601,-77.00889,0,January,38.823601,-77.00889,2
1,1,38.89925,-76.978409,0,January,38.830806,-77.007675,2
2,1,38.937325,-77.01936,0,January,38.843884,-76.978919,2
3,1,38.901367,-77.026265,0,January,38.856418,-76.990338,2
4,1,38.918872,-77.017792,0,January,38.864584,-76.999364,4
5,1,38.829196,-77.00678,0,January,38.865158,-76.989645,2
6,1,38.909637,-77.047716,0,January,38.867764,-77.010693,2
7,1,38.898451,-77.022424,0,January,38.867764,-77.010693,2
8,1,38.885956,-77.021911,0,January,38.867764,-77.010693,2
9,1,38.896039,-77.007142,0,January,38.868566,-76.967778,2


In [7]:
data_regions

Unnamed: 0,Latitude,Longitude,Radius
0,38.903646,-77.05208,0.015497
1,38.897879,-77.026263,0.012657
2,38.905107,-77.036653,0.012657
3,38.89336,-77.008867,0.017973
4,38.921966,-77.047168,0.018967


In [20]:
data_pickup_filtered = filter_by_origin(data_pickup_copy,data_regions)

In [25]:
data_pickup_filtered = filter_by_dest(data_pickup_filtered,data_regions)

In [26]:
len(data_pickup_filtered)

8

In [27]:
data_pickup_filtered[:3]

Unnamed: 0,Day_of_Pickupdatetime_Tr,Dropoff_Block_Latitude,Dropoff_Block_Longitude,Hour_of_Pickupdatetime_Tr,Month_of_Pickupdatetime_Tr,Pickup_Block_Latitude,Pickup_Block_Longitude,Count_of_Objectid,Region
0,1,38.879286,-77.009855,0,February,38.880285,-77.017554,2,3
1,1,38.88655,-76.938357,0,February,38.885134,-76.997316,2,3
2,1,38.90324,-77.025018,0,February,38.887564,-77.015885,2,3


In [28]:
data_pickup_filtered = date_to_weekday(data_pickup_filtered)

In [30]:
data_pickup_filtered

Unnamed: 0,Day_of_Pickupdatetime_Tr,Dropoff_Block_Latitude,Dropoff_Block_Longitude,Hour_of_Pickupdatetime_Tr,Month_of_Pickupdatetime_Tr,Pickup_Block_Latitude,Pickup_Block_Longitude,Count_of_Objectid,Region,Weekday
0,1,38.879286,-77.009855,0,February,38.880285,-77.017554,2,3,1
1,1,38.88655,-76.938357,0,February,38.885134,-76.997316,2,3,1
2,1,38.90324,-77.025018,0,February,38.887564,-77.015885,2,3,1
3,1,38.886826,-77.003516,0,March,38.887119,-77.019205,2,3,2
4,1,38.935738,-77.109038,0,March,38.887564,-77.023953,2,3,2
5,1,38.888702,-77.032497,0,March,38.887564,-77.023953,2,3,2
6,1,38.903132,-77.036541,0,March,38.894144,-77.020906,2,3,2
7,1,38.919183,-77.030786,0,March,38.894336,-77.021914,2,3,2


In [31]:
df = compute_avg_traffic(data_pickup_filtered)

In [32]:
df

Unnamed: 0,Weekday,Hour,Region,Avg_Traffic
0,1,0,3,6
1,2,0,3,10


In [34]:
#def process_pipeline(data_pickup, data_region):
"""
This function set up an automated pipeline of the processes on the pickup dataset to extract the useful information
"""
data_pickup = data_pickup_copy

# Create a new copy
df = data_pickup.copy()

# Filter the trips by origins - whether the origins fall in one of the region
df = filter_by_origin(df,data_regions)

# Filter the trips by destination -  whether this is an intra-zone trip
df = filter_by_dest(df,data_regions)

# Convert date to weekday
df = date_to_weekday(df)

# Compute the average by weekday, hour, and region
df = compute_avg_traffic(df)


In [35]:
df

Unnamed: 0,Weekday,Hour,Region,Avg_Traffic
0,1,0,3,6
1,2,0,3,10
