In [1]:
import csv
import glob
import json
import re

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [149]:
# Load one Uber Movement extract and preview its first row
data = pd.read_csv("DC_Uber_Data/Uber_DC_R1_Sun_7_9.csv")
data.head(1)

Unnamed: 0,Origin Movement ID,Origin Display Name,Origin Geometry,Destination Movement ID,Destination Display Name,Destination Geometry,Date Range,Mean Travel Time (Seconds),Range - Lower Bound Travel Time (Seconds),Range - Upper Bound Travel Time (Seconds)
0,123,"600 Independence Avenue Southeast, Southeast W...","[[-77.003515,38.889705],[-77.003515,38.889808]...",7,"4500 Ohio Drive Southwest, Southwest Washingto...","[[-77.064696,38.891852],[-77.064626,38.891837]...","01/01/2016 - 03/31/2016, Sundays, 07:00AM-09:00AM",382,164,892


In [151]:
# Extract the origin and destination coordinates into separate longitude/latitude columns
data_after_extraction = extract_origin_geodata(data,'Origin Geometry','Origin Longitude','Origin Latitude')
data_after_extraction = extract_origin_geodata(data_after_extraction,'Destination Geometry','Destination Longitude','Destination Latitude')
# Preview the extraction result itself instead of relying on `data` having been modified in place
data_after_extraction[:1]

Unnamed: 0,Origin Movement ID,Origin Display Name,Origin Geometry,Destination Movement ID,Destination Display Name,Destination Geometry,Date Range,Mean Travel Time (Seconds),Range - Lower Bound Travel Time (Seconds),Range - Upper Bound Travel Time (Seconds),Origin Longitude,Origin Latitude,Destination Longitude,Destination Latitude
0,123,"600 Independence Avenue Southeast, Southeast W...","[[-77.003515,38.889705],[-77.003515,38.889808]...",7,"4500 Ohio Drive Southwest, Southwest Washingto...","[[-77.064696,38.891852],[-77.064626,38.891837]...","01/01/2016 - 03/31/2016, Sundays, 07:00AM-09:00AM",382,164,892,"[-77.003515, -77.003515, -77.003114, -77.00258...","[38.889705, 38.889808, 38.889807, 38.889808, 3...","[-77.064696, -77.064626, -77.06435, -77.064164...","[38.891852, 38.891837, 38.891795, 38.891773, 3..."


In [136]:
def read_all_Uber_data():
    """
    Load every Uber data file in the data folder and merge them into one data frame.

    Every path matching "DC_Uber_Data/Uber_DC*" (relative to the working directory) is read
    with pandas.read_csv and the resulting frames are stacked row-wise. The paths are sorted
    first, because glob returns matches in arbitrary order and the row order of the merged
    frame should be reproducible. Each file keeps its own row index, so index labels repeat
    across files in the result.

    Returns:
        pandas.DataFrame: the rows of all matched files, in sorted path order.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    data_file_paths = sorted(glob.glob("DC_Uber_Data/Uber_DC*"))
    if not data_file_paths:
        # Fail with a clear message instead of an IndexError on an empty list
        raise FileNotFoundError('No Uber data files match "DC_Uber_Data/Uber_DC*"')

    all_data = [pd.read_csv(fname) for fname in data_file_paths]

    # A single concat call avoids re-copying the growing frame once per file
    return pd.concat(all_data)


def extract_time_data(data, colname='Date Range', strip_whitespace=False):
    """
    Split the date-range text of each row into 'Months', 'Weekdays' and 'Times' columns.

    Each cell of `colname` is expected to hold at least three comma-separated fields, e.g.
    "01/01/2016 - 03/31/2016, Sundays, 07:00AM-09:00AM". The first field is stored in
    'Months', the second in 'Weekdays' and the third in 'Times'.

    The column is addressed by name instead of through a global list of column names, so the
    function does not depend on which notebook cells ran before it or on the column position.

    data: the Uber Movement Dataset; it is modified in place and also returned
    colname: the name of the column that stores the date-range text (default 'Date Range')
    strip_whitespace: when True, surrounding whitespace is removed from every field. The
        default False keeps the raw split result, so with ", " separators the weekday and
        time fields start with a space.

    Returns the same data frame with the three new columns added.
    Raises IndexError if a cell holds fewer than three comma-separated fields.
    """
    months = []
    weekdays = []
    times = []
    for line in data[colname]:
        fields = line.split(',')
        if strip_whitespace:
            fields = [field.strip() for field in fields]
        # Index the fields (rather than unpack) so extra trailing fields are tolerated
        months.append(fields[0])
        weekdays.append(fields[1])
        times.append(fields[2])

    data['Months'] = months
    data['Weekdays'] = weekdays
    data['Times'] = times
    return data

In [148]:
def _iter_coordinate_pairs(node):
    """Yield each innermost [longitude, latitude, ...] list of a nested coordinate array, in order."""
    if node and not isinstance(node[0], list):
        # A list whose first element is not a list is a coordinate pair itself
        yield node
        return
    for child in node:
        yield from _iter_coordinate_pairs(child)


def extract_origin_geodata(data,colname,new_colname_longitude,new_colname_latitude):
    """
    Read all the longitudes and latitudes of a geometry column and store them in two new columns.

    Despite its name the function is not specific to the origin: the same code serves both the
    origin and the destination geometry, selected through `colname`.

    Each cell of `colname` is parsed as a JSON array of [longitude, latitude] pairs, e.g.
    "[[-77.003515,38.889705],[-77.003515,38.889808]]". Arrays nested more deeply are
    flattened, so every innermost pair is collected in its original order. An empty array
    yields two empty lists for that row.

    data: the Uber Movement Dataset; it is modified in place and also returned
    colname: the name of the column that stores the geodata: 'Origin Geometry' / 'Destination Geometry'
    new_colname_longitude: the name for the new column that stores the extracted longitude information
    new_colname_latitude: the name for the new column that stores the extracted latitude information

    Returns the same data frame; each new cell holds a list of floats for its row.
    Raises ValueError (json.JSONDecodeError) if a cell is not valid JSON.
    """
    rows_longitudes = [] # Longitudes of each row
    rows_latitudes = [] # Latitudes of each row

    for line in data[colname]:
        pairs = list(_iter_coordinate_pairs(json.loads(line)))
        # float() keeps the result type uniform even when the JSON holds integer coordinates
        rows_longitudes.append([float(pair[0]) for pair in pairs])
        rows_latitudes.append([float(pair[1]) for pair in pairs])

    data[new_colname_longitude] = rows_longitudes
    data[new_colname_latitude] = rows_latitudes

    # To read the first longitude of the first row:
    # data[new_colname_longitude][0][0]
    return data

In [6]:
# Read every Uber data file for Washington DC and merge them into one data frame
data = read_all_Uber_data()
# Optional: save the merged frame as a CSV (left disabled here)
# data.to_csv("DC_Data/Uber_All.csv")

In [65]:
# Load the merged data set from its CSV export
data = pd.read_csv("DC_Data/Uber_All.csv")
# Drop the leftover index column. `columns=` is used because passing the axis
# positionally to DataFrame.drop is deprecated and rejected by pandas 2.x.
data = data.drop(columns='Unnamed: 0')

In [7]:
# Show the dimensions of the data set as a (rows, columns) tuple
data.shape

(1030, 10)

In [10]:
# Collect the column labels of the data set into a plain list and display it
colnames = data.columns.tolist()
colnames

['Origin Movement ID',
 'Origin Display Name',
 'Origin Geometry',
 'Destination Movement ID',
 'Destination Display Name',
 'Destination Geometry',
 'Date Range',
 'Mean Travel Time (Seconds)',
 'Range - Lower Bound Travel Time (Seconds)',
 'Range - Upper Bound Travel Time (Seconds)']

In [81]:
# Write the current data frame to disk as CSV, using the default to_csv options
data.to_csv("DC_Data/Uber_All_processed.csv")

In [13]:
# Preview the first row of the data frame
data.head(1)

Unnamed: 0,Origin Movement ID,Origin Display Name,Origin Geometry,Destination Movement ID,Destination Display Name,Destination Geometry,Date Range,Mean Travel Time (Seconds),Range - Lower Bound Travel Time (Seconds),Range - Upper Bound Travel Time (Seconds),latitudes,longtitudes,Months,Weekdays,Times
0,123,"600 Independence Avenue Southeast, Southeast W...","[[-77.003515,38.889705],[-77.003515,38.889808]...",1,"5400 Arnold Avenue Southwest, Southwest Washin...","[[-77.048009,38.841266],[-77.047969,38.841267]...","01/01/2016 - 03/31/2016, Sundays, 12:00AM-02:00AM",504,415,612,38.841267,-77.047969,01/01/2016 - 03/31/2016,Sundays,12:00AM-02:00AM
