In [2]:
# Import libraries
import pandas as pd
import numpy as np
import zipfile

import matplotlib.pyplot as plt

import skmob
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.geometry import Point

import cenpy
import osmnx

#!pip install selenium
import selenium


## 1. Data Input

[Data source: Lyft Bay Wheels](https://www.lyft.com/bikes/bay-wheels/system-data)

Lyft provides BayWheel history data spanning year 2017 to 2021-09. Each dataset includes columns like trip duration, start/end time, start/end station ID, start/end lon&lat, user type, etc. No trajectory data is provided in this historical dataset.

The Data set includeds:
- Trip Duration (seconds)
- Start Time and Date
- End Time and Date
- Start Station ID
- Start Station Name
- Start Station Latitude
- Start Station Longitude
- End Station ID
- End Station Name
- End Station Latitude
- End Station Longitude
- Bike ID
- User Type (Subscriber or Customer – “Subscriber” = Member or “Customer” = Casual)

In [3]:
## Load Zip FIle
def LoadZipFile(file_name):
    '''
    Input: File name in string
    Output: DataFrame of the file
    '''
    # Load .zip local data
    file_name = file_name
    zip_file = zipfile.ZipFile(file_name)
    df_csv = pd.read_csv(zip_file.open(zip_file.namelist()[0])) #solve Dirty zip file
    print('Complete Loading Zip File.')
    return df_csv

def LoadZipFileRemotely(url_string):
    # Alternatively: Load .zip remotely from https
    # Take time to download

    # Example
    #LoadZipFileRemotely('https://s3.amazonaws.com/baywheels-data/202106-baywheels-tripdata.csv.zip')
    import requests, zipfile
    from urllib.request import urlopen
    from io import BytesIO

    zip_file_url = url_string
    remote_zip_file = urlopen(zip_file_url)
    zipinmemory = BytesIO(remote_zip_file.read())
    zip_file = zipfile.ZipFile(zipinmemory)

    # the zipfile namelist can be filtered for smarter file loading
    # In this case, only load the first file from the zip archive 
    df_csv_re = pd.read_csv(zip_file.open(zip_file.namelist()[0]))
    return df_csv_re

In [6]:
# Example Data
filepath = '../data/202109-baywheels-tripdata.csv.zip'
LoadZipFile(filepath).head(5)

Complete Loading Zip File.


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9515818787165EA3,electric_bike,2021-09-22 13:30:57,2021-09-22 13:41:40,17th & Folsom Street Park (17th St at Folsom St),SF-N23,Civic Center/UN Plaza BART Station (Market St ...,SF-I25,37.763686,-122.415521,37.780897,-122.412124,member
1,4857E21E7C02369C,electric_bike,2021-09-03 11:47:57,2021-09-03 11:54:02,The Embarcadero at Pier 38,SF-H30,4th St at 16th St,SF-M30,37.783198,-122.387994,37.767314,-122.390958,member
2,6AF0D6AC2DCF9B1B,classic_bike,2021-09-22 11:51:39,2021-09-22 12:06:54,27th St at MLK Jr Way,OK-I4,Telegraph Ave at 49th St,OK-D4,37.817015,-122.271761,37.83575,-122.262654,casual
3,B92F4F9D28E81783,docked_bike,2021-09-06 12:53:27,2021-09-06 12:59:23,The Embarcadero at Pier 38,SF-H30,The Embarcadero at Pier 38,SF-H30,37.782926,-122.387921,37.782926,-122.387921,casual
4,D5C84175F52EFBEF,classic_bike,2021-09-05 11:04:51,2021-09-05 11:09:00,The Embarcadero at Pier 38,SF-H30,Berry St at 4th St,SF-K29-1,37.782926,-122.387921,37.77588,-122.39317,member


[Census Data: Cenpy API](http://cenpy-devs.github.io/cenpy/api.html)

## 2. Data Cleaning Method

In [8]:
def df_cleaning (df):
    '''Data cleaning, delete blank & NaN rows.'''
    before_cleaning = len(df)
    df.replace("", float("NaN"), inplace=True)
    df.dropna(inplace=True)
    after_cleaning = len(df)
    print('Original Dataframe has', before_cleaning, 'rows.')
    print (before_cleaning-after_cleaning, 'NaN or blank rows are dropped')
    print(after_cleaning, 'rows are left.')
    print('Percentage of invalid rows:',np.round(100*(before_cleaning-after_cleaning)/(before_cleaning), 3),'%')
    print('Complete data frame cleaning.')
    return df

## 3. Data Integration

In [9]:
#COMPLETE
# Create Station reference dataframe with assigned ID
def CreateStationDF(df_csv):
    '''
    Input: df_csv
    Output: station_df
    Dataframe with tile_ID assigned at each station for reference
    Includes both start&end_station_id
    '''
    # Get unique start & end station id
    unique_station_id = pd.concat([df_csv['start_station_id'],df_csv['end_station_id']], axis=0).unique()
    # Assign arbitary index for each station
    unique_station_tile_ID = np.arange(len(unique_station_id))
    station_df = pd.DataFrame({'station_id':unique_station_id, 'tile_ID':unique_station_tile_ID})
    station_df = station_df.dropna(axis=0) # Drop NaN station_id

    print('Created station tile reference dataframe')
    return station_df

In [10]:
# COMPLETE
def AddTileID2DF (df_csv, station_df):
    '''
    Input: df_csv, station_df
    Output: df_csv_new, with tile_ID included for start & end
    as column names ['tile_ID_start', 'tile_ID_end']
    '''
    ## USE PANDAS MERGE
    df_csv_new = pd.merge(df_csv, station_df, left_on='start_station_id', right_on='station_id')
    df_csv_new.rename(columns={'tile_ID':'tile_ID_start'}, inplace=True)
    df_csv_new = pd.merge(df_csv_new, station_df, left_on='end_station_id', right_on='station_id')
    df_csv_new.rename(columns={'tile_ID':'tile_ID_end'}, inplace=True)
    df_csv_new = df_csv_new.drop(['station_id_x',  'station_id_y'], axis=1)
    
    print('Matched tile to loaded dataframe.')
    return df_csv_new

In [11]:
pd.options.mode.chained_assignment = None  # default='warn' #CANCEL WARNING MESSAGES
# NEED TO OPTIMIZE
# Having trouble intaking large data set somehow??

def CreateStationTessDF (df_csv, station_df):
    '''
    Input: df_csv, station_df
    Output: Tessellation df. A geodataframe include tile_ID, geometry
    '''
    print('Start to create tessellation for loaded dataframe')
    station_df['geometry'] = ''
    station_df['start_frequency'] = ''
    station_df['return_frequency'] = ''
    for idx in np.arange(len(station_df)):
        current_station_id = station_df.iloc[idx]['station_id'] #'ABC-12' #FIXED KEY ERROR 213. stupid mistake of using loc. USE iloc for index.

        lat_array = []
        lng_array = []
        # Find lon lat of this station
        # Use both start & end station coordinates
        lat_array = np.append(lat_array, np.array(df_csv.loc[df_csv['start_station_id']==current_station_id]['start_lat']))
        lat_array = np.append(lat_array, np.array(df_csv.loc[df_csv['end_station_id']==current_station_id]['end_lat']))
        lng_array = np.append(lng_array, np.array(df_csv.loc[df_csv['start_station_id']==current_station_id]['start_lng']))
        lng_array = np.append(lng_array, np.array(df_csv.loc[df_csv['end_station_id']==current_station_id]['end_lng']))

        ## Geometry: Point or Polygon
        station_df['geometry'].loc[idx] = ArrayPoint(lng_array, lat_array)
        #station_df['geometry'][idx] = ArrayPolygon(lng_array, lat_array)

        #Frequency
        station_df['start_frequency'].loc[idx] = len(df_csv.loc[df_csv['start_station_id']==current_station_id])
        station_df['return_frequency'].loc[idx] = len(df_csv.loc[df_csv['end_station_id']==current_station_id])

        # drop '' or NaN
        #station_df
        #station_df = station_df.dropna()
        
        #gdf = gpd.GeoDataFrame(station_df,crs="EPSG:4326")
        gdf = gpd.GeoDataFrame(station_df)
        #gdf.replace("", float("NaN"), inplace=True)
        #gdf.dropna(inplace=True)
        
    print('Created Tessellation. ')
    return gdf

def ArrayPoint (lng_array, lat_array):
    lat_point = np.mean(lat_array)
    lng_point = np.mean(lng_array)
    return Point(lng_point, lat_point)


#####bug####
def ArrayPolygon (lng_array, lat_array):
    polygon_geom = Polygon(zip(lng_array, lat_array))
    # CAN take unorganized pair
    crs = {'init': 'epsg:4326'}
    polygon = gpd.GeoDataFrame(index=[0], crs=crs, geometry=[polygon_geom])
    return polygon

Now we have two dataframes:
1. Tessellation dataframe, which contains
    - tile_ID
    - frequency
    - geometry
    - station_id
2. df_csv. 
    - ALL columns from data we loaded
    - tile_ID (joined from reference)

We want to generate FLOW dataframe from df_csv.
Flow Dataframe includes 3 columns:

FLow, Origin, Destination

In [12]:
# Goal: Given df_csv_new, Create a flow df
# Includes
# FLow, Origin, Destination
def CreateFlowDF_1 (df_csv, station_tess_df):
    df_flow = df_csv.groupby(['tile_ID_start', 'tile_ID_end']).size().reset_index(name='flow')
    fdf = skmob.FlowDataFrame(df_flow,tessellation=station_tess_df, tile_id='tile_ID',origin='tile_ID_start', destination='tile_ID_end')
    print('Created flow data frame.')
    return fdf

## 4. Data Integration Driver Codes

In [14]:
def Point2PointPipe(filepath):
    ''' 
    A pipline for creating point to point flow diagram
    Given file path
    Return 
    1. Modified Loaded Data
    2. Tessellation for station_id
    3. Flow data frame
    '''
    df_csv = LoadZipFile(filepath)
    df_csv = df_cleaning(df_csv)
    station_df = CreateStationDF(df_csv)
    df_csv_new = AddTileID2DF(df_csv, station_df)
    station_tess_df = CreateStationTessDF(df_csv_new, station_df)
    fdf = CreateFlowDF_1(df_csv_new, station_tess_df)
    
    return df_csv_new, station_tess_df, fdf

data, station_tess, fdf = Point2PointPipe(filepath)

Complete Loading Zip File.
Original Dataframe has 212512 rows.
65699 NaN or blank rows are dropped
146813 rows are left.
Percentage of invalid rows: 30.915 %
Complete data frame cleaning.
Created station tile reference dataframe
Matched tile to loaded dataframe.
Start to create tessellation for loaded dataframe
Created Tessellation. 
Created flow data frame.


  warn("The tessellation crs is None. It will be set to the default crs WGS84 (EPSG:4326).")


In [15]:
data.head(3)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,tile_ID_start,tile_ID_end
0,9515818787165EA3,electric_bike,2021-09-22 13:30:57,2021-09-22 13:41:40,17th & Folsom Street Park (17th St at Folsom St),SF-N23,Civic Center/UN Plaza BART Station (Market St ...,SF-I25,37.763686,-122.415521,37.780897,-122.412124,member,0,67
1,BC612400B29A857F,classic_bike,2021-09-05 18:16:29,2021-09-05 18:26:42,17th & Folsom Street Park (17th St at Folsom St),SF-N23,Civic Center/UN Plaza BART Station (Market St ...,SF-I25,37.763735,-122.415524,37.781074,-122.411738,member,0,67
2,9629EF06B65FA57C,electric_bike,2021-09-28 10:01:42,2021-09-28 10:10:19,17th & Folsom Street Park (17th St at Folsom St),SF-N23,Civic Center/UN Plaza BART Station (Market St ...,SF-I25,37.763712,-122.415471,37.781008,-122.411916,casual,0,67


In [16]:
station_tess.head(3)

Unnamed: 0,station_id,tile_ID,geometry,start_frequency,return_frequency
0,SF-N23,0,POINT (-122.41551 37.76375),483,523
1,SF-H30,1,POINT (-122.38795 37.78297),635,559
2,OK-I4,2,POINT (-122.27176 37.81702),19,20


In [17]:
fdf.head(3)

Unnamed: 0,origin,destination,flow
0,0,0,10
1,0,4,1
2,0,6,1


**Heat Map**

In [19]:
heatmap_df = fdf.tessellation.copy()
heatmap_df['x'] = ''
heatmap_df['y'] = ''
heatmap_df['pair']=''
for r in np.arange(len(heatmap_df)):
    x = heatmap_df.iloc[r]['geometry'].x
    y = heatmap_df.iloc[r]['geometry'].y
    heatmap_df['x'].iloc[r] = x
    heatmap_df['y'].iloc[r] = y
    heatmap_df['pair'].iloc[r] = (x,y)
heatmap_df.head(5)

Unnamed: 0,station_id,tile_ID,geometry,start_frequency,return_frequency,x,y,pair
0,SF-N23,0,POINT (-122.41551 37.76375),483,523,-122.415515,37.76375,"(-122.41551459829364, 37.763749691086815)"
1,SF-H30,1,POINT (-122.38795 37.78297),635,559,-122.387946,37.782972,"(-122.38794590852278, 37.782972453824236)"
2,OK-I4,2,POINT (-122.27176 37.81702),19,20,-122.271762,37.817015,"(-122.27176150000003, 37.817015399999995)"
3,OK-D3-1,3,POINT (-122.26449 37.84036),100,86,-122.264488,37.840364,"(-122.26448810000002, 37.84036429999999)"
4,SF-I30,4,POINT (-122.39032 37.78052),943,939,-122.390324,37.780519,"(-122.39032374744953, 37.78051879624513)"


In [20]:
heatmap_df.to_csv('../data/heatmap_df.csv') 