In [1]:
import glob
import time
import numpy as np
import pandas as pd
from geopy import distance
import datetime as datetime
from multiprocessing import Process
from sklearn.preprocessing import MinMaxScaler



In [2]:
def create_datetime_features(input_df, column_name):

    input_df[column_name+'_hour'] = input_df[column_name].dt.hour
    input_df[column_name+'_minute'] = input_df[column_name].dt.minute
    input_df[column_name+'_quarter'] = input_df[column_name].dt.quarter
    input_df[column_name+'_month'] = input_df[column_name].dt.month
    input_df[column_name+'_year'] = input_df[column_name].dt.year
    input_df[column_name+'_week'] = input_df[column_name].dt.isocalendar().week
    input_df[column_name+'_day'] = input_df[column_name].dt.day
    input_df[column_name+'_dayofweek'] = input_df[column_name].dt.dayofweek

    return input_df

In [3]:
def calculate_geodesic_distance(df_stations, point):
    df = df_stations.copy()
    for idx, i, j  in zip(df.index, df.station_latitude, df.station_longitude):
        df.loc[idx, 'distance'] = distance.distance(point, (i,j)).m
        if int(df.loc[idx, 'distance']) == 0:
            
            return df.loc[idx, "station_name"], df.loc[idx, 'distance']
    df = df.sort_values("distance")
    return df.iloc[0]["station_name"], df.iloc[0]["distance"]

In [4]:
def get_approximate_stations_locations(df_missing_stations, start_stations, save_file, col ="start_station"):
    count = 0
    for idx, i, j in zip(df_missing_stations.index, 
                         df_missing_stations[f'{col}_latitude'], 
                         df_missing_stations[f'{col}_longitude']):
        
        approx_station, approx_distance = calculate_geodesic_distance(start_stations, 
                            (i, j))
        df_missing_stations.loc[idx,f"{col}_approx"] = approx_station
        df_missing_stations.loc[idx,f"{col}_approx_distance"] = approx_distance
        
        if count%1000 == 0:
            print(count)
            df_missing_stations.to_csv(f"{save_file}")
        count += 1
    df_missing_stations.to_csv(f"{save_file}")
    return df_missing_stations

In [5]:
def parallel_get_approximate_stations_locations(df_missing_stations, start_stations, 
                            save_file_suffix="approx_stations", batch_size=100000, sleep_time=120):

    ranges = np.arange(0, df_missing_stations.shape[0], 
                       batch_size, dtype=int)
    ranges = np.concatenate((ranges, [df_missing_stations.shape[0]]))
    x = 0
    processes = []
    for ii in range(1, len(ranges)):
        print(ranges[ii-1], ranges[ii])
        save_file = f"approximate_stations/{save_file_suffix}_{ii}.csv"
        partitioned_df = df_missing_stations.loc[ranges[ii-1]: 
                                    ranges[ii]].copy().reset_index(drop=True)    
        
        p = Process(target=get_approximate_stations_locations, 
                            args = (partitioned_df, start_stations, save_file))
        #get_approximate_stations_locations(partitioned_df, start_stations, save_file)
        x +=1
        p.start()
        print(x)
        processes.append(p)
        time.sleep(sleep_time)
    for thread in processes:
        thread.join()

# Importing and renaming columns

In [6]:
#df_2020 = pd.DataFrame()
df_2021 = pd.DataFrame()
df_2022 = pd.DataFrame()
df_2023 = pd.DataFrame()
df_initial = pd.DataFrame()

#files_2020 = [i for i in glob.glob("data/*.csv") if "2020" in i]
files_2021 = [i for i in glob.glob("data/*.csv") if "2021" in i]
files_2022 = [i for i in glob.glob("data/*.csv") if "2022" in i]
files_2023 = [i for i in glob.glob("data/*.csv") if "2023" in i]

for file in files_2021:
    df_2021 = df_2021.append(pd.read_csv(file).rename(columns={
        "started_at": "start_time",
        "ended_at": "end_time",
        "start_lat": "start_station_latitude",
        "start_lng": "start_station_longitude",
        "end_lat": "end_station_latitude",
        "end_lng": "end_station_longitude",
        "member_casual": "user_type"
    }))
for file in files_2022:
    df_2022 = df_2022.append(pd.read_csv(file).rename(columns={
        "started_at": "start_time",
        "ended_at": "end_time",
        "start_lat": "start_station_latitude",
        "start_lng": "start_station_longitude",
        "end_lat": "end_station_latitude",
        "end_lng": "end_station_longitude",
        "member_casual": "user_type"
    }))   
    
for file in files_2023:
    df_2023 = df_2023.append(pd.read_csv(file).rename(columns={
        "started_at": "start_time",
        "ended_at": "end_time",
        "start_lat": "start_station_latitude",
        "start_lng": "start_station_longitude",
        "end_lat": "end_station_latitude",
        "end_lng": "end_station_longitude",
        "member_casual": "user_type"
    }))    
    
df_initial = df_2021.append(df_2022).append(df_2023).reset_index(drop=True)

In [8]:
df_initial.to_csv("data/baywheels_raw_rides.csv")

In [13]:
df_initial.shape

(5349524, 13)

In [14]:
df_initial

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,user_type
0,357CDE244D24405B,electric_bike,2021-01-26 11:32:59,2021-01-26 11:38:21,,,,,37.760000,-122.410000,37.760000,-122.420000,casual
1,19A3E1F4211D0EE8,electric_bike,2021-01-26 14:16:37,2021-01-26 14:19:24,,,,,37.770000,-122.410000,37.760000,-122.410000,casual
2,27004D90ADC81AFF,electric_bike,2021-01-26 14:02:37,2021-01-26 14:06:35,,,,,37.760000,-122.420000,37.770000,-122.410000,casual
3,2F81FCA3D9CD056A,electric_bike,2021-01-26 15:03:05,2021-01-26 15:07:25,,,,,37.760000,-122.410000,37.760000,-122.420000,casual
4,72CC2218DF973489,electric_bike,2021-01-26 09:04:22,2021-01-26 09:11:36,,,,,37.790000,-122.410000,37.780000,-122.390000,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5349519,569DF06C0E9CB3BA,classic_bike,2023-04-03 14:16:30,2023-04-03 14:41:22,Rockridge BART Station,OK-B4,San Pablo Park,BK-G4,37.844279,-122.251900,37.855783,-122.283127,casual
5349520,FAA47B8C591368D7,classic_bike,2023-04-19 17:50:50,2023-04-19 18:30:39,Channing Way at Shattuck Ave,BK-E7,San Pablo Park,BK-G4,37.865847,-122.267443,37.855783,-122.283127,member
5349521,C2BD6C4867680C4C,classic_bike,2023-04-29 15:33:07,2023-04-29 16:12:39,Bay Pl at Vernon St,OK-I6,San Pablo Park,BK-G4,37.811483,-122.260506,37.855783,-122.283127,casual
5349522,6E1E0148C8FDD082,classic_bike,2023-04-17 22:26:15,2023-04-17 22:40:21,Rockridge BART Station,OK-B4,San Pablo Park,BK-G4,37.844279,-122.251900,37.855783,-122.283127,casual


In [15]:
del df_2021
del df_2022
del df_2023

In [16]:
df_initial = df_initial.dropna(subset=[
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude'])

In [17]:
df_initial['start_time'] = pd.to_datetime(df_initial['start_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_initial = df_initial[df_initial['start_time'] >= datetime.datetime(2021, 1, 1)]

# Finding Standard Stations

In [19]:
start_stations = df_initial[["start_station_name", 
                             'start_station_latitude', 
                             "start_station_longitude"]]\
.sort_values(['start_station_latitude', "start_station_longitude"])\
.drop_duplicates(subset=['start_station_latitude','start_station_longitude'])\
.dropna().drop_duplicates(subset=['start_station_name'])\
.query("start_station_latitude != 0").reset_index(drop=True).rename(
    columns={"start_station_name": "station_name", 
            'start_station_latitude': "station_latitude",
            "start_station_longitude": "station_longitude"})

In [20]:
end_stations = df_initial[["end_station_name", 
                           'end_station_latitude', 
                           "end_station_longitude"]]\
.sort_values(['end_station_latitude', "end_station_longitude"])\
.drop_duplicates(subset=['end_station_latitude','end_station_longitude'])\
.dropna().drop_duplicates(subset=['end_station_name'])\
.query("end_station_latitude != 0").reset_index(drop=True).rename(
    columns={"end_station_name": "station_name", 
            'end_station_latitude': "station_latitude",
            "end_station_longitude": "station_longitude"})

In [21]:
standard_stations = start_stations.append(end_stations).sort_values(
    ['station_latitude', "station_longitude"]).drop_duplicates(subset=['station_name'])

# Trip duration calculation

In [22]:
df_initial['start_time'] = pd.to_datetime(df_initial['start_time'])
df_initial['end_time'] = pd.to_datetime(df_initial['end_time'])
df_initial['duration_sec'] = (df_initial['end_time'] - df_initial['start_time'])/ pd.Timedelta(seconds=1)


df_initial['user_type'] = df_initial['user_type'].replace({"member":"Subscriber", "casual": "Customer"})

Removing 2020 data for irrelevancy 

# Lookup missing Station Names

Approximating missing stations

In [23]:
df_missing_stations = df_initial[(df_initial['start_station_name'].isna()) |\
                                (df_initial['end_station_name'].isna())].copy()

df_existing_stations = df_initial[~((df_initial['start_station_name'].isna()) |\
                                    (df_initial['end_station_name'].isna()))].copy()


In [24]:
del df_initial

In [25]:
df_missing_start_stations_no_duplicates = df_missing_stations.drop_duplicates(
                                        subset=['start_station_latitude', 'start_station_longitude']).copy()\
                                            .reset_index(drop=True)
df_missing_start_stations_no_duplicates = df_missing_start_stations_no_duplicates[
                                            df_missing_start_stations_no_duplicates['start_station_name'].isna()].copy()\
                                            .reset_index(drop=True)

In [26]:
df_missing_end_stations_no_duplicates = df_missing_stations.drop_duplicates(
                                        subset=['end_station_latitude', 'end_station_longitude']).copy()\
                                            .reset_index(drop=True)
df_missing_end_stations_no_duplicates = df_missing_end_stations_no_duplicates[
                                            df_missing_end_stations_no_duplicates['end_station_name'].isna()].copy()\
                                                .reset_index(drop=True)

In [27]:
approximate_start_stations_df = get_approximate_stations_locations(df_missing_start_stations_no_duplicates, 
                                   standard_stations.reset_index(drop=True).copy(), 
                                        save_file="approximate_stations/approx_start_stations.csv", 
                                            col ="start_station")

0


In [28]:
approximate_end_stations_df = get_approximate_stations_locations(df_missing_end_stations_no_duplicates, 
                                   standard_stations.reset_index(drop=True).copy(), 
                                     save_file="approximate_stations/approx_end_stations.csv", 
                                         col ="end_station")

0


In [29]:
approximate_start_stations_df = pd.read_csv("approximate_stations/approx_start_stations.csv") 
approximate_end_stations_df = pd.read_csv("approximate_stations/approx_end_stations.csv") 

In [30]:
df_missing_stations = df_missing_stations.merge(
    approximate_start_stations_df[["start_station_approx", 
                             'start_station_latitude',
                             'start_station_longitude',
                             'start_station_approx_distance']],
    on=['start_station_latitude', 
        'start_station_longitude'], how="left")

In [31]:
df_missing_stations = df_missing_stations.merge(
    approximate_end_stations_df[["end_station_approx", 
                             'end_station_latitude',
                             'end_station_longitude',
                             'end_station_approx_distance']],
    on=['end_station_latitude', 
        'end_station_longitude'], how="left")

In [32]:
index = df_missing_stations['start_station_approx_distance'] < 500
df_missing_stations.loc[index, 'start_station_name'] = df_missing_stations.loc[index, 'start_station_approx']

index = df_missing_stations['end_station_approx_distance'] < 500
df_missing_stations.loc[index, 'end_station_name'] = df_missing_stations.loc[index, 'end_station_approx']

In [33]:
df_completed_stations = df_missing_stations.dropna(subset=['start_station_name', 'end_station_name'])

In [34]:
df_existing_stations = df_existing_stations.drop(columns=['start_station_id', 'end_station_id'])
df_completed_stations = df_completed_stations.drop(columns=['start_station_id', 'end_station_id'])

In [35]:
df_existing_stations

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,end_station_name,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,user_type,duration_sec
3236,ABDA3A49559A3FAE,classic_bike,2021-01-07 22:15:32,2021-01-07 22:31:47,Vine St at Shattuck Ave,Vine St at Shattuck Ave,37.880222,-122.269592,37.880222,-122.269592,Customer,975.0
6110,589DBD2D0F0902D0,classic_bike,2021-01-18 15:06:21,2021-01-18 16:39:39,Vine St at Shattuck Ave,Vine St at Shattuck Ave,37.880222,-122.269592,37.880222,-122.269592,Customer,5598.0
6232,7B7344B07AC7ACC1,classic_bike,2021-01-28 18:07:00,2021-01-28 18:16:54,Carl St at Cole St,Haight St at Lyon St,37.765942,-122.449228,37.770519,-122.442326,Subscriber,594.0
6233,99E98580EAB8D099,electric_bike,2021-01-03 15:03:16,2021-01-03 15:14:38,Carl St at Cole St,Grove St at Divisadero,37.765924,-122.449284,37.775969,-122.437662,Customer,682.0
6234,4A477E3D8717E550,electric_bike,2021-01-25 11:19:22,2021-01-25 11:22:40,Carl St at Cole St,Haight St at Lyon St,37.765948,-122.449321,37.770540,-122.442437,Subscriber,198.0
...,...,...,...,...,...,...,...,...,...,...,...,...
5349519,569DF06C0E9CB3BA,classic_bike,2023-04-03 14:16:30,2023-04-03 14:41:22,Rockridge BART Station,San Pablo Park,37.844279,-122.251900,37.855783,-122.283127,Customer,1492.0
5349520,FAA47B8C591368D7,classic_bike,2023-04-19 17:50:50,2023-04-19 18:30:39,Channing Way at Shattuck Ave,San Pablo Park,37.865847,-122.267443,37.855783,-122.283127,Subscriber,2389.0
5349521,C2BD6C4867680C4C,classic_bike,2023-04-29 15:33:07,2023-04-29 16:12:39,Bay Pl at Vernon St,San Pablo Park,37.811483,-122.260506,37.855783,-122.283127,Customer,2372.0
5349522,6E1E0148C8FDD082,classic_bike,2023-04-17 22:26:15,2023-04-17 22:40:21,Rockridge BART Station,San Pablo Park,37.844279,-122.251900,37.855783,-122.283127,Customer,846.0


In [36]:
df = df_completed_stations.append(df_existing_stations).copy()

In [37]:
del df_completed_stations
del df_existing_stations
del df_missing_stations

In [38]:
df.to_csv("baywheels_confirmed_stations.csv")

In [39]:
df.shape

(4857247, 16)

# Dropping Trips < X minutes duration, where start = end

X = 4

In [40]:
minutes = 4
minutes_not_same = 2

In [35]:
df_same_station = df[(df['start_station_name'] == df['end_station_name']) & 
                     (df['duration_sec'] > minutes*60)]


df_no_same_station = df[(df['start_station_name'] != df['end_station_name']) & 
                     (df['duration_sec'] > minutes_not_same*60)]



In [36]:
df_final = df_no_same_station.append(df_same_station).reset_index(drop=True)#

del df
del df_same_station
del df_no_same_station

In [37]:
df_final = create_datetime_features(df_final, "start_time")
df_final = create_datetime_features(df_final, "end_time")

In [38]:
df_final

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,end_station_name,start_station_latitude,start_station_longitude,end_station_latitude,end_station_longitude,...,demand_lag_1_h,demand_lag_2_h,demand_lag_24_h,duration_sec,start_station_approx,start_station_approx_distance,end_station_approx,end_station_approx_distance,start_time_minute,end_time_minute
0,357CDE244D24405B,electric_bike,2021-01-26 11:32:59,2021-01-26 11:38:21,3rd St at Townsend St,Parker Ave at McAllister St,37.760000,-122.410000,37.760000,-122.420000,...,,,,322.0,3rd St at Townsend St,257.580529,Parker Ave at McAllister St,133.097365,32,38
1,19A3E1F4211D0EE8,electric_bike,2021-01-26 14:16:37,2021-01-26 14:19:24,Powell St at Columbus Ave,3rd St at Townsend St,37.770000,-122.410000,37.760000,-122.410000,...,,,,167.0,Powell St at Columbus Ave,298.635347,3rd St at Townsend St,257.580529,16,19
2,27004D90ADC81AFF,electric_bike,2021-01-26 14:02:37,2021-01-26 14:06:35,Parker Ave at McAllister St,Powell St at Columbus Ave,37.760000,-122.420000,37.770000,-122.410000,...,,,,238.0,Parker Ave at McAllister St,133.097365,Powell St at Columbus Ave,298.635347,2,6
3,2F81FCA3D9CD056A,electric_bike,2021-01-26 15:03:05,2021-01-26 15:07:25,3rd St at Townsend St,Parker Ave at McAllister St,37.760000,-122.410000,37.760000,-122.420000,...,,,,260.0,3rd St at Townsend St,257.580529,Parker Ave at McAllister St,133.097365,3,7
4,B4F4680078748D61,electric_bike,2021-01-21 14:38:42,2021-01-21 14:43:26,3rd St at Townsend St,Parker Ave at McAllister St,37.760000,-122.410000,37.760000,-122.420000,...,,,,284.0,3rd St at Townsend St,257.580529,Parker Ave at McAllister St,133.097365,38,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3974760,D388BD824032A7B4,electric_bike,2022-12-23 12:19:24,2022-12-23 12:33:57,Eureka Valley Recreation Center,Eureka Valley Recreation Center,37.759218,-122.436877,37.759211,-122.436798,...,,,,873.0,,,,,19,33
3974761,E6C4D7A5C2710094,classic_bike,2022-12-23 12:10:58,2022-12-23 12:41:31,25th Ave at Irving St,25th Ave at Irving St,37.763443,-122.483554,37.763443,-122.483554,...,,,,1833.0,,,,,10,41
3974762,A0C55F0DF9CEAF55,classic_bike,2022-12-12 12:22:08,2022-12-12 12:27:01,Cyril Magnin St at Ellis St,Cyril Magnin St at Ellis St,37.785876,-122.408923,37.785876,-122.408923,...,,,,293.0,,,,,22,27
3974763,59AD8197AB3727AE,classic_bike,2022-12-04 17:05:26,2022-12-04 18:02:43,Parkmoor Ave at Race St,Parkmoor Ave at Race St,37.316736,-121.910005,37.316736,-121.910005,...,,,,3437.0,,,,,5,2


In [39]:
df_final.to_csv("baywheels_cleaned.csv")


In [40]:
standard_stations.to_csv("standard_stations.csv")

In [41]:
standard_stations

Unnamed: 0,station_name,station_latitude,station_longitude
0,Bestor Art Park,37.278118,-121.825770
1,Locust St at Grant St,37.302419,-121.868460
2,Willow St at Blewett Ave,37.308742,-121.900190
3,Bird Ave at Coe Ave,37.310122,-121.894399
4,Park Ave at Laurel Grove Ln,37.310755,-121.925801
...,...,...,...
529,North Berkeley BART Station,37.873558,-122.283093
530,Shattuck Ave at Hearst Ave,37.873676,-122.268487
531,Hearst Ave at Euclid Ave,37.875112,-122.260553
532,Virginia St at Shattuck Ave,37.876573,-122.269528


In [42]:
#count = 0
#for idx, i, j in zip(df_missing_stations.index, 
#                     df_missing_stations['start_station_latitude'], 
#                     df_missing_stations['start_station_longitude']):
#    approx_station, approx_distance = calculate_geodesic_distance(start_stations, 
#                        (i, j))
#    df_missing_stations.loc[idx,"start_station_approx"] = approx_station
#    df_missing_stations.loc[idx,"start_station_approx_distance"] = approx_distance
#    count += 1
#    if count%10000 == 0:
#        print(count)
#        df_missing_stations.to_csv("missing_stations.csv")

In [43]:
#
#import plotly.express as px
#fig = px.density_mapbox(df_initial.head(100000), lat='start_station_latitude', lon='start_station_longitude', radius=2,
#                        center=dict(lat=0, lon=180), zoom=0,
#                        mapbox_style="stamen-terrain")
#fig.update_geos(fitbounds="locations")
#fig.show()

In [44]:
#from sklearn.metrics.pairwise import haversine_distances
#
#points_in_radians = df_initial[['start_station_latitude','start_station_longitude']].head(10000).apply(np.radians).values
#distances_in_km = haversine_distances(points_in_radians) * 6371

In [45]:
#distance_matrix = distances_in_km

#clustering = DBSCAN(min_samples=2)
#scaler = MinMaxScaler()
#df = df_initial.copy()
#df[['start_station_latitude', 'start_station_longitude', 
#    'end_station_latitude', 'end_station_longitude']] = \
#                            scaler.fit_transform(df[['start_station_latitude', 'start_station_longitude', 
#                                                     'end_station_latitude', 'end_station_longitude']])
#clusters = clustering.fit_predict(df[['start_station_latitude', 'start_station_longitude']].head(10000))