In [1]:
import glob
import time
import numpy as np
import pandas as pd
from geopy import distance
from multiprocessing import Process
from sklearn.preprocessing import MinMaxScaler



In [2]:
def create_datetime_features(input_df, column_name):

    input_df[column_name+'_hour'] = input_df[column_name].dt.hour
    input_df[column_name+'_minute'] = input_df[column_name].dt.minute
    input_df[column_name+'_quarter'] = input_df[column_name].dt.quarter
    input_df[column_name+'_month'] = input_df[column_name].dt.month
    input_df[column_name+'_year'] = input_df[column_name].dt.year
    input_df[column_name+'_week'] = input_df[column_name].dt.isocalendar().week
    input_df[column_name+'_day'] = input_df[column_name].dt.day
    input_df[column_name+'_dayofweek'] = input_df[column_name].dt.dayofweek
    input_df[column_name+'_season'] = (input_df[column_name].dt.month)%12 // 3 + 1

    return input_df

In [3]:
def calculate_geodesic_distance(df_stations, point):
    df = df_stations.copy()
    for idx, i, j  in zip(df.index, df.start_station_latitude, df.start_station_longitude):
        df.loc[idx, 'distance'] = distance.distance(point, (i,j)).m
        if int(df.loc[idx, 'distance']) == 0:
            
            return df.loc[idx, "start_station_name"], df.loc[idx, 'distance']
    df = df.sort_values("distance")
    return df.iloc[0]["start_station_name"], df.iloc[0]["distance"]

In [4]:
def get_approximate_stations_locations(df_missing_stations, start_stations, save_file, col ="start_station"):
    count = 0
    for idx, i, j in zip(df_missing_stations.index, 
                         df_missing_stations[f'{col}_latitude'], 
                         df_missing_stations[f'{col}_longitude']):
        
        approx_station, approx_distance = calculate_geodesic_distance(start_stations, 
                            (i, j))
        df_missing_stations.loc[idx,f"{col}_approx"] = approx_station
        df_missing_stations.loc[idx,f"{col}_approx_distance"] = approx_distance
        
        if count%1000 == 0:
            print(count)
            df_missing_stations.to_csv(f"{save_file}")
        count += 1
    df_missing_stations.to_csv(f"{save_file}")
    return df_missing_stations

In [5]:
def parallel_get_approximate_stations_locations(df_missing_stations, start_stations, 
                            save_file_suffix="approx_stations", batch_size=100000, sleep_time=120):

    ranges = np.arange(0, df_missing_stations.shape[0], 
                       batch_size, dtype=int)
    ranges = np.concatenate((ranges, [df_missing_stations.shape[0]]))
    x = 0
    processes = []
    for ii in range(1, len(ranges)):
        print(ranges[ii-1], ranges[ii])
        save_file = f"approximate_stations/{save_file_suffix}_{ii}.csv"
        partitioned_df = df_missing_stations.loc[ranges[ii-1]: 
                                    ranges[ii]].copy().reset_index(drop=True)    
        
        p = Process(target=get_approximate_stations_locations, 
                            args = (partitioned_df, start_stations, save_file))
        #get_approximate_stations_locations(partitioned_df, start_stations, save_file)
        x +=1
        p.start()
        print(x)
        processes.append(p)
        time.sleep(sleep_time)
    for thread in processes:
        thread.join()

# Importing and renaming columns

In [6]:
df_initial = pd.DataFrame()
for file in glob.glob("*.csv"):
    df_initial = df_initial.append(pd.read_csv(file).rename(columns={
        "started_at": "start_time",
        "ended_at": "end_time",
        "start_lat": "start_station_latitude",
        "start_lng": "start_station_longitude",
        "end_lat": "end_station_latitude",
        "end_lng": "end_station_longitude",
        "member_casual": "user_type"
    }))

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


# Finding Standard Stations

In [7]:
start_stations = df_initial[["start_station_name", 
                             'start_station_latitude', 
                             "start_station_longitude"]]\
.sort_values(['start_station_latitude', "start_station_longitude"])\
.drop_duplicates(subset=['start_station_latitude','start_station_longitude'])\
.dropna().drop_duplicates(subset=['start_station_name'])\
.query("start_station_latitude != 0").reset_index(drop=True).rename(
    columns={"start_station_name": "station_name", 
            'start_station_latitude': "station_latitude",
            "start_station_longitude": "station_longitude"})

In [8]:
end_stations = df_initial[["end_station_name", 
                           'end_station_latitude', 
                           "end_station_longitude"]]\
.sort_values(['end_station_latitude', "end_station_longitude"])\
.drop_duplicates(subset=['end_station_latitude','end_station_longitude'])\
.dropna().drop_duplicates(subset=['end_station_name'])\
.query("end_station_latitude != 0").reset_index(drop=True).rename(
    columns={"end_station_name": "station_name", 
            'end_station_latitude': "station_latitude",
            "end_station_longitude": "station_longitude"})

In [9]:
standard_stations = start_stations.append(end_stations).sort_values(
    ['station_latitude', "station_longitude"]).drop_duplicates(subset=['station_name'])

# Trip duration calculation

In [10]:
df_initial['start_time'] = pd.to_datetime(df_initial['start_time'])
df_initial['end_time'] = pd.to_datetime(df_initial['end_time'])
df_initial['duration_sec'] = (df_initial['end_time'] - df_initial['start_time'])/ pd.Timedelta(seconds=1)

df_initial = create_datetime_features(df_initial, "start_time")
df_initial = create_datetime_features(df_initial, "end_time")
df_initial['user_type'] = df_initial['user_type'].replace({"member":"Subscriber", "casual": "Customer"})

Removing 2020 data for irrelevancy 

In [11]:
df_reduced = df_initial.query("start_time_year > 2020").reset_index(drop=True)

# Dropping Trips < X minutes duration, where start = end

X = 4

In [12]:
def flagging_short_trips(x):
    return ((x['end_station_latitude'] == x['start_station_latitude']) &
        (x['end_station_longitude'] == x['start_station_longitude'])) \
            | (x['start_station_name'] == x['end_station_name'])
       

In [13]:
indexes = df_reduced.apply(flagging_short_trips, axis=1)

In [14]:
indexes.value_counts()

False    4428353
True      380284
dtype: int64

In [15]:
df_same_stations = df_reduced.loc[indexes]

In [16]:
minutes = 4

In [17]:
df_no_same_stations = df_reduced.loc[~indexes].append(
                        df_same_stations.query(f"duration_sec < {minutes*60}")).reset_index(drop=True)

In [18]:
df_reduced.head()

Unnamed: 0,duration_sec,start_time,end_time,start_station_id,start_station_name,start_station_latitude,start_station_longitude,end_station_id,end_station_name,end_station_latitude,...,end_time_minute,end_time_quarter,end_time_month,end_time_year,end_time_week,end_time_day,end_time_dayofweek,end_time_season,start_station_approx,start_station_approx_distance
0,322.0,2021-01-26 11:32:59,2021-01-26 11:38:21,,,37.76,-122.41,,,37.76,...,38,1,1,2021,4,26,1,1,,
1,167.0,2021-01-26 14:16:37,2021-01-26 14:19:24,,,37.77,-122.41,,,37.76,...,19,1,1,2021,4,26,1,1,,
2,238.0,2021-01-26 14:02:37,2021-01-26 14:06:35,,,37.76,-122.42,,,37.77,...,6,1,1,2021,4,26,1,1,,
3,260.0,2021-01-26 15:03:05,2021-01-26 15:07:25,,,37.76,-122.41,,,37.76,...,7,1,1,2021,4,26,1,1,,
4,434.0,2021-01-26 09:04:22,2021-01-26 09:11:36,,,37.79,-122.41,,,37.78,...,11,1,1,2021,4,26,1,1,,


In [22]:
start_station_in_list = df_reduced["start_station_name"].isin(start_stations['station_name'].dropna())
end_station_in_list = df_reduced["end_station_name"].isin(start_stations['station_name'].dropna())
df_missing_stations = df_reduced[(df_reduced["start_station_name"].isna()) |\
                                 (~ start_station_in_list) |\
                                 (~ end_station_in_list)].reset_index(drop=True)

# Lookup missing Station Names

Approximating missing stations

In [37]:
df_missing_start_stations_no_duplicates = df_missing_stations.drop_duplicates(
                                        subset=['start_station_latitude', 'start_station_longitude'])
df_missing_start_stations_no_duplicates = df_missing_start_stations_no_duplicates[
                                            df_missing_start_stations_no_duplicates['start_station_name'].isna()]

In [38]:
df_missing_end_stations_no_duplicates = df_missing_stations.drop_duplicates(
                                        subset=['end_station_latitude', 'end_station_longitude'])
df_missing_end_stations_no_duplicates = df_missing_end_stations_no_duplicates[
                                            df_missing_end_stations_no_duplicates['end_station_name'].isna()]

In [23]:
approximate_stations_df = get_approximate_stations_locations(df_missing_stations_no_duplicates, 
                                   start_stations, save_file="approximate_stations/approx_stations.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


0


In [24]:
approximate_stations_df = pd.read_csv("approximate_stations/approx_stations.csv") 

In [30]:
df_missing_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482050 entries, 0 to 1482049
Data columns (total 37 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   duration_sec                   1482050 non-null  float64       
 1   start_time                     1482050 non-null  datetime64[ns]
 2   end_time                       1482050 non-null  datetime64[ns]
 3   start_station_id               258 non-null      object        
 4   start_station_name             606 non-null      object        
 5   start_station_latitude         1482050 non-null  float64       
 6   start_station_longitude        1482050 non-null  float64       
 7   end_station_id                 701534 non-null   object        
 8   end_station_name               701983 non-null   object        
 9   end_station_latitude           1481957 non-null  float64       
 10  end_station_longitude          1481957 non-null  float

In [32]:
df_missing_stations = df_missing_stations.drop(columns=['start_station_approx', 
                                  'start_station_approx_distance']).merge(
    approximate_stations_df[["start_station_approx", 
                             'start_station_latitude',
                             'start_station_longitude',
                             'start_station_approx_distance']],
    on=['start_station_latitude', 'start_station_longitude'], how="left")

In [40]:
index = df_missing_stations['start_station_approx_distance'] < 500
df_missing_stations.loc[index, 'start_station_name'] = df_missing_stations.loc[index, 'start_station_approx']

In [42]:
df_missing_stations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482050 entries, 0 to 1482049
Data columns (total 37 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   duration_sec                   1482050 non-null  float64       
 1   start_time                     1482050 non-null  datetime64[ns]
 2   end_time                       1482050 non-null  datetime64[ns]
 3   start_station_id               258 non-null      object        
 4   start_station_name             1257124 non-null  object        
 5   start_station_latitude         1482050 non-null  float64       
 6   start_station_longitude        1482050 non-null  float64       
 7   end_station_id                 701534 non-null   object        
 8   end_station_name               701983 non-null   object        
 9   end_station_latitude           1481957 non-null  float64       
 10  end_station_longitude          1481957 non-null  float

In [25]:
#count = 0
#for idx, i, j in zip(df_missing_stations.index, 
#                     df_missing_stations['start_station_latitude'], 
#                     df_missing_stations['start_station_longitude']):
#    approx_station, approx_distance = calculate_geodesic_distance(start_stations, 
#                        (i, j))
#    df_missing_stations.loc[idx,"start_station_approx"] = approx_station
#    df_missing_stations.loc[idx,"start_station_approx_distance"] = approx_distance
#    count += 1
#    if count%10000 == 0:
#        print(count)
#        df_missing_stations.to_csv("missing_stations.csv")

KeyboardInterrupt: 

In [None]:
start_stations_df = df_missing_stations.apply(lambda x:
                        calculate_geodesic_distance(start_stations, 
                        (x['start_station_latitude'], x['start_station_longitude'])), axis=1).apply(lambda x: 
                                                            [x[0], x[1]]).apply(pd.Series)

In [None]:
end_stations_df = df_missing_stations.apply(lambda x:
                        calculate_geodesic_distance(start_stations, 
                        (x['end_station_latitude'], x['end_station_longitude'])), axis=1).apply(lambda x: 
                                                            [x[0], x[1]]).apply(pd.Series)

In [None]:
df_temp['approximate_start_station_distance_meters'].value_counts()

In [149]:

station = (37.309014, -121.900011)
calculate_geodesic_distance(start_stations, station)
#distance.distance(newport_ri, cleveland_oh).m

('Willow St at Blewett Ave', 0.027421055809842915)

In [137]:
start_stations#[]

Unnamed: 0,start_station_name,start_station_latitude,start_station_longitude
10207,Willow St at Blewett Ave,37.309014,-121.900011
16446,Bird Ave at Willow St,37.311284,-121.896325
44347,Bird Ave at Coe Ave,37.315158,-121.897833
22693,Parkmoor Ave at Race St,37.316736,-121.910005
15027,Palm St at Willow St,37.317298,-121.884995
...,...,...,...
51,North Berkeley BART Station,37.873558,-122.283093
27,Shattuck Ave at Hearst Ave,37.873676,-122.268487
467,Hearst Ave at Euclid Ave,37.875112,-122.260553
497,Virginia St at Shattuck Ave,37.876573,-122.269528


In [117]:
df_initial[['end_station_latitude', "end_station_longitude"]].drop_duplicates()

Unnamed: 0,end_station_latitude,end_station_longitude
0,37.804272,-122.433537
1,37.767037,-122.415443
2,37.808848,-122.249680
4,37.802746,-122.413579
5,37.773793,-122.421239
...,...,...
209872,37.794464,-122.394773
210993,37.763814,-122.412995
237549,37.780761,-122.411985
265471,37.768330,-122.453021


In [120]:
for i in df_initial['start_time_year'].unique():
    for j in df_initial['start_time_quarter'].unique():
        print(i, j)
        df_temp = df_initial.query(f'(start_time_quarter=={j}) & (start_time_year=={i})')
        print(df_temp.info())
        print(df_temp.dropna().shape)
        print()

2020 1
<class 'pandas.core.frame.DataFrame'>
Int64Index: 903275 entries, 0 to 176798
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             903275 non-null  float64       
 1   start_time               903275 non-null  datetime64[ns]
 2   end_time                 903275 non-null  datetime64[ns]
 3   start_station_id         417984 non-null  object        
 4   start_station_name       419208 non-null  object        
 5   start_station_latitude   903275 non-null  float64       
 6   start_station_longitude  903275 non-null  float64       
 7   end_station_id           418507 non-null  object        
 8   end_station_name         419763 non-null  object        
 9   end_station_latitude     903275 non-null  float64       
 10  end_station_longitude    903275 non-null  float64       
 11  user_type                903275 non-null  object        
 12  start_tim

<class 'pandas.core.frame.DataFrame'>
Int64Index: 514137 entries, 0 to 198492
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             514137 non-null  float64       
 1   start_time               514137 non-null  datetime64[ns]
 2   end_time                 514137 non-null  datetime64[ns]
 3   start_station_id         393880 non-null  object        
 4   start_station_name       393879 non-null  object        
 5   start_station_latitude   514137 non-null  float64       
 6   start_station_longitude  514137 non-null  float64       
 7   end_station_id           381871 non-null  object        
 8   end_station_name         381871 non-null  object        
 9   end_station_latitude     513530 non-null  float64       
 10  end_station_longitude    513530 non-null  float64       
 11  user_type                514137 non-null  object        
 12  start_time_quart

(533211, 26)

2022 3
<class 'pandas.core.frame.DataFrame'>
Int64Index: 782136 entries, 0 to 273478
Data columns (total 26 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   duration_sec             782136 non-null  float64       
 1   start_time               782136 non-null  datetime64[ns]
 2   end_time                 782136 non-null  datetime64[ns]
 3   start_station_id         670546 non-null  object        
 4   start_station_name       671213 non-null  object        
 5   start_station_latitude   782136 non-null  float64       
 6   start_station_longitude  782136 non-null  float64       
 7   end_station_id           653645 non-null  object        
 8   end_station_name         654309 non-null  object        
 9   end_station_latitude     781434 non-null  float64       
 10  end_station_longitude    781434 non-null  float64       
 11  user_type                782136 non-null  object        


In [103]:
df_initial['bike_id'].dropna()

0          13052.0
1          12235.0
2          12822.0
3          11705.0
4           3673.0
            ...   
176794    999960.0
176795    999960.0
176796    633095.0
176797    633095.0
176798    633095.0
Name: bike_id, Length: 905007, dtype: float64

In [56]:
df_initial[df_initial['end_station_name'].isnull()]['rideable_type'].value_counts()

electric_bike    1235850
classic_bike        6579
docked_bike         1705
Name: rideable_type, dtype: int64

In [51]:
df_initial['rideable_type'].value_counts()

electric_bike    3562253
classic_bike     1365716
docked_bike       366175
Name: rideable_type, dtype: int64

In [63]:

import plotly.express as px
fig = px.density_mapbox(df_initial.head(100000), lat='start_station_latitude', lon='start_station_longitude', radius=2,
                        center=dict(lat=0, lon=180), zoom=0,
                        mapbox_style="stamen-terrain")
fig.update_geos(fitbounds="locations")
fig.show()

In [78]:
from sklearn.metrics.pairwise import haversine_distances

points_in_radians = df_initial[['start_station_latitude','start_station_longitude']].head(10000).apply(np.radians).values
distances_in_km = haversine_distances(points_in_radians) * 6371

MemoryError: Unable to allocate 74.5 GiB for an array with shape (100000, 100000) and data type float64

In [85]:
#distance_matrix = distances_in_km

clustering = DBSCAN(min_samples=2)
scaler = MinMaxScaler()
df = df_initial.copy()
df[['start_station_latitude', 'start_station_longitude', 
    'end_station_latitude', 'end_station_longitude']] = \
                            scaler.fit_transform(df[['start_station_latitude', 'start_station_longitude', 
                                                     'end_station_latitude', 'end_station_longitude']])
clusters = clustering.fit_predict(df[['start_station_latitude', 'start_station_longitude']].head(10000))

In [87]:
set(clusters)

{0}

In [77]:
len(m)

10000

In [None]:
clustering.