In [98]:
# ---- LIBRARY IMPORTS ----
import pandas as pd
import networkx as nx
import datetime

In [99]:
# ---- Reading in datasets ----
# Load CSV columns at positions 1 through 13; position 0 is not read.
usecols = list(range(1, 14))

# Type every named column as a pandas "string" first, then override the four
# coordinate columns with float.
dtype = dict.fromkeys(
    ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name',
     'start_station_id', 'end_station_name', 'end_station_id', 'start_lat',
     'start_lng', 'end_lat', 'end_lng', 'member_casual'],
    "string",
)
dtype.update(dict.fromkeys(['start_lat', 'start_lng', 'end_lat', 'end_lng'], float))

df = pd.read_csv('data/202407-citibike-tripdata_5.csv', usecols=usecols, dtype=dtype)

# Data Cleansing
The following operations are required for data cleaning:

1. Encode `rideable_type` and `member_casual`
2. Index `start_station_id` and `end_station_id`
3. Create fields for `start_date`, `start_time`, `end_date`, `end_time`
4. Create field for trip duration
5. Create a network data structure using `networkx`
6. Compute the distance between stations using the [Manhattan distance](https://www.datacamp.com/tutorial/manhattan-distance)

## Encoding Binary Variables
The `rideable_type` and `member_casual` fields will be dummy-encoded with pandas' [`get_dummies`](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html) function.

In [100]:
# Replace the two categorical columns with integer indicator columns.
# drop_first=True omits the indicator for the first category of each column.
# NOTE: this rebinds `df`; re-running the cell without reloading the data
# fails because the source columns have already been replaced.
encoded_columns = ['rideable_type', 'member_casual']
df = pd.get_dummies(df, columns=encoded_columns, dtype="int", drop_first=True)

## Indexing Stations
We want to index the station IDs so that each station has a unique identifier. This will allow us to create nodes for our network data structure. We first build a dictionary whose keys are the station IDs and whose values are their indexes, then map those indexes into our dataframe.

In [101]:
# Build a lookup from each distinct station ID to its position in the
# de-duplicated list of start and end station IDs.
start_stations = df['start_station_id']
end_stations = df['end_station_id']

# Stack start IDs on top of end IDs, keep the first occurrence of each value,
# and renumber the survivors from 0 (ignore_index=True).
all_stations = pd.concat([start_stations, end_stations]).drop_duplicates(ignore_index=True)

# NOTE(review): a missing station ID, if present in the data, is not filtered
# out here and would become a key as well -- confirm whether that is intended.
station_dict = dict(zip(all_stations, all_stations.index))