In [1]:
# I/O
import pandas as pd
import numpy as np
import geopandas as gpd
import requests
from io import BytesIO
from zipfile import ZipFile

In [2]:
# Capital Bikeshare trip-data archive (202301 = January 2023), served from S3
url = "https://s3.amazonaws.com/capitalbikeshare-data/202301-capitalbikeshare-tripdata.zip"

In [3]:
# Download the zipped file into memory.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors (404, 5xx, ...) right here instead
# of as a confusing BadZipFile error when the error page is parsed as a zip.
response = requests.get(url, timeout=60)
response.raise_for_status()

# NOTE: this variable shadows the stdlib `zipfile` module name; the name is
# kept because the following cells refer to it.
zipfile = ZipFile(BytesIO(response.content))

In [4]:
# Show the name of every member stored in the downloaded archive
[member.filename for member in zipfile.infolist()]

['202301-capitalbikeshare-tripdata.csv',
 '__MACOSX/._202301-capitalbikeshare-tripdata.csv']

In [5]:
# Select the trip-data CSV explicitly instead of relying on the order of the
# archive listing: the zip also contains a macOS resource-fork entry under
# "__MACOSX/" whose name ends in ".csv" but is not real data.
csv_members = [
    name
    for name in zipfile.namelist()
    if name.lower().endswith(".csv") and not name.startswith("__MACOSX/")
]
if not csv_members:
    raise ValueError("No CSV data file found in the downloaded archive")
file_name = csv_members[0]

# Read the selected file into a pandas DataFrame
with zipfile.open(file_name) as file:
    df = pd.read_csv(file)

In [6]:
# Preview the first five rows to sanity-check the parsed columns
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,65F0ACD101BF0D49,classic_bike,2023-01-04 19:34:07,2023-01-04 19:39:29,East Falls Church Metro / Sycamore St & 19th St N,31904.0,W Columbia St & N Washington St,32609.0,38.885321,-77.156427,38.885621,-77.166917,member
1,D75158CE73DC43F0,classic_bike,2023-01-27 15:26:38,2023-01-27 19:21:36,Carroll & Westmoreland Ave,32025.0,Fenton St & Ellsworth Dr,32036.0,38.975,-77.01121,38.997033,-77.025608,member
2,33E85889625FF7CA,classic_bike,2023-01-05 20:44:38,2023-01-05 20:51:18,15th & L St NW,31276.0,Thomas Circle,31241.0,38.903649,-77.034918,38.9059,-77.0325,member
3,E1F055A1651F47A1,classic_bike,2023-01-03 17:45:14,2023-01-03 17:57:23,Hartland Rd & Harte Pl,32255.0,Merrifield Cinema & Merrifield Town Center,32235.0,38.878601,-77.222808,38.870093,-77.22997,member
4,88CC90CEEC298BAF,classic_bike,2023-01-03 05:18:46,2023-01-03 05:25:50,Merrifield Cinema & Merrifield Town Center,32235.0,Hartland Rd & Harte Pl,32255.0,38.870093,-77.22997,38.878601,-77.222808,member


# Exploratory Data Analysis  

Let's start by taking a look at the dataset's structure, and then create new variables within it.

In [7]:
# Summarize column dtypes and non-null counts (reveals which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204077 entries, 0 to 204076
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             204077 non-null  object 
 1   rideable_type       204077 non-null  object 
 2   started_at          204077 non-null  object 
 3   ended_at            204077 non-null  object 
 4   start_station_name  195428 non-null  object 
 5   start_station_id    195428 non-null  float64
 6   end_station_name    194680 non-null  object 
 7   end_station_id      194680 non-null  float64
 8   start_lat           204077 non-null  float64
 9   start_lng           204077 non-null  float64
 10  end_lat             203856 non-null  float64
 11  end_lng             203856 non-null  float64
 12  member_casual       204077 non-null  object 
dtypes: float64(6), object(7)
memory usage: 20.2+ MB


In [8]:
# Number of rides with no recorded start / end station.
# Computed from the data itself rather than hand-copied from the df.info()
# output, so the counts stay correct if a different month's file is loaded.
start_station_diff = int(df["start_station_name"].isna().sum())
end_station_diff = int(df["end_station_name"].isna().sum())

Start creating variables:
- Start and end dates
- Start and end times
- Convert station IDs from numbers to integer-like strings
- Create new GeoDataFrames containing the start and end locations, so that we can identify the neighborhoods in which rides started and ended

In [40]:
# Parse the timestamp columns once; every derived column below reuses the
# parsed values instead of calling pd.to_datetime again.
df["started_at"] = pd.to_datetime(df["started_at"])
df["ended_at"] = pd.to_datetime(df["ended_at"])

# Calendar date and clock time of each ride's start and end
df["started_at_date"] = df["started_at"].dt.date
df["started_at_time"] = df["started_at"].dt.time
df["ended_at_date"] = df["ended_at"].dt.date
df["ended_at_time"] = df["ended_at"].dt.time

# Ride duration as a Timedelta and as fractional minutes (vectorized; no row-wise apply)
df["duration"] = df["ended_at"] - df["started_at"]
df["duration_minutes"] = df["duration"].dt.total_seconds() / 60

# Station IDs are loaded as float64 because some rides have no station.
# A plain astype(str) would produce "31904.0" and the literal string "nan";
# going through the nullable Int64 dtype yields "31904" and keeps missing IDs
# as <NA>. The Int64 cast raises if an ID is not a whole number, which would
# flag unexpected data rather than silently mangling it.
for id_col in ("start_station_id", "end_station_id"):
    df[id_col] = df[id_col].astype("Int64").astype("string")

Create two new DataFrames:
1. Start stations: the station (name, ID, and coordinates) where each ride started, for rides that started at a station
2. End stations: the station (name, ID, and coordinates) where each ride ended, for rides that ended at a station

In [66]:
# Build one station table per trip end, keeping only rides that actually
# started/ended at a station, and give both tables the same column names.
# Method chaining returns new frames, avoiding rename(inplace=True) on a
# filtered slice (which can raise SettingWithCopyWarning) and making the cell
# safe to re-run.
start_stations = (
    df[["start_station_name", "start_station_id", "start_lat", "start_lng"]]
    .dropna(subset=["start_station_name"])
    .rename(columns={
        "start_station_name": "station_name",
        "start_station_id": "station_id",
        "start_lat": "lat",
        "start_lng": "lng",
    })
)

end_stations = (
    df[["end_station_name", "end_station_id", "end_lat", "end_lng"]]
    .dropna(subset=["end_station_name"])
    .rename(columns={
        "end_station_name": "station_name",
        "end_station_id": "station_id",
        "end_lat": "lat",
        "end_lng": "lng",
    })
)

# Stack both tables and keep one row per (name, id) pair; the first
# occurrence's coordinates are the ones retained.
stations = (
    pd.concat([start_stations, end_stations])
    .drop_duplicates(subset=["station_name", "station_id"])
)

In [68]:
# Turn each station's coordinates into a point (x = longitude, y = latitude)
# and attach them as the geometry column, tagged with the EPSG:4326 CRS.
station_points = gpd.points_from_xy(x=stations["lng"], y=stations["lat"])
stations_gdf = gpd.GeoDataFrame(
    stations,
    geometry=station_points,
    crs="EPSG:4326",
)