In [1]:
# dependencies
import pandas as pd
import os
from datetime import datetime

# Rides per Hour Data for 2021
## Files are too large for Tableau
1. create a function to return rides/hour data from the larger sets
2. function iterates through each month in the selected years
3. function yields a limited number of columns
4. function concatenates the limited data into one df
6. export to csv
7. load into Tableau as a manageable filesize

In [2]:
# file directory to import from
# only looking at 2021 for this report
data_files = os.listdir('monthly_data/2021_monthly_files/')
data_files

['202105-citibike-tripdata.csv',
 '202107-citibike-tripdata.csv',
 '202101-citibike-tripdata.csv',
 '202103-citibike-tripdata.csv',
 '202104-citibike-tripdata.csv',
 '202106-citibike-tripdata.csv',
 '202102-citibike-tripdata.csv']

In [5]:
# date formats changed in 2021, so there are two data parsers
def new_date_parser(x): return datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
def old_date_parser(x): return datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f")

# columns changed in 2021, these are the new columns - use to rename old columns with new names
map_old_cols = [
    'trip_duration',
    'started_at',
    'ended_at',
    'start_station_id',
    'start_station_name',
    'start_lat',
    'start_lng',
    'end_station_id',
    'end_station_name',
    'end_lat',
    'end_lng',
    'bike_id',
    'member_casual',
    'birth_year',
    'gender'
]

# function to import data from csv files and union all files
def load_files(data_files):
    for file in data_files:
        if file < "202102": 
            yield (
                pd.read_csv('monthly_data/2021_monthly_files/'+file,
                    names=map_old_cols,
                    header=0,
                    index_col=False,
                    usecols=['started_at'],
                    parse_dates=['started_at'],
                    date_parser=old_date_parser
                ) 
            )
        else:
            yield (
                pd.read_csv('monthly_data/2021_monthly_files/'+file,
                    usecols=['started_at'],
                    parse_dates=['started_at'],
                    date_parser=new_date_parser
                ) 
            )


def hours_data(agg_data):
    for data in agg_data:
        data['ride_month'] = data['started_at'].dt.month
        data['ride_day'] = data["started_at"].dt.day
        data['ride_weekday'] = data["started_at"].dt.dayofweek
        data['ride_hour'] = data["started_at"].dt.hour
        yield(
            data.groupby([data['ride_month'], data['ride_day'], data['ride_weekday'], data['ride_hour']]).size()
            .groupby(level= [0, 2, 3]).mean().reset_index().rename(columns={0: 'avg_hour_ride_count'})
            # .groupby(level= [0, 1, 2]).mean().reset_index().rename(columns={0: 'avg_hour_ride_count'})
            # data.groupby([data['ride_month'], data['ride_day'], data['ride_hour']]).size().groupby(level= [0, 1, 2]).mean().reset_index().rename(columns={0: 'avg_hour_ride_count'})
            # data.groupby([data['ride_month'], data['ride_day'], data['ride_hour']])['starttime'].count().reset_index(name="hourly_ride_count")
        )

merged_data = pd.concat(hours_data(load_files(data_files)))


In [6]:
merged_data.head(50)

Unnamed: 0,ride_month,ride_weekday,ride_hour,avg_hour_ride_count
0,5,0,0,721.8
1,5,0,1,370.0
2,5,0,2,220.0
3,5,0,3,150.2
4,5,0,4,154.2
5,5,0,5,482.0
6,5,0,6,1474.2
7,5,0,7,2943.4
8,5,0,8,4628.8
9,5,0,9,3679.8


In [7]:
# save to csv
merged_data.to_csv('annual_data/2021_hours_data.csv', index = False)