In [1]:
# dependencies
import pandas as pd
import os
# import re
# from functools import reduce
from datetime import datetime


In [2]:
# Year of the monthly csv files to process. The value is used to build the
# input folder name ('monthly_data/<year>_monthly_files') and the exported
# file name ('annual_data/<year>_station_data.csv').
search_year = "2019"

In [3]:
# folder holding the monthly trip files for the selected year
monthly_dir = 'monthly_data/' + search_year + '_monthly_files'

# os.listdir returns every entry in the folder, including hidden ones such as
# .DS_Store or .ipynb_checkpoints. Those are skipped so that only real data
# files are later passed to pd.read_csv. Names are sorted so the months are
# processed in order.
data_files = sorted(
    name for name in os.listdir(monthly_dir)
    if not name.startswith('.')
)

# Files are too large for Tableau
1. Create one function that aggregates trip starts per station and one that aggregates trip ends per station.
2. Each function iterates through the monthly files.
3. Each function yields only a limited number of columns.
4. The limited data from each function is concatenated into one DataFrame.
5. Merge the two DataFrames.
6. Export the result to CSV.
7. Load the CSV into Tableau as a manageable file size.

In [4]:
# date formats changed in 2021, so there are two date parsers
def new_date_parser(x):
    """Parse a timestamp string in the 2021+ layout (no fractional seconds)."""
    current_format = "%Y-%m-%d %H:%M:%S"
    return datetime.strptime(x, current_format)
def old_date_parser(x):
    """Parse a timestamp string in the pre-2021 layout (with fractional seconds)."""
    legacy_format = "%Y-%m-%d %H:%M:%S.%f"
    return datetime.strptime(x, legacy_format)

# Column names changed in 2021. This list holds the new-style names that are
# applied to the older files: load_files passes it as `names=` together with
# `header=0`, so the original header row is discarded and these names are
# assigned to the columns by position.
# NOTE(review): the order must match the column order of the pre-2021 csv
# files - confirm against a sample file before editing this list.
map_old_cols = [
    'trip_duration',
    'started_at',
    'ended_at',
    'start_station_id',
    'start_station_name',
    'start_lat',
    'start_lng',
    'end_station_id',
    'end_station_name',
    'end_lat',
    'end_lng',
    'bike_id',
    'member_casual',
    'birth_year',
    'gender'
]

# generator function to import data from csv files, one DataFrame per file
def load_files(data_files, data_dir=None):
    """Yield one column-limited DataFrame for each monthly trip file.

    Parameters
    ----------
    data_files : iterable of str
        File names (not full paths). A name that sorts before "202102" is
        read with the pre-February-2021 layout, i.e. its header is replaced
        by ``map_old_cols``; any other name is read with its own header.
    data_dir : str, optional
        Folder that contains the files. Defaults to
        ``'monthly_data/<search_year>_monthly_files'``.

    Yields
    ------
    pandas.DataFrame
        The selected columns of one file, with ``started_at`` / ``ended_at``
        converted to datetimes and the station ids kept as strings.
    """
    if data_dir is None:
        data_dir = 'monthly_data/' + search_year + '_monthly_files'

    # station ids are read as text so they are not coerced to numbers
    station_id_dtypes = {'start_station_id': str, 'end_station_id': str}

    for file in data_files:
        path = os.path.join(data_dir, file)

        # if file is older than February 2021
        if file < "202102":
            trips = pd.read_csv(
                path,
                names=map_old_cols,
                header=0,
                index_col=False,
                dtype=station_id_dtypes,
                usecols=['started_at', 'ended_at', 'start_station_id', 'end_station_id'],
            )
            # pre-2021 timestamps carry fractional seconds
            timestamp_format = "%Y-%m-%d %H:%M:%S.%f"
        else:
            # NOTE(review): the name/lat/lng columns are kept as in the
            # original selection, although the aggregations in this notebook
            # only use the timestamps and station ids.
            trips = pd.read_csv(
                path,
                dtype=station_id_dtypes,
                usecols=['started_at', 'ended_at', 'start_station_id', 'start_station_name', 'start_lat', 'start_lng', 'end_station_id', 'end_lat', 'end_lng'],
            )
            timestamp_format = "%Y-%m-%d %H:%M:%S"

        # read_csv's `date_parser` argument is deprecated since pandas 2.0 and
        # parses row by row; converting after the read with an explicit
        # format works on every pandas version and is vectorised.
        for column in ('started_at', 'ended_at'):
            trips[column] = pd.to_datetime(trips[column], format=timestamp_format)

        yield trips

# generator function to count departures per start date and start station
def starts_data(agg_data):
    """Yield, for each DataFrame in ``agg_data``, the number of trips per
    (start date, start station id) in a column named ``departures``."""
    for monthly_trips in agg_data:
        start_day = monthly_trips.started_at.dt.date
        origin = monthly_trips['start_station_id']
        per_station_day = monthly_trips.groupby([start_day, origin])['start_station_id']
        yield per_station_day.count().reset_index(name="departures")


def ends_data(agg_data):
    """Yield, for each DataFrame in ``agg_data``, the number of trips per
    (date, end station id) in a column named ``arrivals``."""
    for monthly_trips in agg_data:
        # NOTE(review): arrivals are grouped by the trip's *start* date
        # (started_at), not by ended_at, so a trip that crosses midnight is
        # counted on the day it began. The date column keeps the name
        # 'started_at', which the later merge relies on - confirm this is
        # the intended definition of an arrival date.
        trip_day = monthly_trips.started_at.dt.date
        destination = monthly_trips['end_station_id']
        per_station_day = monthly_trips.groupby([trip_day, destination])['end_station_id']
        yield per_station_day.count().reset_index(name="arrivals")

# Read every monthly file once and feed the same frame to both aggregations.
# Calling load_files separately for departures and arrivals would parse each
# csv twice.
starts_parts = []
ends_parts = []
for monthly_trips in load_files(data_files):
    starts_parts.extend(starts_data([monthly_trips]))
    ends_parts.extend(ends_data([monthly_trips]))

# union the monthly aggregates; pd.concat already returns a DataFrame
station_starts = pd.concat(starts_parts)
station_ends = pd.concat(ends_parts)


In [5]:
# left-join the arrival counts onto the departure counts by date and station id
# NOTE(review): how='left' keeps only (date, station) pairs that have at least
# one departure. Pairs with arrivals but no departures are dropped, and pairs
# without arrivals get NaN - confirm this is intended.
merged_data = (
    station_starts
    .merge(
        station_ends,
        how='left',
        left_on=['started_at', 'start_station_id'],
        right_on=['started_at', 'end_station_id'],
    )
    .drop(columns=['end_station_id'])
)
merged_data.shape

(286445, 4)

In [6]:
# to_csv does not create missing folders, so make sure the output folder
# exists before writing the yearly station file
os.makedirs('annual_data', exist_ok=True)
merged_data.to_csv('annual_data/' + search_year + '_station_data.csv', index = False)