In [1]:
import pandas as pd
import os
from datetime import datetime

In [2]:
# Years to process; each value is used as the prefix of a
# 'monthly_data/<year>_monthly_files' directory name.
search_year = [
    "2019",
    "2020",
    "2021",
]

In [3]:
# Build `files` as a list of [year, sorted_file_names] pairs, one per entry in
# `search_year`.
files = []
for year in search_year:
    year_dir = os.path.join('monthly_data', year + '_monthly_files')
    # os.listdir returns every directory entry, including hidden files and
    # sub-directories (e.g. .DS_Store, .ipynb_checkpoints). Keep only regular,
    # non-hidden files so nothing unreadable is handed to pd.read_csv later.
    month_files = sorted(
        name
        for name in os.listdir(year_dir)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(year_dir, name))
    )
    files.append([year, month_files])


In [4]:
# The trip-file columns changed in 2021. These are the new-style column names,
# applied positionally to the older files (passed as `names=` to read_csv), so
# the order here must match the column order of those older files.
map_old_cols = [
    # trip timing
    'trip_duration', 'started_at', 'ended_at',
    # start station
    'start_station_id', 'start_station_name', 'start_lat', 'start_lng',
    # end station
    'end_station_id', 'end_station_name', 'end_lat', 'end_lng',
    # bike and rider
    'bike_id', 'member_casual', 'birth_year', 'gender',
]

# generator that reads the start-station columns of each monthly csv file,
# one DataFrame per file


def load_files(files):
    """Yield one DataFrame of start-station columns per monthly file.

    Parameters
    ----------
    files : iterable
        Items whose element 0 is the year string used in the directory name
        and whose element 1 is an iterable of file names in that directory.

    Yields
    ------
    pandas.DataFrame
        The 'start_station_id', 'start_station_name', 'start_lat' and
        'start_lng' columns of one file, with the station id read as str.

    Notes
    -----
    File names that compare (as strings) below "202102" are read with their
    header row replaced by ``map_old_cols``; all other files are read using
    the header they already have.
    """
    for entry in files:
        year = entry[0]
        month_files = entry[1]
        folder = 'monthly_data/' + year + '_monthly_files/'

        for file_name in month_files:
            read_kwargs = {
                'dtype': {'start_station_id': str,
                          'end_station_id': str},
                'usecols': ['start_station_id', 'start_station_name',
                            'start_lat', 'start_lng'],
            }

            if file_name < "202102":
                # Older layout: discard the file's own header (header=0) and
                # assign the new-style names positionally.
                read_kwargs['names'] = map_old_cols
                read_kwargs['header'] = 0
                read_kwargs['index_col'] = False

            yield pd.read_csv(folder + file_name, **read_kwargs)


# Union the per-file frames produced by the generator into a single DataFrame.
station_data = pd.concat(objs=load_files(files))


In [5]:
# Distinct (station id, station name) pairs. size() adds a count column that
# reset_index() labels 0; it is dropped again after the merge below.
grouped_id_name = (
    station_data
    .groupby(['start_station_id', 'start_station_name'])
    .size()
    .reset_index()
)

# One coordinate pair per station name.
# NOTE(review): max() is applied to start_lat and start_lng independently, so
# the resulting pair need not come from a single source row - confirm this is
# the intended way to pick a representative location.
grouped_lat_lng = (
    station_data
    .groupby(['start_station_name'])[['start_lat', 'start_lng']]
    .max()
    .reset_index()
)

# Attach the coordinates to every (id, name) pair and discard the count column.
clean_station_list = (
    grouped_id_name
    .merge(grouped_lat_lng, how='left', on=['start_station_name'])
    .drop(columns=[0])
)


In [9]:
# Display the merged station list: one row per (station id, station name) pair,
# so a station id can still appear on more than one row at this point.
clean_station_list

Unnamed: 0,start_station_id,start_station_name,start_lat,start_lng
0,116,W 17 St & 8 Ave,40.743000,-74.001000
1,119,Park Ave & St Edwards St,40.696089,-73.978034
2,120,Lexington Ave & Classon Ave,40.686768,-73.959281
3,127,Barrow St & Hudson St,40.731724,-74.006744
4,128,MacDougal St & Prince St,40.727103,-74.002970
...,...,...,...,...
2859,8841.03,W Mosholu Pkwy S & Sedgwick Ave,40.882260,-73.887020
2860,Lab - NYC,Prototype Lab,40.754530,-73.996360
2861,MTL-ECO51-1,MTL-ECO51-1,40.796170,-73.951335
2862,SYS033,Pier 40 X2,40.728487,-74.011693


In [10]:
# Keep a single row per station id (the first one encountered) and renumber
# the index from zero.
clean_station_list = (
    clean_station_list
    .drop_duplicates(subset=['start_station_id'], keep='first')
    .reset_index(drop=True)
)
clean_station_list

Unnamed: 0,start_station_id,start_station_name,start_lat,start_lng
0,116,W 17 St & 8 Ave,40.743000,-74.001000
1,119,Park Ave & St Edwards St,40.696089,-73.978034
2,120,Lexington Ave & Classon Ave,40.686768,-73.959281
3,127,Barrow St & Hudson St,40.731724,-74.006744
4,128,MacDougal St & Prince St,40.727103,-74.002970
...,...,...,...,...
2819,8841.03,W Mosholu Pkwy S & Sedgwick Ave,40.882260,-73.887020
2820,Lab - NYC,Prototype Lab,40.754530,-73.996360
2821,MTL-ECO51-1,MTL-ECO51-1,40.796170,-73.951335
2822,SYS033,Pier 40 X2,40.728487,-74.011693


In [11]:
# Persist the de-duplicated station list without the DataFrame index.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there).
os.makedirs('annual_data', exist_ok=True)
clean_station_list.to_csv('annual_data/station_data.csv', index=False)