<a href="https://colab.research.google.com/github/DonRomaniello/CitibikeDocks/blob/master/TripData_Clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install boto3

import boto3
from botocore import UNSIGNED
from botocore.client import Config
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from google.colab import drive

In [None]:
def readDirtyZip(dirtyZipUrl):
  """Download a zip archive and parse its first usable CSV member.

  Member names containing "._" are skipped (these look like macOS
  resource-fork entries -- TODO confirm) and only names containing ".csv"
  are considered. The first remaining member, in archive order, is parsed.

  Args:
    dirtyZipUrl: URL of the zip archive to download.

  Returns:
    pandas.DataFrame read from the selected CSV member.

  Raises:
    requests.HTTPError: if the server answers with an error status.
    IndexError: if the archive contains no usable CSV member.
  """
  response = requests.get(dirtyZipUrl)
  # Fail on HTTP errors here instead of handing an error page to ZipFile.
  response.raise_for_status()

  # Context managers close the archive and the member stream even if parsing fails.
  with ZipFile(BytesIO(response.content), 'r') as dirtyZipFile:
    csvMembers = [cleanFilename for cleanFilename in dirtyZipFile.namelist()
                  if "._" not in cleanFilename and ".csv" in cleanFilename]
    if not csvMembers:
      # Same exception type the bare [0] lookup raised, now with a usable message.
      raise IndexError("no CSV member found in archive: " + str(dirtyZipUrl))
    with dirtyZipFile.open(csvMembers[0]) as csvStream:
      tripData = pd.read_csv(csvStream, low_memory=False)

  return tripData

In [1]:
def legacyCheckFix(s3URL):
  """Load one trip archive and return only the columns named by legacyColumnRename's values.

  A file is treated as legacy-format when it has a 'start station id' column.
  For those files both station-id columns are translated through
  stationNameDictionary (ids missing from the dictionary become NaN) and the
  columns are renamed via legacyColumnRename. Files without that column are
  assumed to already carry the target column names.

  Args:
    s3URL: URL of the zipped trip CSV, passed straight to readDirtyZip.

  Returns:
    DataFrame restricted to the target columns, with every row that has a
    missing value in any of them removed.
  """
  trips = readDirtyZip(s3URL)

  isLegacyFormat = 'start station id' in trips.columns
  if isLegacyFormat:
    for idColumn in ('start station id', 'end station id'):
      trips[idColumn] = trips[idColumn].map(stationNameDictionary)
    trips = trips.rename(columns=legacyColumnRename)

  wantedColumns = list(legacyColumnRename.values())
  return trips[wantedColumns].dropna()

In [None]:
class fileListUrls:
  """An inclusive month range plus helpers that derive archive URLs and an output path from it.

  The month and year values are stored exactly as given and are only ever
  passed through str(), so ints and numeric strings are both accepted.
  """

  def __init__(self, startMonth, startYear, endMonth, endYear):
    self.startMonth, self.startYear = startMonth, startYear
    self.endMonth, self.endYear = endMonth, endYear

  def tripURLs(self):
    """Return one download URL per month in the range, last listed match first.

    Keys come from a single list_objects call on the 'tripdata' bucket using
    the module-level s3 client. For each month (formatted as YYYYMM) only the
    first key starting with that prefix is used; the month is then retired so
    later keys with the same prefix are ignored. The collected list is
    returned in reverse of listing order.

    NOTE(review): there is no pagination here -- confirm the bucket listing
    fits in one list_objects response.
    """
    rangeStart = '{}-{}'.format(self.startYear, self.startMonth)
    rangeEnd = '{}-{}'.format(self.endYear, self.endMonth)
    pendingMonths = pd.date_range(rangeStart, rangeEnd, freq='MS').strftime("%Y%m").tolist()

    matchedUrls = []
    for bucketEntry in s3.list_objects(Bucket='tripdata')['Contents']:
      objectKey = bucketEntry['Key']
      # All prefixes are six characters long, so a key can match at most one month.
      hitMonth = next((month for month in pendingMonths if objectKey.startswith(month)), None)
      if hitMonth is not None:
        matchedUrls.append('https://s3.amazonaws.com/tripdata/' + objectKey)
        pendingMonths.remove(hitMonth)

    return matchedUrls[::-1]

  def nameForCsv(self):
    """Return the output path under /drive/MyDrive/ named after the month range.

    Months are zero-padded to two digits.

    NOTE(review): the suffix is 'csv.xz' with no dot before 'csv'; it is kept
    unchanged because other code may already read files at this path.
    """
    startStamp = str(self.startYear) + str(self.startMonth).zfill(2)
    endStamp = str(self.endYear) + str(self.endMonth).zfill(2)
    return '/drive/MyDrive/' + startStamp + '-' + endStamp + 'csv.xz'

In [None]:
# Build stationNameDictionary: integer legacy_id -> short_name, taken from the
# live station_information feed.
stationFeed = requests.get('https://gbfs.citibikenyc.com/gbfs/en/station_information.json').json()
stationTable = pd.DataFrame(stationFeed['data']['stations'])

# Keep only stations whose short_name contains no ASCII letters. The explicit
# "== False" comparison also drops rows where the match result is missing.
lettersAbsent = stationTable['short_name'].str.contains('[a-zA-Z]+', regex=True) == False
numericStations = stationTable[lettersAbsent]

stationNameDictionary = dict(zip(numericStations.legacy_id.astype('int64'), numericStations.short_name))

# Only the finished lookup stays in the notebook namespace.
del stationFeed, stationTable, lettersAbsent, numericStations

In [None]:
# Legacy CSV column name -> current column name. The values of this mapping are
# also the columns that legacyCheckFix keeps.
legacyColumnRename = {
  'starttime': 'started_at',
  'stoptime': 'ended_at',
  'start station id': 'start_station_id',
  'end station id': 'end_station_id',
}

# S3 client configured with unsigned requests (no credentials are supplied).
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

# Mount Google Drive at /drive; nameForCsv builds its path under this mount point.
drive.mount('/drive')

In [None]:
# Prompt for the inclusive month range. Arguments are evaluated left to right,
# so the prompts appear in the order written.
urlRange = fileListUrls(
  input("Start month (integer): "),
  input("Start year: "),
  input("End month (integer, inclusive): "),
  input("End year:"),
)

csvUrls = urlRange.tripURLs()
if not csvUrls:
  # Stop with a clear message instead of an IndexError on csvUrls[0].
  raise ValueError("No trip archives matched the requested month range; nothing to write.")

# The output path depends only on the range, so compute it once.
outputPath = urlRange.nameForCsv()

for position, url in enumerate(csvUrls):
  hotTrips = legacyCheckFix(url)
  if position == 0:
    # The first archive creates (or overwrites) the file and writes the header row.
    print("Writing first CSV...")
    hotTrips.to_csv(outputPath)
  else:
    # Later archives are appended without repeating the header.
    print('Appending...')
    hotTrips.to_csv(outputPath, mode='a', header=False)
  # Drop the frame before loading the next month so only one is held at a time.
  del hotTrips

Start month (integer): 2
Start year: 2020
End month (integer, inclusive): 4
End year:2020
Writing first CSV...
Appending...
Appending...
