In [1]:
import csv
import sys

import numpy as np
import pandas as pd

In [2]:
# Monthly trip-data CSV files to process, in calendar order.
filenames = [
    '201706-citibike-tripdata.csv',
    '201707-citibike-tripdata.csv',
    '201708-citibike-tripdata.csv',
]
# Destination CSV for each month; paired positionally with `filenames`.
outputs = [
    'June.csv',
    'July.csv',
    'August.csv',
]

def to_hour(starttime):
    """Return the hour field of a timestamp string, e.g. '13'.

    Parameters
    ----------
    starttime : str
        Timestamp laid out as 'YYYY-MM-DD HH:MM:SS'; a trailing
        fractional-second part is tolerated.

    Returns
    -------
    str
        The hour exactly as written in the input (zero-padded if the
        input is), not converted to an integer.
    """
    # Parse by field rather than by fixed character offsets so the result
    # does not depend on how long the seconds part is.
    time_part = starttime.split()[1]
    return time_part.split(':')[0]

def to_day(starttime):
    """Return the day-of-month field of a timestamp string, e.g. '01'.

    Parameters
    ----------
    starttime : str
        Timestamp laid out as 'YYYY-MM-DD HH:MM:SS'; a trailing
        fractional-second part is tolerated.

    Returns
    -------
    str
        The day exactly as written in the input (zero-padded if the
        input is), not converted to an integer.
    """
    # Parse by field rather than by fixed character offsets so the result
    # does not depend on the length of the time part.
    date_part = starttime.split()[0]
    return date_part.split('-')[2]

def record_2_timeseries(record, n_hours=720):
    """Expand sparse per-hour counts into a dense, fixed-length list.

    Parameters
    ----------
    record : pandas.Series or None
        Counts indexed by integer hour offset. Hours that are absent from
        the index are treated as zero. ``None`` (what ``Series.get``
        returns for a missing key) is treated as "no counts at all".
    n_hours : int, optional
        Length of the returned list. The default of 720 corresponds to
        30 days x 24 hours; entries of ``record`` at offsets >= ``n_hours``
        are dropped, so pass 744 to keep day 31 of a 31-day month.

    Returns
    -------
    list of int
        ``n_hours`` counts, where position ``h`` holds ``record[h]`` or 0.
    """
    if record is None:
        return [0] * n_hours
    # reindex aligns on the hour labels in one vectorised step and fills
    # the gaps, replacing a per-hour Python membership test.
    return record.reindex(range(n_hours), fill_value=0).tolist()

def build_series(filename, n_hours=720):
    """Build one hourly pick-up count series per station from a trip CSV.

    Parameters
    ----------
    filename : str
        Path of the CSV to read. Only the columns at positions 1 and 3 are
        loaded; they are used as the trip start timestamp and the station
        id respectively.
    n_hours : int, optional
        Length of every returned series. The default of 720 covers
        30 days x 24 hours; trips whose hour offset is >= ``n_hours`` are
        not counted, so pass 744 to keep day 31 of a 31-day month.

    Returns
    -------
    series : list of list of int
        ``series[i][h]`` is the number of trips from ``stations[i]`` during
        hour offset ``h``, where ``h = (day_of_month - 1) * 24 + hour``.
    stations : numpy.ndarray
        Sorted unique station ids, aligned with ``series``.

    Progress samples are printed along the way.
    """
    trips = pd.read_csv(filename, usecols=[1, 3])
    trips.columns = ['datetime', 'id']
    # Single-argument print(...) with %-formatting emits the same text on
    # Python 2 and Python 3.
    print("Loaded data \n%s" % (trips.head(5),))

    # Vectorised replacement for per-row string slicing of the timestamp.
    started = pd.to_datetime(trips['datetime'])
    hour_offset = (started.dt.day - 1) * 24 + started.dt.hour
    trips['hours'] = hour_offset.astype('int64')
    trips = trips[['id', 'hours']]

    stations = np.unique(trips['id'])
    print("\nStations and pick-up hours \n%s" % (trips.head(5),))

    grouped = trips.groupby(['id', 'hours']).size()
    print("\nAggregated per hour per station %s\n%s"
          % (stations[0], grouped.get(stations[0])[:6]))

    # A real list (not a lazy map object) so it can be indexed below and
    # by callers; reindex fills hours with no trips with 0.
    series = [
        grouped.get(station).reindex(range(n_hours), fill_value=0).tolist()
        for station in stations
    ]
    print("\nTime series \n%s" % (series[0][:10],))
    return series, stations

In [3]:
# Station-id arrays, one per processed file, in the order of `filenames`.
stat = []
for filename, output in zip(filenames, outputs):
    print("\nProcessing  %s" % filename)
    series, stations = build_series(filename)
    stat.append(stations)
    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(output, 'wb')
    else:
        csvfile = open(output, 'w', newline='')
    with csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        # One row per station: the station id followed by its hourly counts.
        # An explicit loop is used because map() is lazy on Python 3 and
        # would write nothing.
        for station, counts in zip(stations, series):
            writer.writerow([station] + counts)

print("Done.")


Processing  201706-citibike-tripdata.csv
Loaded data 
              datetime    id
0  2017-06-01 00:00:02   515
1  2017-06-01 00:00:13   488
2  2017-06-01 00:00:20   461
3  2017-06-01 00:00:24  2009
4  2017-06-01 00:00:33   360

Stations and pick-up hours 
     id  hours
0   515      0
1   488      0
2   461      0
3  2009      0
4   360      0

Aggregated per hour per station 72
hours
0     4
5     2
6     1
7    11
8    10
9    15
dtype: int64

Time series 
[4, 0, 0, 0, 0, 2, 1, 11, 10, 15]

Processing  201707-citibike-tripdata.csv
Loaded data 
              datetime    id
0  2017-07-01 00:00:00   539
1  2017-07-01 00:00:03   293
2  2017-07-01 00:00:08  3242
3  2017-07-01 00:00:11  2002
4  2017-07-01 00:00:15  2002

Stations and pick-up hours 
     id  hours
0   539      0
1   293      0
2  3242      0
3  2002      0
4  2002      0

Aggregated per hour per station 72
hours
1    2
5    4
6    4
7    1
8    2
9    9
dtype: int64

Time series 
[0, 2, 0, 0, 0, 4, 4, 1, 2, 9]

Processing