In [17]:
import pandas as pd
import datetime

In [26]:
def parse_timestamp(time_in_secs):
    """Convert a microsecond Unix timestamp into a naive UTC ``datetime``.

    Parameters
    ----------
    time_in_secs : int or str
        Microseconds elapsed since 1970-01-01 00:00:00 GMT. The parameter
        name is kept for backward compatibility; the value is in
        microseconds, not seconds.

    Returns
    -------
    datetime.datetime
        Timezone-naive datetime expressed in UTC.

    Raises
    ------
    ValueError, TypeError
        If ``time_in_secs`` cannot be converted with ``int()``.
    """
    # Integer timedelta arithmetic is exact to the microsecond and does not
    # depend on the local time zone of the machine running the notebook
    # (datetime.fromtimestamp() without a tz converts to local time).
    epoch = datetime.datetime(1970, 1, 1)
    return epoch + datetime.timedelta(microseconds=int(time_in_secs))

In [38]:
# Columns description
# Timestamp (microseconds since 1970-01-01 00:00:00 GMT)
# Line ID
# Direction
# Journey Pattern ID
# Time Frame (The start date of the production time table - in Dublin the production time table starts at 6am and ends at 3am)
# Vehicle Journey ID (A given run on the journey pattern)
# Operator (Bus operator, not the driver)
# Congestion [0=no,1=yes]
# Lon WGS84
# Lat WGS84
# Delay (seconds, negative if bus is ahead of schedule)
# Block ID (a section ID of the journey pattern)
# Vehicle ID
# Stop ID
# At Stop [0=no,1=yes]

# Column labels in file order. They are passed via `names=`, so pandas treats
# the first line of the file as data rather than as a header.
column_names = ['Timestamp', 'Line ID', 'Direction', 'Journey Pattern ID', 'Time Frame', 'Vehicle Journey ID', 'Operator', 'Congestion', 'Lon', 'Lat', 'Delay', 'Block ID', 'Vehicle ID', 'Stop ID', 'At Stop']
# Identifier / flag columns are stored as pandas categoricals.
categorical_columns = ['Line ID', 'Direction', 'Journey Pattern ID', 'Vehicle Journey ID', 'Operator', 'Congestion', 'Block ID', 'Vehicle ID', 'Stop ID', 'At Stop']
df = pd.read_csv('sir140113-200113/siri.20130114.csv.gz',
                 names=column_names,
                 dtype={col_name: 'category' for col_name in categorical_columns})
# Convert the raw microsecond epoch values in one vectorised call instead of a
# per-row `date_parser` callback (deprecated in pandas 2.x). The result is a
# timezone-naive datetime64 column expressed in UTC, independent of the
# machine's local time zone.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='us')

In [39]:
# Check that the explicitly supplied column labels were applied.
df.columns

Index(['Timestamp', 'Line ID', 'Direction', 'Journey Pattern ID', 'Time Frame',
       'Vehicle Journey ID', 'Operator', 'Congestion', 'Lon', 'Lat', 'Delay',
       'Block ID', 'Vehicle ID', 'Stop ID', 'At Stop'],
      dtype='object')

In [40]:
# Inspect the dtype assigned to each column after loading.
df.dtypes

Timestamp             datetime64[ns]
Line ID                     category
Direction                   category
Journey Pattern ID          category
Time Frame                    object
Vehicle Journey ID          category
Operator                    category
Congestion                  category
Lon                          float64
Lat                          float64
Delay                          int64
Block ID                    category
Vehicle ID                  category
Stop ID                     category
At Stop                     category
dtype: object

In [41]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Timestamp,Line ID,Direction,Journey Pattern ID,Time Frame,Vehicle Journey ID,Operator,Congestion,Lon,Lat,Delay,Block ID,Vehicle ID,Stop ID,At Stop
0,2013-01-14 04:00:01,66,0,00660001,2013-01-13,14217,PO,0,-6.56971,53.380451,222,66006,40001,3968,0
1,2013-01-14 04:00:01,40,0,040D0001,2013-01-13,14403,HN,0,-6.373083,53.410049,338,40204,38067,6005,0
2,2013-01-14 04:00:01,27,0,077A1001,2013-01-13,14130,RD,0,-6.259118,53.34565,-534,27009,33254,1358,0
3,2013-01-14 04:00:01,39,0,039A0001,2013-01-13,14729,PO,0,-6.274983,53.350784,-125,39015,33557,7160,0
4,2013-01-14 04:00:01,46,0,046A0001,2013-01-13,16565,D1,0,-6.2306,53.317665,-988,46007,33532,2032,0


In [42]:
# Summary statistics; describe() reports only the numeric columns here (Lon, Lat, Delay).
df.describe()

Unnamed: 0,Lon,Lat,Delay
count,1725539.0,1725539.0,1725539.0
mean,-6.271743,53.34475,-70.30397
std,0.08367796,0.05517641,461.1429
min,-6.615016,53.0704,-4698.0
25%,-6.307931,53.31962,-258.0
50%,-6.261075,53.34644,0.0
75%,-6.231783,53.37473,61.0
max,-6.053017,53.60652,31362.0
