In [19]:
import pandas as pd
import datetime
import numpy as np
import itertools
from ipykernel import kernelapp as app
%load_ext Cython
import numba
import gc
global df
from datetime import timedelta  

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [20]:
#filter data to those buses that are at the stop and whose traveltime is 0.0 because we are
# only interested in the timetables from terminus
df = pd.read_hdf("cleaned_store.h5", key="table_name", where="TravelTime == 0.0 and At_Stop == 1")

In [21]:
df.head()

Unnamed: 0,Timestamp,Journey_Pattern_ID,Time_Frame,Vehicle_Journey_ID,Bus_Operator,Longitude,Latitude,Delay_seconds,Block_ID,Vehicle_ID,Stop_ID,At_Stop,Week_Day,Distance,TravelTime,TimeCategory,Rain,Temp,windSpeed
0,2012-11-12 13:28:20,056A0001,2012-11-12,2070,RD,-6.233867,53.342285,0,56002,24549,395,1,0,0.0,0.0,13:30:00,0.0,13.6,4.27
24,2012-11-12 13:20:06,017A1003,2012-11-12,6114,HN,-6.152884,53.3899,4,171009,24587,956,1,0,0.0,0.0,13:30:00,0.0,13.6,4.27
78,2012-11-12 13:08:37,015A1001,2012-11-12,3095,RD,-6.334367,53.307034,0,15015,24606,1105,1,0,0.0,0.0,13:00:00,0.0,13.6,4.27
114,2012-11-12 13:25:22,077A1001,2012-11-12,4905,RD,-6.427166,53.290718,328,27017,24623,4713,1,0,0.0,0.0,13:30:00,0.0,13.6,4.27
203,2012-11-12 13:15:02,01500001,2012-11-12,1519,RD,-6.258533,53.346085,0,150004,33009,337,1,0,0.0,0.0,13:30:00,0.0,13.6,4.27


In [22]:
len(df)

22813

In [23]:
def map_day(x):
    if x <= 4:
        return "Mon-Fr"
    elif x == 5 :
        return "Sat"
    else:
        return "Sun"

In [24]:
#created category for day in database
df["Day_Cat"] = df["Week_Day"].apply(map_day)

In [25]:
#sum or subtract delay from timestamp to get correct time
df["Timestamp_plus_delay"] = df['Timestamp'] - pd.to_timedelta(df['Delay_seconds'], unit='s')

In [26]:
#round time to remove duplicates later on
df["Timestamp_plus_delay"] = pd.DatetimeIndex(df['Timestamp_plus_delay']).round('10min')  

In [27]:
#eliminated date from datetime of correct timestamp
df["Time_no_date"] = pd.DatetimeIndex(df['Timestamp_plus_delay']).time

In [28]:
df = df[["Journey_Pattern_ID", "Delay_seconds","Stop_ID","Day_Cat", "Time_no_date"]]

In [29]:
different_journey_patterns = df["Journey_Pattern_ID"].value_counts().index

In [30]:
list_of_dfs = []
for journey in different_journey_patterns:
    
    #filter dataframe based on journey
    df_temp = df[df["Journey_Pattern_ID"] == journey]
    #groupby stops
    grouped = df_temp.groupby("Stop_ID")
    #get size of each group of stops
    grouped_df = grouped.size().reset_index()
    #rename columns
    grouped_df.columns = ['Stop_ID', 'Count']
    #sort by values
    grouped_df.sort_values(["Count"], ascending=False, inplace=True)
    grouped_df.reset_index(inplace=True, drop=True)
    print(grouped_df.head())
    #take stop id with largest count
    stop_id_value  = grouped_df.loc[0, "Stop_ID"]
    print(stop_id_value)
    #filter df_temp by this stop_id and its journey id and store in a temporary list
    df_temp = df_temp[df_temp["Stop_ID"] == stop_id_value]
    df_temp.head()
    list_of_dfs.append(df_temp)
    
new_df = pd.concat(list_of_dfs)

  Stop_ID  Count
0    1491    548
1    1492      1
2    1939      1
1491
  Stop_ID  Count
0     293    433
293
  Stop_ID  Count
0    5171    414
5171
  Stop_ID  Count
0    4795    390
1    4799      1
4795
  Stop_ID  Count
0    6318    366
1    6319      2
2    6320      2
3     348      1
4    4719      1
6318
  Stop_ID  Count
0    4592    364
4592
  Stop_ID  Count
0    6057    333
1    4514      1
6057
  Stop_ID  Count
0    1772    328
1    1776      1
1772
  Stop_ID  Count
0    2353    318
1    1196      1
2    4643      1
2353
  Stop_ID  Count
0    1423    317
1    1424      1
1423
  Stop_ID  Count
0    7157    295
1    7158      8
2    4747      3
3    1858      1
4    1909      1
7157
  Stop_ID  Count
0    4533    278
1     770      3
2    4844      2
3    3148      1
4    4182      1
4533
  Stop_ID  Count
0    1105    284
1105
  Stop_ID  Count
0     291    283
291
  Stop_ID  Count
0    4843    268
1    4905      2
2    4909      1
4843
  Stop_ID  Count
0    4747    268
4747
  St

  Stop_ID  Count
0    4745     21
4745
  Stop_ID  Count
0     324     20
1     296      1
324
  Stop_ID  Count
0    4962     20
4962
  Stop_ID  Count
0    3088     18
1    3704      1
2    3705      1
3088
  Stop_ID  Count
0    4962     20
4962
  Stop_ID  Count
0     265     20
265
  Stop_ID  Count
0    7026     20
7026
  Stop_ID  Count
0    5047     18
1     485      1
5047
  Stop_ID  Count
0    4664     19
4664
  Stop_ID  Count
0     299     18
299
  Stop_ID  Count
0     395     18
395
  Stop_ID  Count
0    5189     17
5189
  Stop_ID  Count
0    5013     17
5013
  Stop_ID  Count
0     284     17
284
  Stop_ID  Count
0    3057     16
3057
  Stop_ID  Count
0    4260     16
4260
  Stop_ID  Count
0    4962     16
4962
  Stop_ID  Count
0    6048     16
6048
  Stop_ID  Count
0    4843     16
4843
  Stop_ID  Count
0    7026     16
7026
  Stop_ID  Count
0    4952     16
4952
  Stop_ID  Count
0    3732     16
3732
  Stop_ID  Count
0    5047     16
5047
  Stop_ID  Count
0     226     15
226
  

  Stop_ID  Count
0    7149      1
7149
  Stop_ID  Count
0    4096      1
4096
  Stop_ID  Count
0    3605      1
3605
  Stop_ID  Count
0     279      1
279
  Stop_ID  Count
0    7270      1
7270
  Stop_ID  Count
0    7347      1
7347
  Stop_ID  Count
0    4168      1
4168
  Stop_ID  Count
0     288      1
288
  Stop_ID  Count
0    6048      1
6048
  Stop_ID  Count
0    3732      1
3732
  Stop_ID  Count
0    4177      1
4177
  Stop_ID  Count
0    4392      1
4392
  Stop_ID  Count
0    4167      1
4167
  Stop_ID  Count
0     292      1
292
  Stop_ID  Count
0    2039      1
2039
  Stop_ID  Count
0    7188      1
7188
  Stop_ID  Count
0    2955      1
2955
  Stop_ID  Count
0    7347      1
7347
  Stop_ID  Count
0    4108      1
4108
  Stop_ID  Count
0    6282      1
6282
  Stop_ID  Count
0    3815      1
3815
  Stop_ID  Count
0    3514      1
3514
  Stop_ID  Count
0    3593      1
3593
  Stop_ID  Count
0     281      1
281
  Stop_ID  Count
0     265      1
265
  Stop_ID  Count
0     289    

In [53]:
new_df.drop(["Delay_seconds", "Stop_ID"], inplace=True, axis=1)

In [54]:
new_df.drop_duplicates(inplace=True)

In [55]:
len(new_df)

17821

In [56]:
new_df_temp = new_df[new_df["Journey_Pattern_ID"] == "039A0001"]

In [57]:
new_df_temp.sort_values(["Day_Cat","Time_no_date"], ascending=True, inplace=True)
new_df_temp.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [60]:
#these timestamps look very promising, let us write everything to mysql database
new_df.sort_values(["Journey_Pattern_ID","Day_Cat","Time_no_date"], ascending=True, inplace=True)
new_df.reset_index(inplace=True, drop=True)

In [63]:
new_df.to_csv("Timetable.csv",index=False)