In [102]:
import pandas as pd
import numpy as np

In [103]:
df = pd.read_hdf("cleaned_store.h5", key="table_name")

In [104]:
j_patterns = df["Journey_Pattern_ID"].value_counts().index.tolist()

In [105]:
sql_table = pd.DataFrame()

In [106]:
# insert jpid into sql_table
sql_table["Journey_Pattern_ID"] = pd.Series(j_patterns)

In [107]:
# this column encodes the main journey pattern id for each variation. e.g. 046A0001, 046A0002, 046A0003 etc.. will have 046A0001
# as the main journey pattern id. 046A1001, 046A1002, 046A1003 will have 046A1001 as their main journey pattern id instead.
# therefore direction is kept encoded.
# very important: 046A1000 does not exist! the main route is always with 1!
sql_table['Main_Journey_Pattern_ID'] = sql_table["Journey_Pattern_ID"].astype(str).str[:-1]

In [108]:
sql_table['Main_Journey_Pattern_ID'] = sql_table['Main_Journey_Pattern_ID'] + '1'

In [109]:
#encoding a better line_id compared to the one available from gps data
sql_table['Line_ID'] = sql_table["Journey_Pattern_ID"].astype(str).str[:-4]

In [110]:
sql_table['Line_ID'] = sql_table['Line_ID'].apply(lambda x: x.lstrip('0'))

In [111]:
#direction where the bus is going, can only be 0 or 1
sql_table['Direction'] = sql_table["Journey_Pattern_ID"].astype(str).str[4]

In [112]:
#encode the different sub routes within a lineid
sql_table['Variation'] = sql_table["Journey_Pattern_ID"].astype(str).str[-1]

In [113]:
#loading csv file which has same information as the table with same name in our database
df2 = pd.read_csv('JourneyPatternID_StopID.csv')

In [114]:
idx_max = df2.groupby(['Journey_Pattern_ID'])['Distance'].transform(max) == df2['Distance']

In [115]:
#dataframe with stops for each journey pattern id which have largest distance or in other words, the destination stop for each jpid
df3 = df2[idx_max]

In [116]:
#let us rename columns and drop distance
df3.rename(columns={'Stop_ID': 'Destination_Stop_ID'}, inplace=True)
df3.drop('Distance', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [117]:
idx_min = df2.groupby(['Journey_Pattern_ID'])['Distance'].transform(min) == df2['Distance']

In [118]:
#dataframe with stops for each journey pattern id which have smallest distance or in other words, the starting stop for each jpid
df4 = df2[idx_min]

In [119]:
#let us rename columns and drop distance
df4.rename(columns={'Stop_ID': 'Source_Stop_ID'}, inplace=True)
df4.drop('Distance', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [120]:
# merge sql table with table with sources
sql_table = pd.merge(sql_table,df4, how='inner', on='Journey_Pattern_ID')

In [121]:
# merge sql table with table with destinations
sql_table = pd.merge(sql_table,df3, how='inner', on='Journey_Pattern_ID')

In [122]:
sql_table.sort_values(["Journey_Pattern_ID"], ascending=True, inplace=True)
sql_table.reset_index(inplace=True, drop=True)

In [123]:
sql_table.to_csv("JPID_LineID_Start_End.csv",index=False)