# Parse out

Convert the output of `crtm_poll` into a pandas DataFrame, keeping only the relevant fields.

## Setup

In [19]:
import pandas as pd
import json
import datetime
from plotly.offline import iplot, init_notebook_mode
# Set up plotly's notebook integration before any iplot() call is made below.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook - confirm against the plotly documentation.
init_notebook_mode(connected=True)

In [45]:
# Parse the dump file 'out': each usable line is one JSON document, flattened
# into one row per entry of stopTimes.times.Time:
#   [actualDate, codStop, codLine, codIssue, time]
data = []
n_lines = 100000  # upper bound on the number of lines read from the file
n_skipped = 0     # candidate lines that could not be parsed (inspect after the run)

# The context manager closes the file even if parsing stops part-way.
# Iterating over the file ends at EOF, so a file shorter than n_lines no longer
# produces empty reads that have to be hidden by the exception handler.
with open('out', 'r') as file:
    for line_number, line in enumerate(file):
        if line_number >= n_lines:
            break
        # Lines carrying an ERROR marker and blank lines hold no record.
        if 'ERROR' in line or not line.strip():
            continue
        try:
            stop_times = json.loads(line)['stopTimes']
            for arrival in stop_times['times']['Time']:
                data.append([
                    stop_times['actualDate'],
                    stop_times['stop']['codStop'],
                    arrival['line']['codLine'],
                    arrival['codIssue'],
                    arrival['time'],
                ])
        except (ValueError, KeyError, TypeError):
            # Malformed JSON (JSONDecodeError is a ValueError) or a document
            # without the expected structure. Parsing stays best-effort, but
            # the loss is counted instead of being silently discarded.
            # Rows appended before the failure are kept.
            n_skipped += 1

data[0]

['2020-02-07T12:51:25+01:00',
 '8_09868',
 '8__656___',
 '5306341',
 '2020-02-07T13:00:11+01:00']

In [47]:
# Build the table of predictions: one row per (poll time, stop, line, issue)
# with the predicted arrival time, both timestamps parsed to datetimes.
df = pd.DataFrame(data, columns=("actual_time", "cod_stop", "cod_line", "cod_issue", "eta"))
df['eta'] = pd.to_datetime(df['eta'])
df['actual_time'] = pd.to_datetime(df['actual_time'])
# Seconds between the poll and the predicted arrival, as a float column.
# .dt.total_seconds() is used instead of .astype('timedelta64[s]'): on
# pandas >= 2.0 that cast keeps a timedelta dtype, which breaks the numeric
# comparisons on remaining_seconds made further down.
df['remaining_seconds'] = (df['eta'] - df['actual_time']).dt.total_seconds()

In [48]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,actual_time,cod_stop,cod_line,cod_issue,eta,remaining_seconds
0,2020-02-07 12:51:25+01:00,8_09868,8__656___,5306341,2020-02-07 13:00:11+01:00,526.0
1,2020-02-07 12:51:25+01:00,8_09868,8__656___,5305592,2020-02-07 13:06:00+01:00,875.0
2,2020-02-07 12:51:25+01:00,8_09868,8__656___,5305746,2020-02-07 13:16:00+01:00,1475.0
3,2020-02-07 12:51:25+01:00,8_09868,8__815___,5306331,2020-02-07 13:28:38+01:00,2233.0
4,2020-02-07 12:51:25+01:00,8_09868,8__815___,5306611,2020-02-07 14:27:00+01:00,5735.0


In [49]:
# Column-wise minimum per (cod_stop, cod_issue) pair for a single stop.
# Each column's minimum is taken independently, so a result row is not
# necessarily one observed record.
# Filtering before grouping gives the same table as aggregating every stop and
# filtering afterwards, but only the rows of this stop are aggregated.
(
    df.query("cod_stop == '8_06297'")
      .groupby(['cod_stop', 'cod_issue'])
      .agg(['min'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,actual_time,cod_line,eta,remaining_seconds
Unnamed: 0_level_1,Unnamed: 1_level_1,min,min,min,min
cod_stop,cod_issue,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
8_06297,5305592,2020-02-07 12:53:41+01:00,8__656___,2020-02-07 13:04:00+01:00,82.0
8_06297,5305746,2020-02-07 12:53:41+01:00,8__656___,2020-02-07 13:14:00+01:00,75.0
8_06297,5305777,2020-02-07 12:53:41+01:00,8__560___,2020-02-07 13:27:43+01:00,46.0
8_06297,5305849,2020-02-07 13:00:22+01:00,8__656___,2020-02-07 13:24:00+01:00,36.0
8_06297,5305915,2020-02-07 13:06:59+01:00,8__656___,2020-02-07 13:34:00+01:00,36.0
8_06297,...,...,...,...,...
8_06297,5308038,2020-02-08 09:29:23+01:00,8__658___,2020-02-08 10:19:19+01:00,431.0
8_06297,5308040,2020-02-08 09:29:23+01:00,8__658___,2020-02-08 12:30:00+01:00,8174.0
8_06297,5308665,2020-02-08 09:29:23+01:00,8_N_906___,2020-02-09 01:55:00+01:00,56474.0
8_06297,5308666,2020-02-08 09:29:23+01:00,8_N_906___,2020-02-09 03:10:00+01:00,60974.0


In [53]:
# Predictions for one line at one stop, limited to arrivals predicted less
# than an hour (3600 s) after the poll.
selected_bus = df.query("cod_stop == '8_06297' & cod_line == '8__656___' & remaining_seconds < 3600")
cod_issues = selected_bus['cod_issue'].unique()

# One figure per cod_issue: predicted arrival (eta) against the time of the
# poll that produced the prediction (actual_time).
for cod_issue in cod_issues:
    # Select the rows once and reuse them for both axes.
    issue_rows = selected_bus[selected_bus['cod_issue'] == cod_issue]
    iplot({
        'data': [{
            'x': issue_rows['actual_time'],
            'y': issue_rows['eta'],
        }],
        # Title and axis labels so each figure can be identified on its own.
        'layout': {
            'title': {'text': 'cod_issue ' + str(cod_issue)},
            'xaxis': {'title': {'text': 'actual_time'}},
            'yaxis': {'title': {'text': 'eta'}},
        },
    })

In [52]:
# Column-wise minimum of every remaining column for each cod_issue in the
# selection; each column's minimum is computed independently.
selected_bus.groupby(by='cod_issue').min()

Unnamed: 0_level_0,actual_time,cod_stop,cod_line,eta,remaining_seconds
cod_issue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5306001,2020-02-07 12:53:41+01:00,8_06297,8__658___,2020-02-07 13:16:02+01:00,54.0
5306323,2020-02-07 12:53:41+01:00,8_06297,8__658___,2020-02-07 13:45:50+01:00,204.0
5306594,2020-02-07 13:17:06+01:00,8_06297,8__658___,2020-02-07 14:15:13+01:00,184.0
5306617,2020-02-07 13:45:47+01:00,8_06297,8__658___,2020-02-07 14:45:31+01:00,125.0
5306640,2020-02-07 14:15:44+01:00,8_06297,8__658___,2020-02-07 15:14:33+01:00,108.0
5306957,2020-02-07 15:50:21+01:00,8_06297,8__658___,2020-02-07 16:49:00+01:00,63.0
5306958,2020-02-07 14:47:42+01:00,8_06297,8__658___,2020-02-07 15:46:26+01:00,88.0
5306960,2020-02-07 16:50:54+01:00,8_06297,8__658___,2020-02-07 17:46:40+01:00,46.0
5306962,2020-02-07 18:47:02+01:00,8_06297,8__658___,2020-02-07 19:46:39+01:00,59.0
5306964,2020-02-07 20:49:50+01:00,8_06297,8__658___,2020-02-07 21:43:15+01:00,170.0
