In [1]:
import pandas as pd # main data processing tool
import datetime # for datetime object operations
import time # for measuring the processing time
import plotly.express as px # for graphs

In [2]:
# Read at most the first 1,000,000 data rows of 'out_parsed', parsing the
# 'actual_date' and 'eta' columns as datetimes while loading.
df = pd.read_csv('out_parsed', nrows=1000000, parse_dates=['actual_date', 'eta'])

In [3]:
# Preview the first rows to check the loaded columns and parsed timestamps.
df.head()

Unnamed: 0,actual_date,cod_stop,cod_line,cod_issue,eta,destination_stop
0,2020-02-15 17:45:40+01:00,8_23,8__629___,8__629____2_17:05:00_2_-__13_8__629___,2020-02-15 17:46:34+01:00,8_06002
1,2020-02-15 17:45:40+01:00,8_23,8__686___,5278458,2020-02-15 17:47:01+01:00,8_17480
2,2020-02-15 17:45:40+01:00,8_23,8__651___,5308556,2020-02-15 17:47:19+01:00,8_17480
3,2020-02-15 17:45:40+01:00,8_23,8__654___,5308548,2020-02-15 17:48:21+01:00,8_17480
4,2020-02-15 17:45:40+01:00,8_23,8__622___,8__622____2_17:00:00_2_-__17_8__622___,2020-02-15 17:48:56+01:00,8_06002


In [4]:
def count_bursts(group, mode, bursts_count, burst_threshold=2):
    """Record one summary entry for a burst of rows sharing the same ETA.

    A burst is flagged as static when the ETA of its first row equals
    ``mode`` and it contains at least ``burst_threshold`` rows.

    Keyword arguments:
    group -- non-empty DataFrame with the rows of a single burst; it must
             provide the 'cod_issue', 'cod_stop', 'cod_line' and 'eta' columns
    mode -- ETA value regarded as the static one
    bursts_count -- list that receives the summary dict (mutated in place)
    burst_threshold -- minimum number of rows for a burst to be flagged as
                       static (default 2, the value previously hard-coded)

    Output:
    None -- the summary is appended to bursts_count
    """
    burst_size = len(group)
    # The identifiers and the ETA are taken from the first row of the burst.
    first_row = group.iloc[0]
    static = bool(first_row['eta'] == mode and burst_size >= burst_threshold)
    bursts_count.append({
                            'len': burst_size,
                            'cod_issue': first_row['cod_issue'],
                            'cod_stop': first_row['cod_stop'],
                            'cod_line': first_row['cod_line'],
                            'static': static
                         })

In [5]:
def add_static_column(df, burst_threshold, mode):
    """Add a boolean 'static' column to the DataFrame and return it.

    Every row gets the same value: True when the DataFrame has at least
    burst_threshold rows and the ETA of its first row equals mode,
    False otherwise. The DataFrame is modified in place.
    """
    long_enough = len(df) >= burst_threshold
    # The first row is only inspected when the burst is long enough.
    df['static'] = bool(long_enough and df.iloc[0]['eta'] == mode)
    return df

In [6]:
def filter_static_values(df, bursts_count):
    """Flag the static ETA values of a single trip.

    Despite the name, no rows are removed: the rows are returned with an
    extra boolean 'static' column. The rows are sorted by 'actual_date' and
    split into "bursts" of consecutive rows with the same ETA. The reference
    ETA (the mode) is the ETA of the earliest burst that has at least 3
    rows. Bursts of at least 2 rows whose ETA equals the mode are static.

    Keyword arguments:
    df -- Pandas DataFrame with the ETA's of one specific trip
    bursts_count -- list that receives one summary dict per burst
                    (filled in place through count_bursts)

    Output:
    df -- the rows of df with the added 'static' column
    """

    # Sort first and derive the burst ids from the *sorted* rows, so that a
    # burst really is a run of consecutive samples in time and not a run in
    # the order the rows happened to arrive in.
    df_sorted = df.sort_values(['actual_date'], ascending=[True])
    burst_id = (df_sorted['eta'] != df_sorted['eta'].shift()).cumsum()
    df_grouped = df_sorted.groupby(burst_id)

    # ETA of the earliest burst with at least 3 rows. When there is no such
    # burst, '' is used as a sentinel that is not equal to any ETA.
    long_bursts = df_grouped.filter(lambda x: len(x) >= 3)
    mode = long_bursts.iloc[0]['eta'] if len(long_bursts) else ''

    # Plain iteration instead of GroupBy.apply: the call is made only for its
    # side effect, and apply may invoke the function more than once per group.
    for _, burst in df_grouped:
        count_bursts(burst, mode, bursts_count)
    # Return the values with a 'static' column
    return df_grouped.apply(lambda x: add_static_column(x, 2, mode))

In [7]:
bursts_count = []
# Accumulator filled in place by count_bursts(); each appended record has
# this shape:
#{
#    'len': 0,
#    'cod_issue': '',
#    'cod_stop': '',
#    'cod_line': '',
#    'static': False
#}

In [8]:
# Time left between the sample and its ETA (stored as a Timedelta).
df['remaining_seconds'] = df['eta'].sub(df['actual_date'])
# Day of the month of the ETA, used below as part of the trip key.
df['eta_date'] = df['eta'].dt.day
# Keep only the samples taken less than 100 minutes before their ETA.
test_df = df.loc[df['remaining_seconds'].lt(pd.Timedelta(minutes=100))]
# One group per (issue, stop, line, day of month) combination.
test_df_grouped = test_df.groupby(by=['cod_issue', 'cod_stop', 'cod_line', 'eta_date'])

In [9]:
# klk = test_df_grouped.get_group((list(test_df_grouped.groups)[0]))
# #print(klk)
# df_grouped = (klk
#                     .sort_values(['actual_date'], ascending=[True])
#                     .groupby((df.shift()['eta'] != df['eta']).cumsum())
#                  )
# try:
#         mode = (df_grouped
#                           .filter(lambda x: len(x) >= 10)
#                           .iloc[0]['eta']
#                 )
# except IndexError:
#         mode = ''

# print("mode: " + str(mode))
# kk = []
# #df_grouped.apply(lambda x: print(x))
# df_grouped.apply(lambda x: count_bursts(x, mode, kk))
                 
# kk = df_grouped.apply(lambda x: add_static_column(x, 2, mode))

In [None]:
# Start from an empty accumulator so that re-running this cell does not
# append every burst to bursts_count a second time.
bursts_count.clear()
# Add the 'static' column group by group and flatten the result into one frame.
test_df = (test_df_grouped
           .apply(lambda x: filter_static_values(x, bursts_count))
           .reset_index(drop=True))

In [None]:
#bursts_count

In [None]:
# One row per burst record collected in bursts_count.
bursts_count_df = pd.DataFrame(bursts_count)
bursts_count_df

In [None]:
# How many bursts there are of each length.
bursts_count_df['len'].value_counts()

In [None]:
# Distribution of burst lengths, coloured by whether the burst was flagged
# as static. A title and readable labels let the figure stand on its own.
fig = px.histogram(
                   bursts_count_df,
                   x='len',
                   color='static',
                   color_discrete_map={
                         True: "red",
                         False: "green"},
                   title='Burst length distribution',
                   labels={
                         'len': 'Burst length (rows with the same ETA)',
                         'static': 'Static'},
                   )
fig.show()

In [None]:
# Non-static bursts of one line; computed once because it does not change
# between iterations.
# (Alternative selection: sort by 'len' descending and take the n-th row to
# look at the longest bursts instead of a random sample.)
candidate_bursts = (bursts_count_df
                                    .query("static == False")
                                    .query("cod_line == '8__656___'")
                    )
# Nothing is plotted when no burst matches (sampling from an empty frame
# would raise).
n_plots = 10 if len(candidate_bursts) else 0

for n in range(n_plots):
    # Seeded with the loop index: every iteration uses a different seed, yet
    # a fresh run of the notebook reproduces the same figures.
    selected_trip = candidate_bursts.sample(random_state=n).iloc[0]
    # All the samples of the selected issue at the selected stop.
    plot_df = test_df[(test_df['cod_issue'] == selected_trip['cod_issue']) & (test_df['cod_stop'] == selected_trip['cod_stop'])]

    fig = px.scatter(
                     plot_df,
                     x='actual_date',
                     y='eta',
                     color='static',
                     color_discrete_map={
                         True: "red",
                         False: "green"},
                     hover_data=[
                                 plot_df['eta'].dt.minute,
                                 plot_df['eta'].dt.second
                                 ]
                     )
    # Edit the layout
    fig.update_layout(title='ETA over time',
                      xaxis_title='Sample collection time',
                      yaxis_title='ETA')
    fig.show()

In [None]:
#plot_df