In [112]:
import numpy as np
import pandas as pd
import plotly.express as px

import DataRetriever as dr

# Load "All-Subsystems-minute-Year2.pkl" through the project's DataRetriever helper.
retriever = dr.DataRetriever()
year_two = retriever.get_data("All-Subsystems-minute-Year2.pkl")

# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

In [129]:
# Parse the string-valued "Timestamp" column into datetime values.
year_two["Timestamp"] = year_two["Timestamp"].pipe(pd.to_datetime)

In [130]:
# Timestamp of the previous row, for every row. The first row has no predecessor,
# so it is paired with its own timestamp (delta 0). shift() keeps year_two's own
# index, so the subtraction below lines up even if the index is not 0..n-1.
timestamp_plus_one = year_two["Timestamp"].shift(1, fill_value=year_two["Timestamp"].iloc[0])

# Whole seconds elapsed since the previous row, stored as float64.
# Floor-dividing by a one-second Timedelta yields plain numbers on every pandas
# version. `.astype('timedelta64[s]')` only returned floats before pandas 2.0; from
# 2.0 on it returns a timedelta dtype, which breaks the numeric binning and the
# `!= 60` comparison applied to this column later in the notebook.
year_two["Timestamp_Delta"] = (
    (year_two["Timestamp"] - timestamp_plus_one) // pd.Timedelta(seconds=1)
).astype("float64")

# Drop the first row (its delta is the artificial 0) and take an explicit copy so
# that columns added to df_delta later are written to an independent frame.
df_delta = year_two[["Timestamp", "Timestamp_Delta"]].iloc[1:].copy()

What information can be extracted from binning the intervals?

In [131]:
# Bin edges in seconds; pd.cut builds right-closed intervals, e.g. (59, 60].
delta_bin_edges = [0, 54, 59, 60, 65, 300, np.inf]

# observed=False is spelled out so that empty bins stay in the table as zero rows.
# Leaving it implicit is deprecated (pandas 2.1) and the default flips to True,
# which would silently drop bins that contain no intervals.
df_delta.groupby(
    pd.cut(df_delta["Timestamp_Delta"], bins=delta_bin_edges),
    observed=False,
).count()

Unnamed: 0_level_0,Timestamp,Timestamp_Delta
Timestamp_Delta,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0.0, 54.0]",0,0
"(54.0, 59.0]",1079,1079
"(59.0, 60.0]",517762,517762
"(60.0, 65.0]",390,390
"(65.0, 300.0]",369,369
"(300.0, inf]",3,3


We see that 762 intervals (390 + 369 + 3) are longer than 1 minute, and 1079 intervals are shorter than 1 minute.
Is it possible to recognise a pattern for the missing intervals?

In [168]:
# Tag every interval with the bin (edges in seconds) that its delta falls into.
bin_edges = [0, 54, 59, 60, 65, 300, np.inf]
df_delta["Bin"] = pd.cut(df_delta["Timestamp_Delta"], bins=bin_edges)

# Keep only the rows whose delta is not exactly 60 seconds.
non_60 = df_delta[df_delta["Timestamp_Delta"].ne(60)]

In [169]:
# Re-date every row of non_60 to 2000-01-01, keeping only the hour and minute of
# its original timestamp (seconds are set to 0).
# The resulting timestamps are deliberately "incorrect": the plot cannot show a
# bare time of day, and all rows must share the same date so that the later
# groupby on "Timestamp" groups by time of day.
original_timestamps = df_delta.loc[non_60.index, "Timestamp"]
minutes_into_day = original_timestamps.dt.hour * 60 + original_timestamps.dt.minute

# Vectorised replacement for the former row-by-row loop. assign() builds a new
# frame instead of writing into a filtered one cell by cell.
non_60 = non_60.assign(
    Timestamp=pd.Timestamp(year=2000, month=1, day=1)
    + pd.to_timedelta(minutes_into_day, unit="min")
)

In [179]:
# Show the rows whose interval is not exactly 60 s; by this point their
# timestamps carry the placeholder date 2000-01-01 and only hour/minute are real.
non_60

Unnamed: 0,Timestamp,Timestamp_Delta,Bin
30,2000-01-01 00:31:00,59.0,"(54.0, 59.0]"
60,2000-01-01 01:01:00,61.0,"(60.0, 65.0]"
270,2000-01-01 04:31:00,59.0,"(54.0, 59.0]"
870,2000-01-01 14:31:00,59.0,"(54.0, 59.0]"
1438,2000-01-01 00:01:00,174.0,"(65.0, 300.0]"
...,...,...,...
518166,2000-01-01 00:01:00,179.0,"(65.0, 300.0]"
518196,2000-01-01 00:31:00,57.0,"(54.0, 59.0]"
518226,2000-01-01 01:01:00,63.0,"(60.0, 65.0]"
518406,2000-01-01 04:01:00,59.0,"(54.0, 59.0]"


In [190]:
# Count rows for every (bin, time-of-day) pair, then discard the pairs whose
# "Timestamp_Delta" count is zero.
binned_count = (
    non_60.groupby(["Bin", "Timestamp"])
    .count()
    .reset_index()
    .loc[lambda counts: counts["Timestamp_Delta"] != 0]
)

In [197]:
# Scatter of how often a non-60 s interval occurs at each time of day, one colour per bin.
fig = px.scatter(
    binned_count,
    x="Timestamp",
    y="Timestamp_Delta",
    color="Bin",
    title="Occurrence of Missing Record Given Time of Day",
)

# Replace the raw column names on the axes with readable titles.
fig.update_layout(xaxis_title_text="Time of Day", yaxis_title_text="Occurrence")

fig.show()

Three intervals were longer than 5 minutes. How long (in hours) did each of them last?

In [99]:
# Hours of missing data for every interval longer than 300 seconds (5 minutes):
# seconds -> minutes -> hours.
df_delta.loc[df_delta["Timestamp_Delta"] > 300, "Timestamp_Delta"].div(60).div(60)

375267    72.061389
388327     1.016667
511699    15.107222
Name: Timestamp_Delta, dtype: float64