In [5]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from aeon.clustering import TimeSeriesKMeans
from aeon.utils.windowing import sliding_window_view

In [6]:
# Load the loop-sensor readings. The file carries no header row, so the two
# columns are named here and the first one is parsed as timestamps.
df_sensor = pd.read_csv(
    '../data/dodgers+loop+sensor/Dodgers.data',
    names=['Date', 'Value'],
    header=None,
    parse_dates=[0],
)
df_sensor.sample(10)

Unnamed: 0,Date,Value
38600,2005-08-22 00:40:00,6
13151,2005-05-25 15:55:00,36
37699,2005-08-18 21:35:00,10
28579,2005-07-18 05:35:00,4
29079,2005-07-19 23:15:00,6
1144,2005-04-13 23:20:00,15
31525,2005-07-28 11:05:00,30
20916,2005-06-21 15:00:00,36
41829,2005-09-02 05:45:00,14
48525,2005-09-25 11:45:00,32


In [10]:
# Load the game schedule. Each row stores one calendar date plus separate start
# and end clock times; they are merged into two full timestamps.
#
# The merge is done with pd.to_datetime rather than a `parse_dates` dict in
# read_csv: combining columns through `parse_dates` is deprecated in recent
# pandas releases. The three text columns are read as plain strings so they can
# be concatenated exactly as written in the file.
df_events = (
    pd.read_csv(
        '../data/dodgers+loop+sensor/Dodgers.events',
        header=None,
        names=['Date', 'StartTime', 'EndTime', 'Attendance', 'Opponent', 'Result'],
        dtype={'Date': str, 'StartTime': str, 'EndTime': str},
        encoding='latin1',
    )
    .assign(
        Date_StartTime=lambda d: pd.to_datetime(
            d['Date'] + ' ' + d['StartTime'], format='%m/%d/%y %H:%M:%S'
        ),
        Date_EndTime=lambda d: pd.to_datetime(
            d['Date'] + ' ' + d['EndTime'], format='%m/%d/%y %H:%M:%S'
        ),
    )
    # Same columns, in the same order, as the previous parse_dates-based load.
    [['Date_StartTime', 'Date_EndTime', 'Attendance', 'Opponent', 'Result']]
)
df_events.sample(10)

Unnamed: 0,Date_StartTime,Date_EndTime,Attendance,Opponent,Result
52,2005-07-31 13:10:00,2005-07-31 17:14:00,44543,St. Louis,L 7-5
79,2005-09-28 19:10:00,2005-09-28 21:58:00,46424,Arizona,L 4-3
16,2005-05-15 12:10:00,2005-05-15 14:53:00,53239,Atlanta,L 5-2
47,2005-07-26 19:10:00,2005-07-26 22:08:00,42826,Cincinnati,W 7-4
43,2005-07-15 19:40:00,2005-07-15 22:08:00,51057,San Francisco,L 6-0
38,2005-06-29 12:10:00,2005-06-29 14:43:00,43569,San Diego,W 4-2
59,2005-08-23 19:10:00,2005-08-23 21:49:00,44416,Colorado,W 8-3
73,2005-09-14 19:10:00,2005-09-14 22:17:00,30329,Colorado,L 8-7
35,2005-06-12 13:10:00,2005-06-12 15:46:00,54368,Minnesota,W 4-3
2,2005-04-15 19:40:00,2005-04-15 21:48:00,51816,San Diego,W 4-0


In [11]:
# Work on a copy of the sensor frame with a label column that starts at 0.
df = df_sensor.assign(y=0)

# Flag every reading whose timestamp lies inside a game window
# (both the start and the end timestamp are inclusive).
for _, event in df_events.iterrows():
    df.loc[df['Date'].between(event['Date_StartTime'], event['Date_EndTime']), 'y'] = 1

# Split into the feature frame (timestamp + reading) and the label frame.
X = df[['Date', 'Value']]
y = df[['y']]

print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
X.sample(10)

Shape of X: (50400, 2)
Shape of y: (50400, 1)


Unnamed: 0,Date,Value
34431,2005-08-07 13:15:00,19
38876,2005-08-22 23:40:00,2
2575,2005-04-18 22:35:00,13
26851,2005-07-12 05:35:00,-1
32212,2005-07-30 20:20:00,32
48626,2005-09-25 20:10:00,16
39698,2005-08-25 20:10:00,25
37551,2005-08-18 09:15:00,26
48524,2005-09-25 11:40:00,21
9305,2005-05-12 07:25:00,41


In [12]:
def plot_sensor_values_interactive(data, events=None):
    """Show the sensor readings as an interactive line with event periods shaded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a 'Date' column (x axis) and a 'Value' column (y axis).
    events : pandas.DataFrame, optional
        Must provide 'Date_StartTime' and 'Date_EndTime' columns; one shaded
        rectangle is drawn per row. Defaults to the notebook-level
        ``df_events`` frame, which is what this function always used before
        the parameter existed.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    if events is None:
        events = df_events

    fig = go.Figure()

    # Add sensor value line
    fig.add_trace(go.Scatter(
        x=data['Date'],
        y=data['Value'],
        mode='lines',
        name='Sensor Value',
        line=dict(color='blue')
    ))

    # The vertical extent of every rectangle is the same, so compute it once
    # instead of rescanning the whole column for each event.
    value_min = data['Value'].min()
    value_max = data['Value'].max()

    # Add event periods as shaded regions
    for _, event in events.iterrows():
        fig.add_shape(
            type="rect",
            x0=event['Date_StartTime'],
            x1=event['Date_EndTime'],
            y0=value_min,
            y1=value_max,
            fillcolor="red",
            opacity=0.3,
            line_width=0,
            layer="below"
        )

    # Update layout
    fig.update_layout(
        title="Interactive Time Series of Sensor Values",
        xaxis_title="Date",
        yaxis_title="Value",
        template="plotly_white"
    )

    fig.show()

plot_sensor_values_interactive(X)

In [13]:
# Treat the -1 readings in 'Value' as gaps and fill them by linear
# interpolation between the neighbouring readings.
#
# -1 is swapped for NaN (not None): replace(-1, None) behaves differently
# across pandas versions and can leave an object-dtype column, whereas NaN keeps
# the column numeric so interpolate() works on it directly.
# NOTE(review): with interpolate's default direction, gaps before the first
# valid reading are expected to stay NaN - confirm that is acceptable downstream.
X_filled = X.assign(
    Value=X['Value'].replace(-1, np.nan).interpolate(method='linear')
)

# Plot the filled series (the original cell re-plotted the unfilled X).
plot_sensor_values_interactive(X_filled)

In [17]:
# Number of consecutive readings that make up one window.
window_size = 12

# Build the windows from the non-missing 'Value' readings (as floats) and wrap
# the result in a DataFrame: one row per window, one column per position
# inside the window.
X_windows = pd.DataFrame(
    sliding_window_view(
        X_filled['Value'].dropna().astype(float).to_numpy(),
        window_shape=window_size,
    )
)
X_windows.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
7369,10.0,10.0,13.0,8.0,7.0,12.0,6.0,7.0,9.0,9.0,11.0,10.0
10560,2.0,3.0,2.0,1.0,3.0,1.0,1.0,4.0,1.0,3.0,7.0,3.0
24470,22.0,31.0,18.0,21.0,27.0,21.0,19.0,22.0,22.0,26.0,24.0,17.0
44106,11.0,8.0,17.0,10.0,13.0,12.0,12.0,21.0,15.0,17.0,18.0,20.0
29853,8.0,3.0,9.0,14.0,9.0,9.0,9.0,8.0,9.0,6.0,6.0,4.0
7190,43.0,42.0,53.0,33.0,46.0,42.0,29.0,39.0,33.0,47.0,37.0,39.0
33924,3.0,5.0,5.0,7.0,8.0,6.0,7.0,9.0,6.0,6.0,12.0,10.0
36515,10.0,7.0,13.0,5.0,9.0,9.0,21.0,9.0,14.0,14.0,15.0,23.0
3996,21.0,39.0,17.0,30.0,26.0,36.0,42.0,45.0,29.0,37.0,34.0,37.0
21253,30.0,19.0,15.0,21.0,26.0,21.0,20.0,30.0,21.0,27.0,19.0,24.0


In [None]:
# Cluster the windows: each row of X_windows is treated as one series.
X_reshaped = X_windows.values

n_clusters = 2
model = TimeSeriesKMeans(n_clusters=n_clusters, random_state=42)
labels = model.fit_predict(X_reshaped)

# `labels` holds one entry per window, which is fewer than the number of rows
# in X_filled, so assigning it directly as a column fails on the length
# mismatch. Attach each label to the row where its window starts instead.
# The windows were built from the non-missing 'Value' readings, so window i
# starts at the i-th non-missing row. Rows that start no window get <NA>.
window_start_index = X_filled['Value'].dropna().index[:len(labels)]
X_filled['Cluster'] = (
    pd.Series(labels, index=window_start_index)
    .astype('Int64')
    .reindex(X_filled.index)
)

print("Cluster labels assigned to the data:")
print(X_filled['Cluster'].value_counts())

In [None]:
# Read the cluster centres off the fitted model and print them under a heading.
centroids = model.cluster_centers_
print("Centroids of the clusters:", centroids, sep="\n")

In [None]:
X_final = X_filled.copy()

# Spread the per-window labels back onto the individual readings.
#
# The windows were built from the non-missing 'Value' readings, so window i
# covers the non-missing rows i .. i + window_size - 1. When several windows
# cover the same row, the latest window wins (as in the original nested loop),
# i.e. row q among the non-missing rows takes the label of window
# min(q, n_windows - 1). That is: lay the labels out from position 0 and
# forward-fill the tail. Doing this with reindex/ffill replaces roughly
# len(X_windows) * window_size scalar .loc writes with a few vectorized steps.
valid_index = X_final['Value'].dropna().index
row_labels = (
    pd.Series(labels)                  # `labels` is a plain array: no .map()
    .reindex(range(len(valid_index)))  # one slot per non-missing reading
    .ffill()                           # trailing rows reuse the last window
    .set_axis(valid_index)             # back to the original row labels
    .reindex(X_final.index)            # rows dropped before windowing -> NaN
    .ffill()                           # ffill()/bfill() replace the deprecated
    .bfill()                           # fillna(method=...) calls
    .astype(int)                       # integer dtype instead of object
)

# Binary view of the cluster id: 0 stays 0, anything else becomes 1.
X_final['Cluster'] = (row_labels != 0).astype(int)  # TODO update depending on clusters
X_final['Prediction'] = row_labels

print("Mapped predictions to the original time series:")
X_final[['Date', 'Value', 'Prediction']].head(20)

In [None]:
from sklearn.metrics import precision_score, recall_score

def compare_predictions_with_actual(data):
    """Overlay the actual labels and the predicted labels on one interactive chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'Date' (x axis), 'y' (actual) and 'Prediction' (predicted)
        columns.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    fig = go.Figure()

    # Solid blue line: the actual labels.
    actual_trace = go.Scatter(
        x=data['Date'],
        y=data['y'],
        mode='lines',
        name='Actual y',
        line={'color': 'blue'},
    )
    fig.add_trace(actual_trace)

    # Dotted red line: the predicted labels.
    predicted_trace = go.Scatter(
        x=data['Date'],
        y=data['Prediction'],
        mode='lines',
        name='Predicted y',
        line={'color': 'red', 'dash': 'dot'},
    )
    fig.add_trace(predicted_trace)

    # Titles and theme.
    layout_options = {
        'title': "Comparison of Actual vs Predicted Values",
        'xaxis_title': "Date",
        'yaxis_title': "y",
        'template': "plotly_white",
    }
    fig.update_layout(**layout_options)

    fig.show()

# X_final descends from X, which only carries 'Date' and 'Value'; the
# ground-truth labels live in the separate `y` frame built from the same rows.
# Attach them here, otherwise X_final['y'] raises KeyError both in the metric
# calls and inside compare_predictions_with_actual.
# 'Prediction' is cast to int because it may still be object dtype from the
# mapping cell, which the metric functions may reject.
# NOTE(review): cluster ids are arbitrary - confirm that cluster 1 is the one
# meant to represent "event in progress" before trusting these numbers.
evaluation = X_final.assign(
    y=y['y'],
    Prediction=X_final['Prediction'].astype(int),
)

precision = precision_score(evaluation['y'], evaluation['Prediction'])
recall = recall_score(evaluation['y'], evaluation['Prediction'])

print(f"Precision: {precision}")
print(f"Recall: {recall}")
compare_predictions_with_actual(evaluation)