In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from sklearn.metrics import accuracy_score
from sktime.transformations.series.adapt import TabularToSeriesAdaptor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from pyts.approximation import paa as paa
from sklearn.model_selection import train_test_split
from sktime.classification.shapelet_based import ShapeletTransformClassifier
from sklearn.metrics import classification_report
from tensorflow.keras.optimizers import Adam
from tslearn.shapelets import LearningShapelets, grabocka_params_to_shapelet_size_dict
from tslearn.utils import to_time_series_dataset

# Load the data

In [None]:
def load_npy(filename):
    """Read the NumPy ``.npy`` file at ``filename`` and return the stored array."""
    array = np.load(filename)
    return array


dir_path = 'cleaned_time_series/'
len_threshold = 1280  # every series is truncated or padded to exactly this many points
X, y, ids = [], [], []

# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting fixes
# the sample order, so the seeded train/test splits further down are reproducible.
for file in sorted(os.listdir(dir_path)):
    stem, extension = os.path.splitext(file)
    if extension != '.npy':
        continue

    # File names look like "<track_id>_<genre>.npy". Split only on the first
    # underscore so that a genre containing "_" is kept whole.
    track_id, genre = stem.split("_", 1)
    ts = load_npy(os.path.join(dir_path, file))

    if len(ts) == 0:
        # Padding repeats the last observation, which an empty series does not have.
        raise ValueError(f"{file} contains an empty time series and cannot be padded")

    if len(ts) > len_threshold:
        ts = ts[:len_threshold]
    else:
        # Pad short series by repeating their last observation
        pad = [ts[-1]] * (len_threshold - len(ts))
        ts = np.append(ts, pad)

    ids.append(track_id)
    y.append(genre)
    X.append([ts])  # extra list level -> one channel per series: (n, 1, len_threshold)

X, y, ids = np.array(X), np.array(y), np.array(ids)
print(len(X))

# Classification

## Encode the classes into int values

In [None]:
# Replace the genre strings in `y` with the integer codes produced by LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)
original_labels = encoder.classes_

# Print, for every original label, the integer it was mapped to
for i in range(len(original_labels)):
    label = original_labels[i]
    print(f"{label} -> {i}")

## Approximate with PAA

In [ ]:
# Piecewise Aggregate Approximation: every window of 4 consecutive points is
# collapsed into a single value, shortening each series by a factor of 4.
approximator = paa.PiecewiseAggregateApproximation(window_size=4)

# Drop the singleton channel axis so the transformer receives a 2-D
# (n_series, n_points) array. The length comes from `len_threshold` (set in the
# loading cell) instead of a repeated literal, so the two cannot drift apart.
X_paa = approximator.transform(X.reshape(-1, len_threshold))

print(X_paa.shape)

In [None]:
# Stratified 80/20 split of the PAA-reduced series.
# The series length is read from X_paa itself (last axis) rather than hard-coded,
# so this works for any PAA window size and whether X_paa is 2-D or already 3-D.
X_train, X_test, y_train, y_test = train_test_split(
    X_paa.reshape(-1, 1, X_paa.shape[-1]), y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Wrap sklearn's MinMaxScaler so it can be applied to the time-series arrays.
# NOTE(review): fit_in_transform=True presumably makes the wrapped scaler fit inside
# transform() instead of reusing what fit() learned -- confirm against the sktime docs.
min_max = MinMaxScaler()
scaler = TabularToSeriesAdaptor(min_max, fit_in_transform=True)
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

# KNN with DTW

## KNN with DTW distance (BINARY CLASSIFICATION TASK)

We need to filter the data so that only two classes remain: "Heavy-Metal" and "Piano".

In [None]:
# Keep only the samples of the two genres used in the binary task.
# NOTE(review): the codes 4 ("heavy-metal") and 12 ("piano") are hard-coded; verify
# them against the label -> integer mapping printed by the encoding cell.
heavy_metal_indices = np.flatnonzero(y == 4)
piano_indices = np.flatnonzero(y == 12)

# All heavy-metal positions first, followed by all piano positions
combined_indices = np.concatenate((heavy_metal_indices, piano_indices))

# Select the matching series, labels and track ids
X_filtered, y_filtered = X[combined_indices], y[combined_indices]
ids_filtered = np.array(ids)[combined_indices]

Split the filtered data into training and test sets.

In [None]:
# Stratified 90/10 split of the two-genre subset
X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(
    X_filtered, y_filtered, test_size=0.1, random_state=42, stratify=y_filtered
)

# The binary KNN cells fit and predict on `X_train_scaled_binary` and
# `X_test_scaled_binary`. Define them here, scaled the same way as the multiclass
# split, so the notebook runs top to bottom on a fresh kernel.
scaler_binary = TabularToSeriesAdaptor(MinMaxScaler(), fit_in_transform=True)
X_train_scaled_binary = scaler_binary.fit_transform(X_train_binary)
X_test_scaled_binary = scaler_binary.transform(X_test_binary)

X_train_binary.shape, X_test_binary.shape, y_train_binary.shape, y_test_binary.shape

In [None]:
# 5-nearest-neighbour classifier with DTW distance on the two-genre subset
knn_binary = KNeighborsTimeSeriesClassifier(
    n_neighbors=5,
    distance="dtw",
    n_jobs=-1,
)
knn_binary.fit(X_train_scaled_binary, y_train_binary)

# Evaluate on the held-out binary test split: accuracy first, then the per-class report
y_pred_binary = knn_binary.predict(X_test_scaled_binary)
print(
    accuracy_score(y_test_binary, y_pred_binary),
    classification_report(y_test_binary, y_pred_binary),
    sep="\n",
)

## KNN with DTW distance (MULTICLASS CLASSIFICATION TASK)

In [None]:
# 7-nearest-neighbour classifier with DTW distance on all genres
knn = KNeighborsTimeSeriesClassifier(
    n_neighbors=7,
    distance="dtw",
    n_jobs=-1,
)
knn.fit(X_train_scaled, y_train)

# Accuracy on the held-out multiclass test split
y_pred = knn.predict(X_test_scaled)
dtw_accuracy = accuracy_score(y_test, y_pred)
print(dtw_accuracy)

# KNN with Euclidean distance

## KNN with Euclidean distance (MULTICLASS CLASSIFICATION TASK)

In [ ]:
# 7-nearest-neighbour classifier with Euclidean distance on all genres
knn = KNeighborsTimeSeriesClassifier(
    n_neighbors=7,
    distance="euclidean",
    n_jobs=-1,
)
knn.fit(X_train_scaled, y_train)

# Accuracy and per-class report on the held-out multiclass test split
y_pred_multi = knn.predict(X_test_scaled)
print(
    f'Accuracy score:{accuracy_score(y_test, y_pred_multi)}',
    classification_report(y_test, y_pred_multi),
    sep="\n",
)

## KNN with Euclidean distance (BINARY CLASSIFICATION TASK)

In [ ]:
# 5-nearest-neighbour classifier with Euclidean distance on the two-genre subset
knn_binary = KNeighborsTimeSeriesClassifier(
    n_neighbors=5,
    distance="euclidean",
    n_jobs=-1,
)
knn_binary.fit(X_train_scaled_binary, y_train_binary)

# Evaluate on the held-out binary test split: accuracy first, then the per-class report
y_pred_binary = knn_binary.predict(X_test_scaled_binary)
print(
    accuracy_score(y_test_binary, y_pred_binary),
    classification_report(y_test_binary, y_pred_binary),
    sep="\n",
)

# Shapelets

## With tslearn library, using adam optimizer and Learning Shapelets algorithm

Shapelets are subsequences that can be used to represent a class. Matrix profiles make it possible to identify these shapelets.


In [None]:
# Give X_paa an explicit channel axis: (n_series, n_points) -> (n_series, 1, n_points),
# e.g. (10000, 320) -> (10000, 1, 320).
# The series length is taken from the last axis and the first axis is inferred, so the
# cell is idempotent: re-running it on an already 3-D array leaves the shape unchanged.
X_paa = X_paa.reshape(-1, 1, X_paa.shape[-1])
X_paa.shape

In [None]:
# Stratified 80/20 split of the 3-D PAA array, with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_paa, y, **split_options)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [ ]:
# Convert both splits with tslearn's to_time_series_dataset helper
X_train, X_test = to_time_series_dataset(X_train), to_time_series_dataset(X_test)

In [None]:
# Min-max scale the converted splits through sktime's tabular-to-series adaptor.
# NOTE(review): fit_in_transform=True presumably makes the wrapped scaler fit inside
# transform() instead of reusing what fit() learned -- confirm against the sktime docs.
min_max = MinMaxScaler()
scaler = TabularToSeriesAdaptor(min_max, fit_in_transform=True)
X_train_scaled, X_test_scaled = scaler.fit_transform(X_train), scaler.transform(X_test)

In [ ]:
# tslearn expects (n_series, n_points, n_dimensions); the series here are univariate,
# so the last axis has size 1.
# The point axis is inferred with -1 instead of being read from shape[2], which keeps
# the cell idempotent: re-running it on arrays that are already (n, n_points, 1) leaves
# them unchanged instead of trying to reshape to (n, 1, 1).
X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], -1, 1)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], -1, 1)

X_train_scaled.shape, X_test_scaled.shape

In [None]:
# Instantiate sktime's ShapeletTransformClassifier with its default settings.
# NOTE(review): `st` is not referenced by any later cell visible here -- confirm it is still needed.
st = ShapeletTransformClassifier()

In [ ]:
# Keras Adam optimizer with learning rate 0.001; handed to LearningShapelets as `optimizer`.
adam = Adam(learning_rate=0.001)

In [ ]:
%%time
# Shapelet configuration as {shapelet length: number of shapelets}: 1200 shapelets of
# length 16 (the values are summed later to obtain the total shapelet count).
# NOTE(review): key/value meaning taken from tslearn's `n_shapelets_per_size` -- confirm in the docs.
shapelet_sizes = {16: 1200}


# Define the model and fit it using the training data.
# scale=False: the inputs were already min-max scaled in an earlier cell.
shp_clf = LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            weight_regularizer=0.001,
                            optimizer=adam,
                            max_iter=2000,
                            verbose=1,
                            scale=False,
                            random_state=42)
shp_clf.fit(X_train_scaled, y_train)

In [ ]:
# Total number of shapelets requested across all configured sizes
n_shapelets = sum(count for count in shapelet_sizes.values())

# For the training series: the (minimal) distance from each series to each shapelet,
# and the index in the series at which that minimal distance was found
distances, predicted_locations = (
    shp_clf.transform(X_train_scaled),
    shp_clf.locate(X_train_scaled),
)

In [ ]:
# Display the total shapelet count (sum of the values in `shapelet_sizes`)
n_shapelets

In [ ]:
# Display the output of shp_clf.transform() on the training series
distances

In [ ]:
# Display the output of shp_clf.locate() on the training series
predicted_locations

In [ ]:
# The learned shapelets are exposed on the fitted model as `shapelets_`;
# keep a reference for the plotting cells and display them.
shapelets = shp_clf.shapelets_
shapelets

In [ ]:
# Evaluate the fitted shapelet model on the held-out test split.
# Predict once and reuse the predictions for both metrics. The accuracy is printed
# explicitly because a bare expression is only displayed when it is the last line
# of a cell.
y_pred = shp_clf.predict(X_test_scaled)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Plotting the most important shapelet match for each time series

In [ ]:
num_instances = 50

# One stacked subplot per plotted training series
fig, axs = plt.subplots(num_instances, 1, figsize=(10, num_instances * 3))

for idx in range(len(axs)):
    ax = axs[idx]
    series = X_train_scaled[idx]

    # Draw the complete series
    ax.plot(series.ravel(), "r-", label='Time Series')

    # Shapelet with the smallest distance to this series, and where it matches
    min_dist_idx = np.argmin(distances[idx])
    start_pos = predicted_locations[idx, min_dist_idx]
    shp = shapelets[min_dist_idx]

    # Highlight the matched stretch unless the location is the -1 sentinel
    if start_pos != -1:
        stop_pos = start_pos + len(shp)
        matched_segment = series[start_pos:stop_pos].ravel()
        ax.plot(range(start_pos, stop_pos), matched_segment, "g-", linewidth=2, label='Shapelet Match')

    # Legend, title and axis labels
    ax.legend()
    ax.set_title(f"Time Series {idx + 1} with Most Important Shapelet Match")
    ax.set(xlabel="Time", ylabel="Value")

# Avoid overlapping subplots, then render
plt.tight_layout()
plt.show()


## Plotting the time series with a specific track ID and its most important shapelet match

In [ ]:
# Track ID whose time series should be plotted
specific_id = "1U5HTINVvE7oSSscuD3Gjm"

# `X_train_scaled`, `distances` and `predicted_locations` are indexed by position in
# the (shuffled) training split, whereas `ids` follows the order of the full data set.
# Re-create the split on `ids` with the same arguments used for X_paa (test size, seed
# and stratification) so that ids_train[k] is the track ID of X_train_scaled[k].
ids_train, ids_test = train_test_split(
    ids, test_size=0.2, random_state=42, stratify=y
)
assert len(ids_train) == len(X_train_scaled), "ids_train is not aligned with X_train_scaled"

# Training-split rows that belong to the requested track
matching_rows = np.flatnonzero(ids_train == specific_id)
num_instances = len(matching_rows)

if num_instances == 0:
    # plt.subplots() cannot build a grid with zero rows, so report and stop here
    print(f"ID {specific_id} was not found in the training split; nothing to plot.")
else:
    fig, axs = plt.subplots(num_instances, 1, figsize=(10, num_instances * 3), squeeze=False)

    for plot_count, idx in enumerate(matching_rows):
        ax = axs[plot_count, 0]  # subplot for the current match

        # Plot the time series
        ax.plot(X_train_scaled[idx].ravel(), "r-", label='Time Series')

        # Find the shapelet with the smallest distance to the time series
        min_dist_idx = np.argmin(distances[idx])

        # Start position of this shapelet in the time series
        start_pos = predicted_locations[idx, min_dist_idx]

        # The shapelet itself (only its length is needed to delimit the match)
        shp = shapelets[min_dist_idx]

        if start_pos != -1:  # Only plot if the shapelet matches the time series
            # Segment of the time series where the shapelet matches
            matched_segment = X_train_scaled[idx, start_pos:start_pos + len(shp)].ravel()
            ax.plot(range(start_pos, start_pos + len(shp)), matched_segment, "g-", linewidth=2, label='Shapelet Match')

        # Legend, title and axis labels
        ax.legend()
        ax.set_title(f"Time Series with Most Important Shapelet Match of ID {specific_id}")
        ax.set_xlabel("Time")
        ax.set_ylabel("Value")

    # Adjust layout to prevent overlapping
    plt.tight_layout()
    plt.show()