In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Load the data
training_data = np.load('training_data.npy')
valid_periods = np.load('valid_periods.npy')
categories = np.load('categories.npy')

In [4]:
def build_sequences_optimized(data, valid_periods, window=200, stride=20, telescope=18):
    assert window % stride == 0

    num_sequences = len(valid_periods)
    dataset = np.zeros((num_sequences, window))
    labels = np.zeros((num_sequences, telescope))

    for i in range(num_sequences):
        start, end = valid_periods[i]
        actual_entry_end = None
        if end - start < window + telescope:
            if end - start < telescope + 1:
                print("Sequence too short, skipping")
                continue
            else:
                entry = data[i, start:end - telescope]
                if entry.shape[0] == 0:
                    print("Sequence too short, skipping it should be impossible")
                    continue
                entry = np.pad(entry, (window - entry.shape[0], 0), 'edge')
                actual_entry_end = end - telescope
        else:
            if end - start - window - telescope == 0:
                actual_start = start
            else:
                actual_start = np.random.randint(start, end - telescope - window)
            entry = data[i, actual_start:actual_start + window]
            actual_entry_end = actual_start + window
        
        label = data[i, actual_entry_end:actual_entry_end + telescope]
        dataset[i] = entry
        labels[i] = label
    
    #remove empty rows
    non_empty_indices = ~np.all(dataset == 0, axis=1)
    dataset = dataset[non_empty_indices]
    labels = labels[non_empty_indices]
    return dataset, labels

# Usage:
dataset, labels = build_sequences_optimized(training_data, valid_periods)
print("Dataset shape: ", dataset.shape)

Dataset shape:  (48000, 200)


In [5]:
def split_dataset(dataset, labels, split_percentage=0.8):
    # Calculate the index at which to split the data
    split_index = int(len(dataset) * split_percentage)

    # Split the dataset and labels into training and test sets
    train_data, test_data = dataset[:split_index], dataset[split_index:]
    train_labels, test_labels = labels[:split_index], labels[split_index:]

    return train_data, train_labels, test_data, test_labels

# Usage:
train_data, train_labels, test_data, test_labels = split_dataset(dataset, labels)
print("Train data shape: ", train_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape:  (38400, 200)
Test data shape:  (9600, 200)


In [6]:
# Define the categories
category_dict = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}

# Convert categories to numerical values
numerical_categories = np.vectorize(category_dict.get)(categories)

# Find the maximum length of the valid periods
max_length = max(valid_periods[:, 1] - valid_periods[:, 0])

# Pad the sequences in valid_training_data to the maximum length
valid_training_data_padded = np.array([np.pad(training_data[i, valid_periods[i, 0]:valid_periods[i, 1]], (0, max_length - (valid_periods[i, 1] - valid_periods[i, 0]))) for i in range(training_data.shape[0])])

# Use valid_training_data_padded instead of valid_training_data
X_train, X_test, y_train, y_test = train_test_split(valid_training_data_padded, numerical_categories, test_size=0.2, random_state=42)

# Train a forecasting model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(train_data, train_labels)

# Make predictions on the test data
predictions = model.predict(test_data)

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt

# Calculate Mean Absolute Error
mae = mean_absolute_error(test_labels, predictions)

# Calculate Mean Squared Error
mse = mean_squared_error(test_labels, predictions)

# Calculate Root Mean Squared Error
rmse = sqrt(mse)

# Calculate Mean Absolute Percentage Error
epsilon = 1e-10  # small constant to avoid division by zero
mape = np.mean(np.abs((test_labels - predictions) / (test_labels + epsilon))) * 100

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"MAPE: {mape}%")

MAE: 0.11333084585729576
MSE: 0.025716736011604804
RMSE: 0.16036438510967704
MAPE: 2296764040.5367875%
