In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
# Load the processed series and take the sunspot column as a NumPy array.
df = pd.read_csv('../data/processed_sunspot_data.csv')
data = df['SunspotNumber'].values

# Windowing configuration.
seq_length = 12  # Using past 12 months to predict the next month
prediction_step = 1  # Predicting 1 month ahead

# Number of (window, target) pairs the series can provide. Fail early with a
# clear message instead of silently producing empty, oddly-shaped arrays.
n_windows = len(data) - seq_length - prediction_step + 1
if n_windows <= 0:
    raise ValueError(
        f"Series has {len(data)} points; need at least "
        f"{seq_length + prediction_step} to build one sequence."
    )

# Normalize the data to the range [0, 1] (min-max scaling).
# NOTE(review): min_val/max_val are taken over the whole series, i.e. before the
# chronological train/val/test split done later in this notebook, so the scaling
# statistics include the validation and test periods. Consider deriving them
# from the training portion only if strictly leakage-free evaluation is needed.
min_val = np.min(data)
max_val = np.max(data)
value_range = max_val - min_val
# A NaN in the data makes the range NaN; a constant series makes it 0. Either
# case would turn every normalized value into NaN/inf, so reject it explicitly.
if not np.isfinite(value_range) or value_range == 0:
    raise ValueError(
        f"Cannot min-max normalize: min={min_val}, max={max_val} "
        "(series is constant or contains non-finite values)."
    )
normalized_data = (data - min_val) / value_range

# Create sequences for time series prediction.
# Row i of X holds normalized_data[i : i + seq_length]; y[i] is the value
# prediction_step steps after the end of that window. Broadcasting the start
# offsets against the in-window offsets builds all windows in one indexing op.
window_starts = np.arange(n_windows)
X = normalized_data[window_starts[:, None] + np.arange(seq_length)]
y = normalized_data[window_starts + seq_length + prediction_step - 1]


In [3]:
# Wrap the NumPy windows and targets as float32 PyTorch tensors.
X_tensor, y_tensor = (torch.tensor(arr, dtype=torch.float32) for arr in (X, y))

# Chronological split: the first 70% of windows train, the next 15% validate,
# and whatever remains (rounding included) is the test set. No shuffling.
train_ratio, val_ratio = 0.7, 0.15
N = len(X_tensor)
train_size = int(N * train_ratio)
val_size = int(N * val_ratio)
test_size = N - (train_size + val_size)

# Express each partition once as a slice and apply it to both tensors.
val_end = train_size + val_size
train_part = slice(None, train_size)
val_part = slice(train_size, val_end)
test_part = slice(val_end, None)

X_train, y_train = X_tensor[train_part], y_tensor[train_part]
X_val, y_val = X_tensor[val_part], y_tensor[val_part]
X_test, y_test = X_tensor[test_part], y_tensor[test_part]


In [4]:
# Sanity check: report the feature/target tensor shapes of every split.
for split_name, features, targets in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name} shapes: X={features.shape}, y={targets.shape}")

Train shapes: X=torch.Size([2316, 12]), y=torch.Size([2316])
Validation shapes: X=torch.Size([496, 12]), y=torch.Size([496])
Test shapes: X=torch.Size([497, 12]), y=torch.Size([497])


In [5]:
# Collect everything to persist: the six split arrays (converted back to
# NumPy), the min/max used for scaling, and the windowing settings.
archive_contents = {
    'X_train': X_train.numpy(), 'y_train': y_train.numpy(),
    'X_val': X_val.numpy(), 'y_val': y_val.numpy(),
    'X_test': X_test.numpy(), 'y_test': y_test.numpy(),
    'min_val': min_val, 'max_val': max_val,
    'seq_length': seq_length, 'prediction_step': prediction_step,
}

# Each dict key becomes the name of one array inside the .npz archive.
np.savez('../data/sunspot_sequences.npz', **archive_contents)