In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Load dataset
df = pd.read_csv('train.csv')

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

# Sort values for time series consistency
df = df.sort_values(['date', 'country', 'store', 'product'])

# Fill missing values using backward fill and then forward fill
df['num_sold'].fillna(method='bfill', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['num_sold'].fillna(method='bfill', inplace=True)
  df['num_sold'].fillna(method='bfill', inplace=True)


In [None]:
print(df.isna().sum())  # Check for NaNs


id          0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64


In [None]:


# Normalize num_sold using MinMaxScaler
scaler = MinMaxScaler()
df['num_sold'] = scaler.fit_transform(df[['num_sold']])

# Prepare time series data with a sequence length of 30
def create_sequences(data, seq_length=30):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

# Define dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# Define LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=50, num_layers=2, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Use the last output
        return out

# Convert num_sold to numpy and create sequences
num_sold_values = df['num_sold'].values.reshape(-1, 1)
seq_length = 30  # Use past 30 days to predict the 31st day
X, y = create_sequences(num_sold_values, seq_length)

# Split data into train and test sets (80% train, 20% test)
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Create DataLoaders
train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize model, loss function, and optimizer
model = LSTMModel()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the LSTM model
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

# Evaluate model on test data
model.eval()
test_predictions = []
actual_values = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)
        test_predictions.extend(y_pred.numpy())
        actual_values.extend(y_batch.numpy())

# Convert lists to numpy arrays
test_predictions = np.array(test_predictions).flatten()
actual_values = np.array(actual_values).flatten()

# Calculate Mean Squared Error
mse = mean_squared_error(actual_values, test_predictions)
print(f"Mean Squared Error (MSE): {mse:.4f}")


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/20, Loss: 0.0143
Epoch 2/20, Loss: 0.0142
Epoch 3/20, Loss: 0.0142
Epoch 4/20, Loss: 0.0142
Epoch 5/20, Loss: 0.0142
Epoch 6/20, Loss: 0.0142
Epoch 7/20, Loss: 0.0142
Epoch 8/20, Loss: 0.0142
Epoch 9/20, Loss: 0.0142
Epoch 10/20, Loss: 0.0142
Epoch 11/20, Loss: 0.0142
Epoch 12/20, Loss: 0.0142
Epoch 13/20, Loss: 0.0142
Epoch 14/20, Loss: 0.0142
Epoch 15/20, Loss: 0.0142
Epoch 16/20, Loss: 0.0142
Epoch 17/20, Loss: 0.0142
Epoch 18/20, Loss: 0.0142
Epoch 19/20, Loss: 0.0142
Epoch 20/20, Loss: 0.0142
Mean Squared Error (MSE): 0.0095


In [None]:
# Calculate MSE
#mse = mean_squared_error(test_data.values, test_forecast_values)

# Calculate PMSE
pmse = (mse / np.mean(actual_values)) * 100

print(f"Mean Squared Error (MSE): {mse}")
print(f"Percentage Mean Squared Error (PMSE): {pmse:.2f}%")

Mean Squared Error (MSE): 0.009547565132379532
Percentage Mean Squared Error (PMSE): 8.39%
