In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta,date

# 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
sns.set_palette('icefire_r', 2)
import warnings

%matplotlib inline
# Data loading

# `train` is the main dataset used to fit the model; `test` is the dataset used
# to check it.
# NOTE(review): the original comment called the test file the "Titanic" data,
# but the columns used further down (country / store / product / num_sold)
# are sales data - presumably a leftover from another notebook; confirm.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the raw training frame.
train.head()

In [None]:
# Preview the first rows of the raw test frame.
test.head()

In [None]:
# Number of missing values in each column of the training frame.
print(train.isna().sum())

In [None]:
# Number of missing values in each column of the test frame.
print(test.isna().sum())

In [None]:
# Share (in percent) of training rows that fall into each distinct value of the
# categorical columns.
cols = ['country', 'store', 'product']
for col in cols:
  rows_per_value = train.groupby(col)[col].count()
  non_null_rows = train[col].count()
  df_grouped = rows_per_value / non_null_rows * 100
  print(f"Unique value counts and percentages for column {col}:")
  print(df_grouped)

In [None]:
# Share (in percent) of test rows that fall into each distinct value of the
# categorical columns.
cols = ['country', 'store', 'product']
for col in cols:
  rows_per_value = test.groupby(col)[col].count()
  non_null_rows = test[col].count()
  df_grouped = rows_per_value / non_null_rows * 100
  print(f"Unique value counts and percentages for column {col}:")
  print(df_grouped)

In [None]:
# Convert the 'date' column of both frames to pandas datetimes.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [None]:
from sklearn.preprocessing import OrdinalEncoder

Enc_col = ['country', 'store', 'product' ]

# Fit ONE encoder, on the training frame only, and reuse it for the test frame.
# Re-fitting per frame (as this cell used to do) lets the same integer stand for
# different categories in train and test whenever their category sets differ,
# and the mapping printed below would then describe only the last frame fitted.
enc = OrdinalEncoder()
enc.fit(train[Enc_col])

for df in [train, test]:
    # With the encoder's default settings, transform() raises ValueError on a
    # category that was not present in `train`, instead of silently assigning
    # it a code that means something else.
    encoded = np.asarray(enc.transform(df[Enc_col]))
    for position, col in enumerate(Enc_col):
        df[col + '_Enc'] = encoded[:, position]

# Column name -> array of original categories; the position of a category in
# the array is its encoded value.
categories = dict(zip(Enc_col, enc.categories_))

# Print the mapping
for col in Enc_col:
    print(f"Kolumna: {col}")
    for i, category in enumerate(categories[col]):
        print(f"  Wartość zakodowana: {i} -> Wartość początkowa: {category}")

In [None]:
# Columns removed from both frames now that the encoded '*_Enc' copies exist.
# 'id' is not categorical but is dropped together with the raw string columns.
# NOTE(review): if the test ids are needed later (e.g. to label predictions),
# keep a copy before running this cell - confirm.
categorical_features = ['id', 'country', 'store', 'product']
for df in (train, test):
    df.drop(categorical_features, axis=1, inplace=True)

In [None]:
# Inspect the last rows of the training frame after the raw columns were dropped.
train.tail()

In [None]:
# Inspect the first rows of the test frame after the raw columns were dropped.
test.head()

In [None]:
# Summary of the training frame (columns, dtypes) before the dtype downcast.
train.info()

In [None]:
# Shrink the encoded columns to int8 and the target to int16.
# NOTE(review): int16 tops out at 32767 - confirm `num_sold` never exceeds it.
encoded_columns = ['country_Enc', 'store_Enc', 'product_Enc']
for frame in (train, test):
    frame[encoded_columns] = frame[encoded_columns].astype(np.int8)
train['num_sold'] = train['num_sold'].astype(np.int16)

In [None]:
# Summary of the training frame again, to check the dtypes after the downcast.
train.info()

In [None]:
# Summary of the test frame, to check the dtypes after the downcast.
test.info()

In [None]:
# Report whether any training date is missing before 'date' becomes the index.
has_missing_dates = train['date'].isna().any()
print(has_missing_dates)

# Re-index both frames by date (returns new frames; 'date' stops being a column).
train, test = (frame.set_index('date') for frame in (train, test))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam  # Import Adam optimizer


In [None]:
features = ["country_Enc", "store_Enc", "product_Enc"]
target   = 'num_sold'


def _as_float_tensor(values):
    """Return the pandas object's values as a float32 torch tensor."""
    return torch.from_numpy(values.to_numpy()).to(torch.float32)


# Prepare the data for the model: feature columns for both frames, target for
# the training frame only.
X_train, y_train = train[features], train[target]
X_test = test[features]

# Convert the data to PyTorch tensors.
X_tensor_train = _as_float_tensor(X_train)
y_tensor_train = _as_float_tensor(y_train)
X_tensor_test = _as_float_tensor(X_test)

class MyDataset(torch.utils.data.Dataset):
  """Map-style dataset over a feature tensor and an optional label tensor.

  Args:
    X_tensor: features, indexed along the first dimension (one row per sample).
    y_tensor: labels aligned with ``X_tensor`` along the first dimension, or
      ``None`` for an unlabeled dataset (e.g. the test set).

  Raises:
    ValueError: if ``y_tensor`` is given and its length differs from
      ``X_tensor``'s.

  Indexing returns ``(X, y)`` when labels are present and just ``X`` when they
  are not, so an unlabeled dataset can be batched by a ``DataLoader`` (the
  default collation cannot batch ``None``).
  """

  def __init__(self, X_tensor, y_tensor=None):
    # Fail early on misaligned inputs instead of raising an IndexError halfway
    # through an epoch.
    if y_tensor is not None and len(y_tensor) != len(X_tensor):
      raise ValueError(
          f"X_tensor has {len(X_tensor)} rows but y_tensor has {len(y_tensor)}")
    self.X_tensor = X_tensor
    self.y_tensor = y_tensor

  def __getitem__(self, idx):
    X = self.X_tensor[idx]
    if self.y_tensor is None:
      # Unlabeled dataset: there is no label to index.
      return X
    return X, self.y_tensor[idx]

  def __len__(self):
    return len(self.X_tensor)

# Samples per batch; adjust as needed.
batch_size = 512

# The training set carries labels; the test set is built without any.
train_dataset = MyDataset(X_tensor_train, y_tensor_train)
test_dataset = MyDataset(X_tensor_test, None)

# Batch iterators: training batches are reshuffled every epoch, test batches
# keep the original row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer on the top layer's final hidden state.

    Args:
        input_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of values predicted per sequence.
        num_layers: number of stacked LSTM layers (default 2, the value that
            used to be hard-coded).
        dropout: dropout probability applied between stacked LSTM layers
            (default 0.2, the value that used to be hard-coded).

    The LSTM is created with PyTorch's default ``batch_first=False``, so
    ``forward`` expects ``(seq_len, input_dim)`` for a single sequence or
    ``(seq_len, batch, input_dim)`` for a batch.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2, dropout=0.2):
        super(LSTMModel, self).__init__()
        # nn.LSTM only applies dropout between layers and warns when a
        # non-zero value is combined with a single layer, so switch it off then.
        inter_layer_dropout = dropout if num_layers > 1 else 0.0
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            dropout=inter_layer_dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the prediction computed from the last layer's final hidden state.

        Output shape is ``(output_dim,)`` for a single sequence and
        ``(batch, output_dim)`` for a batch.
        """
        # The per-step outputs and the cell state are not used.
        _, (hidden, _) = self.lstm(x)
        # hidden[-1] is the final hidden state of the top LSTM layer.
        return self.fc(hidden[-1])

# Hyperparameters (adjust as needed)
input_dim = len(features)  # Number of features
hidden_dim = 64
# NOTE(review): num_layers is not passed to LSTMModel below, so it has no effect
# on the model that is built; kept because later cells may read it.
num_layers = 1
output_dim = 1  # Assuming predicting single value (num_sold)

model = LSTMModel(input_dim, hidden_dim, output_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Set learning rate

# Training loop: one optimizer step per mini-batch.
num_epochs = 10
print_every = 100
for epoch in range(num_epochs):
  model.train()  # Set model to training mode
  # enumerate() supplies the batch counter `i` used for the progress log below.
  for i, (batch_x, batch_y) in enumerate(train_loader):
    # Clear the gradients left over from the previous step so every update
    # uses only the current batch.
    optimizer.zero_grad()
    # (batch, features) -> (seq_len=1, batch, features), the layout the LSTM
    # expects; the model returns (batch, 1), squeezed to (batch,) so it lines
    # up with batch_y for the loss.
    y_pred = model(batch_x.unsqueeze(0)).squeeze(-1)
    loss = criterion(y_pred, batch_y)
    loss.backward()
    optimizer.step()

    if (i + 1) % print_every == 0:
      print(f'Epoch: {epoch+1}, Batch: {i+1}, Loss: {loss.item()}')
    # # Optional: Validation loop
    # with torch.no_grad():
    #     val_loss = 0.0
    #     for batch_x, batch_y in val_loader:
    #         y_pred = model(batch_x)
    #         val_loss += criterion(y_pred, batch_y).item()
    #     val_loss /= len(val_loader)
    #     print('Validation Loss: {:.4f}'.format(val_loss))