In [10]:
import pandas as pd
import polars as pl
import pandasql as ps
from skimpy import skim
import torch
import torch.nn as nn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
import torch.optim as optim
from sklearn.metrics import mean_absolute_error
from torch.utils.data import DataLoader, TensorDataset

In [11]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Utilisation de l'appareil : {device}")

Utilisation de l'appareil : mps


In [12]:
agg_sales_train='/Users/arthurgaujoux/Desktop/Farpoint/predict_future_sales/df_agg_sales_train.csv'
data_pd=pd.read_csv(
    agg_sales_train,
    dtype={
        'date': 'string',
        'shop_id': 'int64',
        'item_id': 'int64',
        'rescaling_item_cnt_day': 'int64',
        'monday_flag': 'int64',
        'tuesday_flag': 'int64',
        'wednesday_flag': 'int64',
        'thursday_flag': 'int64',
        'friday_flag': 'int64',
        'saturday_flag': 'int64',
        'sunday_flag': 'int64',
        'week_end_flag': 'int64',
        'january_flag': 'int64',
        'february_flag': 'int64',
        'march_flag': 'int64',
        'april_flag': 'int64',
        'may_flag': 'int64',
        'june_flag': 'int64',
        'july_flag': 'int64',
        'august_flag': 'int64',
        'september_flag': 'int64',
        'october_flag': 'int64',
        'november_flag': 'int64',
        'december_flag': 'int64',
        'holiday_flag': 'int64'
    }
)

In [13]:
#################### sampling data ###################
query = """ 
select
    date,
    shop_id,
    item_id,
    rescaling_item_cnt_day,
    monday_flag,
    tuesday_flag,
    wednesday_flag,
    thursday_flag,
    friday_flag, 
    saturday_flag, 
    sunday_flag, 
    week_end_flag, 
    january_flag, 
    february_flag, 
    march_flag, 
    april_flag, 
    may_flag, 
    june_flag, 
    july_flag, 
    august_flag, 
    september_flag,
    october_flag,
    november_flag,
    december_flag,
    holiday_flag
from data_pd
where date >= '2013-01-01'
and date <='2013-06-30'
"""
sample_data_pd = ps.sqldf(query, locals())

print(sample_data_pd)

sample_data_pd=sample_data_pd[sample_data_pd['rescaling_item_cnt_day']<100]


              date  shop_id  item_id  rescaling_item_cnt_day  monday_flag  \
0       2013-01-01        2      991                      23            0   
1       2013-01-01        2     1472                      23            0   
2       2013-01-01        2     1905                      23            0   
3       2013-01-01        2     2920                      24            0   
4       2013-01-01        2     3320                      23            0   
...            ...      ...      ...                     ...          ...   
631916  2013-06-30       59    18755                      23            0   
631917  2013-06-30       59    19864                      23            0   
631918  2013-06-30       59    20949                      23            0   
631919  2013-06-30       59    21487                      24            0   
631920  2013-06-30       59    22087                      26            0   

        tuesday_flag  wednesday_flag  thursday_flag  friday_flag  \
0      

In [14]:
data_pd=data_pd[data_pd['rescaling_item_cnt_day']<100]
skim(data_pd[['shop_id', 'item_id', 'rescaling_item_cnt_day']])

In [15]:
def create_sequences_with_flags(data, target_col, sequence_length):
    """
    Crée des séquences avec les identifiants, les flags temporels et les ventes cibles.
    
    Args:
        data (pd.DataFrame): Données d'entrée, avec les flags et autres colonnes nécessaires.
        target_col (str): Colonne cible (par exemple, 'ct_item_day').
        sequence_length (int): Longueur des séquences glissantes.

    Returns:
        X (np.ndarray): Séquences d'entrée (avec identifiants et flags).
        y (np.ndarray): Valeurs cibles (les ventes à prédire).
    """
    X, y = [], []
    
    # Trier les données par shop_id, item_id et date pour conserver l'ordre chronologique
    data = data.sort_values(by=["shop_id", "item_id", "date"]).reset_index(drop=True)
    
    # Grouper les données par shop_id et item_id
    grouped = data.groupby(["shop_id", "item_id"])
    
    for (shop_id, item_id), group in grouped:
        for i in range(len(group) - sequence_length):
            # Créer une séquence des valeurs cibles (target_col)
            seq = group[target_col].iloc[i:i + sequence_length].values
            
            # Sélectionner les flags temporels pour chaque point dans la séquence
            seq_flags = group[[
                'monday_flag', 'tuesday_flag', 'wednesday_flag', 'thursday_flag', 'friday_flag', 
                'saturday_flag', 'sunday_flag', 'week_end_flag', 
                'january_flag', 'february_flag', 'march_flag', 'april_flag', 'may_flag', 
                'june_flag', 'july_flag', 'august_flag', 'september_flag', 'october_flag', 
                'november_flag', 'december_flag', 'holiday_flag'
            ]].iloc[i:i + sequence_length].values

            # Ajouter les identifiants (shop_id, item_id) à chaque point de la séquence
            seq_with_flags = np.hstack((np.repeat([[shop_id, item_id]], sequence_length, axis=0), 
                                        seq_flags))  # Ajoute les identifiants avec les flags
            
            # Ajouter les ventes cibles à la fin de la séquence
            seq_with_target = np.hstack((seq_with_flags, seq.reshape(-1, 1)))  # Ajoute les ventes (target_col)
            
            # La cible pour la prochaine valeur après la séquence
            target = group[target_col].iloc[i + sequence_length]
            
            # Ajouter la séquence et la cible à la liste
            X.append(seq_with_target)
            y.append(target)
    
    return np.array(X), np.array(y)


In [16]:
class TransformerLSTMModel(nn.Module):
    def __init__(self, input_size, lstm_hidden_size, num_layers, nhead):
        super(TransformerLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, lstm_hidden_size, num_layers, batch_first=True)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(d_model=lstm_hidden_size, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer_encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(lstm_hidden_size, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        transformer_out = self.transformer_encoder(lstm_out.permute(1, 0, 2))
        out = self.fc(self.relu(transformer_out[:, -1, :]))
        return out


In [17]:
def main(
        sequence_length,
        input_size,
        lstm_hidden_size,
        num_layers,
        nhead,
        learning_rate,
        batch_size
    ):

    train_losses = []
    validation_accuracies = []
    validation_losses = []
    val_predictions = []
    val_labels = []

    # create sequence
    X, y = create_sequences_with_flags(data=sample_data_pd, target_col="rescaling_item_cnt_day", sequence_length=sequence_length)

    # timne series split
    tscv = TimeSeriesSplit(n_splits=3)
    criterion = nn.MSELoss()

    for train_index, val_index in tscv.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

    # rescaling the target
    scaler = MinMaxScaler()
    y_train_scaled = scaler.fit_transform(y_train.reshape(-1, 1))
    y_val_scaled = scaler.transform(y_val.reshape(-1, 1))

    # creating tensor
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32).permute(0, 2, 1).to(device)
    y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
    X_val_tensor = torch.tensor(X_val, dtype=torch.float32).permute(0, 2, 1).to(device)
    y_val_tensor = torch.tensor(y_val_scaled, dtype=torch.float).to(device)


    model = TransformerLSTMModel(input_size, lstm_hidden_size, num_layers, nhead).to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Creating of a dataloader, without mixing (shuffle=False)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

    # Creating of a dataloader for validation
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Entraînement du modèle
    epochs = 100
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)  # transfer to GPU 
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        
        # Calculation of the average loss for the epoch 
        avg_loss = epoch_loss / len(train_loader)
        train_losses.append(avg_loss)

        # Model Validation
        model.eval()
        val_predictions = []  
        val_labels = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                batch_predictions = model(X_batch)
                val_predictions.append(batch_predictions.cpu().numpy())
                val_labels.append(y_batch.cpu().numpy())
        
        val_predictions = np.concatenate(val_predictions, axis=0)
        val_labels = np.concatenate(val_labels, axis=0)

        val_loss = criterion(torch.tensor(val_predictions), torch.tensor(val_labels))
        validation_losses.append(val_loss.item())

        # Calcul de la précision pour la validation
        mae_val = mean_absolute_error(val_labels, val_predictions)
        val_accuracy = 100 - (mae_val / np.mean(val_labels) * 100)
        validation_accuracies.append(val_accuracy)

    # Analysis
    print(f"Validation Loss moyenne: {np.mean(validation_losses):.4f}")
    print(f"Précision moyenne: {np.mean(validation_accuracies):.2f}%")
    print(f"Précision de validation à la dernière époque: {validation_accuracies[-1]:.2f}%")

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, epochs + 1), train_losses, label="Train Loss", color="blue")
    plt.plot(range(1, epochs + 1), validation_losses, label="Validation Loss", color="red")
    plt.title("Train and Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)
    plt.show()

    plt.figure(figsize=(12, 6))
    plt.plot(range(1, epochs + 1), validation_accuracies, label="Validation Accuracy", color="green")
    plt.title("Validation Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy (%)")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
#1. Comparaison des Pertes d'Entraînement et de Validation:
#Si la perte d'entraînement est beaucoup plus faible que la perte de validation, cela peut indiquer un sur-apprentissage. Cela signifie que le modèle s'adapte trop bien aux données d'entraînement mais ne généralise pas bien aux nouvelles données.
#2. Visualisation des Courbes d'Apprentissage:
#Tracez les courbes de perte d'entraînement et de validation. Si la courbe de validation commence à augmenter alors que la courbe d'entraînement continue de diminuer, cela peut être un signe de sur-apprentissage.

In [None]:
#import optuna
#
#def objective(trial):
#    lstm_hidden_size = trial.suggest_categorical('lstm_hidden_size', [64, 128, 256])
#    nhead = trial.suggest_int('nhead', 2, 8)
#    num_layers = trial.suggest_int('num_layers', 1, 3)
#    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
#    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
#    sequence_length = trial.suggest_categorical('sequence_length', [14, 30])
#    
#    # Code pour entraîner le modèle ici...
#    return validation_loss  # Minimise cette valeur

In [19]:
main(
    sequence_length = 7,
    input_size=24,
    lstm_hidden_size = 128,
    num_layers = 3,
    nhead = 8,
    learning_rate = 0.001,
    batch_size = 5000
)



RuntimeError: input.size(-1) must be equal to input_size. Expected 24, got 7