# Modelo - Validação - Teste 📊 🧪

In [2]:
import polars as pl
import tensorflow as tf
import tqdm

In [3]:
train_dataset = pl.read_parquet("data/transformed_train_data/sessions_with_more_than_2_clicks.parquet")

In [4]:
display(train_dataset.shape[0])
display(train_dataset.head(5))

10161584

session,sorted_events,items_clicked,items_carted,items_ordered,items_clicked_count,items_carted_count,items_ordered_count
i64,list[struct[3]],list[i64],list[i64],list[i64],i64,i64,i64
12125765,"[{236629,1661455845476,""clicks""}, {510059,1661455883265,""clicks""}, {236629,1661455939286,""clicks""}]","[236629, 510059, 236629]",[],[],3,0,0
12125766,"[{1119434,1661455845769,""clicks""}, {1119434,1661455891538,""clicks""}, … {1119434,1661498592931,""clicks""}]","[1119434, 1119434, … 1119434]",[],[],18,0,0
12125767,"[{1321238,1661455845771,""clicks""}, {1389738,1661455955373,""clicks""}, … {129869,1661456097095,""clicks""}]","[1321238, 1389738, … 129869]",[],[],6,0,0
12125768,"[{1676761,1661455845774,""clicks""}, {1676761,1661455899065,""clicks""}, … {661746,1661717837490,""clicks""}]","[1676761, 1676761, … 661746]",[],[],9,0,0
12125770,"[{917900,1661455846705,""clicks""}, {1033924,1661714173716,""clicks""}, … {1033924,1661714269437,""clicks""}]","[917900, 1033924, … 1033924]",[],[],4,0,0


In [9]:
# count all possible items clicked
all_items = []
for session_items in tqdm.tqdm(train_dataset["items_clicked"].to_list()):
    all_items.extend(session_items)
    
all_items = list(set(all_items))

100%|██████████| 10161584/10161584 [00:04<00:00, 2303675.36it/s]


In [11]:
display(len(all_items))

1855574

## Gerando dados de treinamento 🏋️

In [15]:
import tqdm

def generate_training_data(
    dataset: pl.DataFrame,
    window_size: int = 2,
    num_negatives: int = 3,
    num_products: int = 1855574,
    seed: int = 42,
) -> tuple:
    """
    Gera os dados de treinamento para o modelo Prod2Vec.
    Tipo: pares de skip-gram
    
    Args:
    ------
        * dataset (pl.DataFrame): Dataset containing the sessions with more than 2 clicks(must have items_clicked column!).
        * window_size (int): Size of the window to be used in the skip-gram model.
        * num_negatives (int): Number of negative samples to be used in the loss function.
        * num_products (int): Number of products in the dataset.
        * seed (int): Seed to be used in the random number generator.
    """
    targets, contexts, labels = [], [], []
    
    # Build sampling table for `num_products` products assuming Zipf's law distribution for the frequencies.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(num_products)
    
   
    # iterate through all sequences (sessions) in the dataset 
    for sequence in tqdm.tqdm(dataset["items_clicked"].to_list()):
        # Generate positive skip-gram pairs for a sequence (session).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size=num_products,
            window_size=window_size,
            sampling_table=sampling_table,
            negative_samples=0,
            shuffle=False,
            seed=seed,
        )
        
        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context product and negative samples.
        for target_product, context_product in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_product], dtype="int64"), 1
            )
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_negatives,
                unique=True,
                range_max=num_products,
                seed=seed,
                name="negative_sampling",
            )
            
        # Build context and label vectors (for one target product)
        context = tf.concat([tf.squeeze(context_class, 1), negative_sampling_candidates], 0)
        label = tf.constant([1] + [0] * num_negatives, dtype="int64")
        
        # Append each element from the training example to global lists.
        targets.append(target_product)
        contexts.append(context)
        labels.append(label)
        
    return targets, contexts, labels
        
    

In [17]:
skip_gram_pairs = generate_training_data(
    train_dataset,
    seed=1
)

  0%|          | 15336/10161584 [00:26<4:50:09, 582.81it/s]


IndexError: index 1855594 is out of bounds for axis 0 with size 1855574