In [None]:
# Shell escape: installs the cuda-python package into the current environment.
# NOTE(review): no cell visible in this notebook imports it directly - confirm
# it is still required.
!pip install cuda-python

In [None]:
import pandas as pd
import numpy as np
from torch import nn, Tensor
import torch


# Helper that builds the causal attention mask for the encoder input
from torch.utils.data import IterableDataset

def generate_encoder_input_mask(dim: int, batchsize: int, num_heads: int) -> Tensor:
    """
    Build an additive causal (look-ahead) attention mask.

    Entry ``[i, j]`` is ``-inf`` when ``j > i`` and ``0`` otherwise, so that,
    once added to attention logits, position ``i`` can only attend to
    positions ``j <= i``.

    Args:
        dim: side length of the square mask; for source masking this is the
             encoder sequence length (the length of the model input).
        batchsize: unused. Kept so existing call sites keep working.
        num_heads: unused. Kept so existing call sites keep working.
    Return:
        A float Tensor of shape (dim, dim). The same 2-D mask is shared by
        every batch element and attention head.
    """
    # torch.full avoids materialising a ones-matrix just to scale it by -inf;
    # triu(diagonal=1) keeps the strict upper triangle and zeroes the rest.
    return torch.triu(torch.full((dim, dim), float('-inf')), diagonal=1)



In [None]:
import torch
from torch import Tensor
import numpy as np


def ndcg_at_ip(predictions: Tensor, positive_index: Tensor, k: int) -> float:
    """
    Discounted cumulative gain of a single positive item within the top ``k``
    of a ranked prediction list.

    Every position ``i`` (0-based) among the first ``k`` predictions that
    equals ``positive_index`` contributes ``1 / log2(i + 2)``.

    Args:
    - predictions: ranked item ids, best first. Any sliceable sequence works
      (the caller in this file passes a plain Python list).
    - positive_index: id of the relevant item (an int or a one-element tensor).
    - k: number of leading predictions to inspect.

    Returns:
    - dcg: the DCG value as a Python float; ``0.0`` when the positive item is
      not among the first ``k`` predictions.
    """
    dcg = 0.0
    for rank, item in enumerate(predictions[:k]):
        if item == positive_index:
            # rank is 0-based, hence the +2 inside the logarithm.
            dcg += 1.0 / float(np.log2(rank + 2.0))

    return dcg

def ndcg_batch(scores: Tensor, positive_batch: Tensor, k: int = 20):
    """
    Sum the per-row DCG@k over a batch.

    Args:
    - scores: 2-D tensor with one row of item scores per sequence; the column
      index is used as the item id.
    - positive_batch: the relevant item id for each row, one entry per row.
    - k: ranking cutoff. Defaults to 20, the cutoff that used to be hard-coded.

    Returns:
    - the sum (not the mean) of ``ndcg_at_ip`` over all rows; callers divide by
      the number of sequences themselves.
    """
    num_rows = len(positive_batch)
    if num_rows == 0:
        return 0.0

    # Only the first k ranks are ever scored, so asking topk for more is wasted
    # work; clamping to the column count keeps small score matrices from raising.
    cutoff = min(k, scores.size(-1))

    # A single batched topk and a single host transfer instead of one per row.
    top_items = torch.topk(scores[:num_rows], cutoff, dim=-1).indices.tolist()
    if isinstance(positive_batch, Tensor):
        positives = positive_batch.tolist()
    else:
        positives = list(positive_batch)

    batch_dcg = 0.0
    for ranked, positive in zip(top_items, positives):
        batch_dcg += ndcg_at_ip(ranked, positive, cutoff)
    return batch_dcg

In [None]:
from torch.utils.data import IterableDataset
from torch.utils.data import DataLoader
import random
import pandas as pd
import numpy as np
import os

# Parameters read by the dataset classes defined below in this cell.
# Number of negative item ids drawn per sequence position; it is the size of
# the last axis of the "negatives" array produced by NumpyDataset.
negative_sample_size = 1
# Exclusive upper bound of the randomly drawn negative item ids
# (NumpyDataset samples them with np.random.randint(1, num_item, ...)).
num_item = 364846 + 2
class JsonlDataset(IterableDataset):
    """
    Streams whole JSON-lines files as row-shuffled NumPy object arrays.

    Each file is read with ``pd.read_json(..., lines=True)``; every entry of
    its ``events`` column (a list of dicts with keys ``ts``, ``type`` and
    ``aid``) is flattened to a list of ``[ts, type, aid]`` triples.

    Iteration protocol (unchanged): ``__iter__`` returns the dataset itself and
    ``__next__`` returns a *generator* over the files, so the per-file arrays
    are consumed with ``for chunk in next(iter(dataset))``.
    NOTE(review): because ``__next__`` never raises StopIteration, looping over
    the dataset object directly (``for x in dataset``) does not terminate.

    Args:
        filepaths: iterable of paths to ``.jsonl`` files.
    """

    def __init__(self, filepaths):
        # Shuffle a private copy so the caller's list is not reordered in place.
        self.filepaths = list(filepaths)
        random.shuffle(self.filepaths)

    def __iter__(self):
        return self

    def __next__(self):
        return self._iter_files()

    def _iter_files(self):
        """Yield one shuffled ``(rows, columns)`` object array per file."""
        for file in self.filepaths:
            json_df = pd.read_json(file, lines=True)
            json_df['events'] = json_df['events'].apply(
                lambda events: [[event['ts'], event['type'], event['aid']] for event in events])
            numdata = json_df.to_numpy(copy=True)
            # random.shuffle swaps rows through NumPy views, which duplicates
            # some rows and drops others on a 2-D array; np.random.shuffle
            # permutes along the first axis correctly.
            np.random.shuffle(numdata)
            yield numdata

class NumpyDataset(IterableDataset):
    """
    Streams mini-batches from ``.npy`` files together with random negatives.

    Every yielded batch is a dict with
      * ``"items"``: a slice of the (row-shuffled) array stored in one file, and
      * ``"negatives"``: random integer ids in ``[1, num_item)`` of shape
        ``(len(batch), len(batch[0][0]), negative_sample_size)``.
    ``num_item`` and ``negative_sample_size`` are module-level settings.

    Iteration protocol (unchanged): ``__iter__`` returns the dataset itself and
    ``__next__`` returns a *generator* over all batches of all files, so
    batches are consumed with ``for batch in next(iter(dataset))``.
    NOTE(review): because ``__next__`` never raises StopIteration, looping over
    the dataset object directly (``for x in dataset``) does not terminate.

    Args:
        filepaths: iterable of paths to ``.npy`` files.
        batch_size: target number of rows per batch. Files are cut into
            ``len(data) // batch_size`` near-equal parts, so a batch can hold
            more than ``batch_size`` rows.
    """

    def __init__(self, filepaths, batch_size):
        # Shuffle a private copy so the caller's list is not reordered in place.
        self.filepaths = list(filepaths)
        self.batch_size = batch_size
        random.shuffle(self.filepaths)

    def __iter__(self):
        return self

    def __next__(self):
        return self._iter_batches()

    def _iter_batches(self):
        """Yield the batch dicts of every file, one file after another."""
        for file in self.filepaths:
            data = np.load(file)
            if len(data) == 0:
                # Nothing to batch; array_split would raise on zero sections.
                continue
            # random.shuffle swaps rows through NumPy views, which duplicates
            # some rows and drops others on a multi-dimensional array;
            # np.random.shuffle permutes along the first axis correctly.
            np.random.shuffle(data)
            # At least one section, so files smaller than batch_size still
            # produce a (single, short) batch instead of a ValueError.
            num_sections = max(1, len(data) // self.batch_size)
            for batch in np.array_split(data, num_sections):
                negatives = np.random.randint(1, num_item,
                                              size=(len(batch),
                                                    len(batch[0][0]),
                                                    negative_sample_size))
                yield {"items": batch, "negatives": negatives}



In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import io
import math
import torch
from typing import Optional, Any, Union, Callable
import torch.nn as nn
from torch.nn.modules.normalization import LayerNorm

from torch import Tensor
import torch.nn.functional as F



# define the transformer model
class TimeSeriesTransformer(nn.Module):
    """
    Transformer encoder over sequences of (item id, event type) pairs.

    Every position is embedded as ``embedding_aid(aid) * embedding_type(type)``
    plus a learned position embedding and passed through an
    ``nn.TransformerEncoder`` (``batch_first=True``). The encoder output is
    used in two ways:

    * as a query vector whose dot product with rows of ``embedding_aid.weight``
      scores items, and
    * as input to ``fc_type``, a linear head with 3 outputs followed by a
      softmax over those 3 outputs.

    Id ``0`` is the padding index of both embeddings. Column ``c`` of the type
    head stands for type id ``c + 1`` (``train_step`` drops one-hot class 0).

    args:
        num_item: the item embedding table gets ``num_item + 1`` rows.
        output_size: accepted for interface compatibility; not used.
        hidden_size: embedding / model width.
        num_layers: number of encoder layers.
        num_heads: attention heads per layer.
        dim_feedforward_encoder: width of the encoder feed-forward sublayer.
        dropout: dropout probability inside the encoder layers.
        max_length: number of rows in the position embedding table.
        layer_norm_eps: accepted for interface compatibility; not used.
        device, dtype: forwarded to every sub-module constructor.

    shapes:
        Inputs are assumed to be integer tensors of shape (batch, seq) -
        TODO confirm against the data files.
    """
    def __init__(self,
                 num_item,
                 output_size,
                 hidden_size,
                 num_layers=4,
                 num_heads=4,
                 dim_feedforward_encoder=256,
                 dropout=0.2,
                 max_length=500,
                 layer_norm_eps: float = 1e-5,
                 device = None,
                 dtype = None
                 ):
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.nhead = num_heads

        self.embedding_aid = nn.Embedding(num_embeddings=num_item + 1, embedding_dim=hidden_size, padding_idx=0, **factory_kwargs)
        self.embedding_type = nn.Embedding(num_embeddings=4, embedding_dim=hidden_size, padding_idx=0, **factory_kwargs)
        self.pos_embedding = nn.Embedding(num_embeddings=max_length, embedding_dim=hidden_size, **factory_kwargs)
        self.device = device
        encoderlayer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=num_heads,
            dim_feedforward=dim_feedforward_encoder,
            dropout=dropout,
            batch_first=True,
            **factory_kwargs
            )
        encoder_norm = LayerNorm(hidden_size, **factory_kwargs)
        self.encoder = nn.TransformerEncoder(encoder_layer=encoderlayer, num_layers=num_layers, norm=encoder_norm)

        self.fc_type = nn.Linear(hidden_size, 3, **factory_kwargs)

    @staticmethod
    def _match_padding_mask(padding_mask: Tensor, src_mask: Optional[Tensor]) -> Tensor:
        """Express a boolean padding mask in the same form as ``src_mask``."""
        if padding_mask.dtype != torch.bool:
            # Already an additive mask supplied by the caller; use it as is.
            return padding_mask
        if src_mask is None or not src_mask.is_floating_point():
            return padding_mask
        # Float masks are *added* to the attention logits, so a 0/1 cast would
        # not hide anything. A large finite negative is used instead of -inf so
        # that a query row whose visible keys are all padding still yields a
        # finite softmax rather than NaN.
        additive = torch.zeros(padding_mask.shape, dtype=src_mask.dtype, device=padding_mask.device)
        return additive.masked_fill(padding_mask, torch.finfo(src_mask.dtype).min)

    def forward(self, src_aid: Tensor, src_type: Tensor,
                src_mask: Tensor,
                src_key_padding_mask: Optional[Tensor] = None):
        """
        Encode a batch of event sequences.

        Args:
            src_aid: item ids; positions equal to 0 are treated as padding.
            src_type: event type ids aligned with ``src_aid``.
            src_mask: attention mask handed to the encoder (may be None).
            src_key_padding_mask: optional padding mask; when omitted it is
                derived from ``src_aid == 0``.

        Returns:
            ``(aid_type_embedding, output_type)``: the encoder output and the
            softmax over the 3 outputs of the type head for every position.
        """
        if src_key_padding_mask is None:
            src_key_padding_mask = (src_aid == 0)
        src_padding_mask = self._match_padding_mask(src_key_padding_mask, src_mask)

        src_aid_embedding = self.embedding_aid(src_aid)
        src_type_embedding = self.embedding_type(src_type)
        src = src_aid_embedding * src_type_embedding

        # Created on the inputs' device instead of relying on a notebook-global
        # `device` variable.
        position_ids = torch.arange(src_aid.shape[-1], device=src_aid.device).unsqueeze(0)
        src = src + self.pos_embedding(position_ids)

        aid_type_embedding = self.encoder(src, mask=src_mask, src_key_padding_mask=src_padding_mask)

        # Normalise over the class axis (last), not over the sequence axis.
        output_type = F.softmax(self.fc_type(aid_type_embedding), dim=-1)

        return aid_type_embedding, output_type


    def predict(self, src_aid: Tensor, src_type: Tensor,
                src_mask: Optional[Tensor] = None,
                src_key_padding_mask: Optional[Tensor] = None):
        """
        Score every item and every type from the last sequence position.

        Returns:
            ``(aid_scores, type_probs)``: ``aid_scores`` has one column per row
            of ``embedding_aid.weight``; ``type_probs`` has 3 columns.
        """
        aid_type_embedding, type_probs = self(src_aid, src_type, src_mask=src_mask,
                                              src_key_padding_mask=src_key_padding_mask)

        last_event_embeddings = aid_type_embedding[:, -1, :]
        aid_scores = torch.matmul(last_event_embeddings, self.embedding_aid.weight.t())
        type_probs = type_probs[:, -1, :]

        return aid_scores, type_probs

    def train_step(self, src_aid: Tensor, src_type: Tensor,
                    negative_samples: Tensor,
                    src_mask: Optional[Tensor] = None):
        """
        Compute the training loss for one batch.

        The output at position ``t`` is trained against the event at ``t + 1``:
        a binary log-loss on the dot product with the positive item embedding
        and with one sampled negative, plus a cross-entropy on the event type.

        Args:
            negative_samples: sampled negative item ids; after dropping the
                last sequence position it is flattened to one id per target
                position (assumes one negative per position - TODO confirm).

        Returns:
            ``{'loss': loss}`` with a scalar tensor.
        """
        aid_type_embedding, output_type = self(src_aid, src_type, src_mask)

        # one_hot needs int64 indices.
        target_type = src_type[:, 1:].reshape(-1).long()
        target_type = torch.nn.functional.one_hot(target_type, num_classes=4)
        target_type = target_type[:, 1:]# remove the 0 class

        output_type = output_type[:, :-1, :]#during training, remove the predicted last token
        output_type = output_type.reshape(-1, output_type.size(2))
        aid_type_embedding = aid_type_embedding[:, :-1, :]#during training, remove the attended last token
        aid_type_embedding = aid_type_embedding.reshape(-1, aid_type_embedding.size(2))  # (batch size * sequence_length, embedding dimension)

        positives = src_aid[:, 1:]
        positives_embeddings = self.embedding_aid(positives)# (batch size, sequence_length, hidden_size)
        positives_embeddings = positives_embeddings.reshape(-1, positives_embeddings.size(2))
        negative_samples = negative_samples[:, :-1]
        negatives = negative_samples.reshape(-1)  # (batch size * sequence_length)
        negatives_embeddings = self.embedding_aid(negatives)# (batch size * sequence_length, embedding dimension)
        positives_scores = (positives_embeddings * aid_type_embedding).sum(1)  # (batch size * sequence_length)
        negatives_scores = (negatives_embeddings * aid_type_embedding).sum(1)  # (batch size * sequence_length)

        # 1 for real targets, 0 where the target position is padding.
        is_target = (positives != 0).float().reshape(-1)  # (batch size * sequence_length)
        aid_loss = (- torch.log(torch.sigmoid(positives_scores) + 1e-24) * is_target
                    - torch.log(1 - torch.sigmoid(negatives_scores) + 1e-24) * is_target)
        # Same epsilon as above: without it a zero probability gives
        # 0 * -inf = NaN for the classes that are not the target.
        type_loss = - target_type * torch.log(output_type + 1e-24)
        # Clamp so an all-padding batch does not divide by zero.
        normaliser = (is_target.sum() * 2).clamp(min=1.0)
        loss = (aid_loss.sum() + type_loss.sum()) / normaliser

        return {'loss': loss}

    def validation_step(self, src_aid: Tensor, src_type: Tensor,
                   src_mask: Optional[Tensor] = None):
        """
        Evaluate next-event prediction on one batch.

        The last event of every sequence is held out as the target and the
        model only sees the events before it, mirroring ``train_step`` where
        position ``t`` predicts event ``t + 1``.

        Returns:
            ``(dcg, correct)``: the summed DCG from ``ndcg_batch`` and the
            number of sequences whose held-out event type was predicted.
        """
        # Trim the mask to the shortened input; works for 2-D and 3-D masks.
        context_mask = src_mask[..., :-1, :-1] if src_mask is not None else None
        with torch.no_grad():
            aid_scores, type_probs = self.predict(src_aid[:, :-1], src_type[:, :-1], src_mask=context_mask)
        aid_positive = src_aid[:, -1] #(batch size )

        dcg = ndcg_batch(aid_scores, aid_positive)

        # Column c of the type head stands for type id c + 1 (class 0 is
        # dropped in train_step), so shift before comparing with the raw ids.
        type_prediction = torch.argmax(type_probs, dim=1) + 1
        type_positive = src_type[:, -1] #(batch size )
        # Calculate the number of correctly predicted samples
        correct = (type_prediction == type_positive).sum()

        return dcg, correct

    def test_step(self, src_aid: Tensor, src_type: Tensor,
                        negative_samples: Tensor,
                        src_mask: Optional[Tensor] = None):
        """Run ``predict`` on the batch; placeholder that returns None."""
        aid_scores, type_probs = self.predict(src_aid, src_type, src_mask)

        return None


In [None]:
import torch
import torch.nn as nn
import numpy as np
import time
import os
import gc

# Define the hyperparameters
input_size = 364846 + 1
output_size = 364846 + 1
num_heads = 4
hidden_size = 32 * num_heads
num_layers = 2
dropout = 0.2
max_seq_length = 500 + 1
batch_size = 128
lr = 0.001
num_epochs = 5

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Create the model and the optimizer
model = TimeSeriesTransformer(num_item=input_size,
                              output_size=output_size,
                              hidden_size=hidden_size,
                              num_heads=num_heads,
                              num_layers=num_layers,
                              dropout=dropout,
                              max_length=max_seq_length,
                              device=device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Resume from a checkpoint when one is present.
# NOTE(review): this file name has a trailing underscore, whereas the files
# written below are named checkpoint_{epoch}.pt - confirm which one is meant.
checkpoint_file = 'checkpoint_0_.pt'
if os.path.exists(checkpoint_file):
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
    checkpoint = torch.load(checkpoint_file, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
else:
    print(f"checkpoint {checkpoint_file} not found, training from scratch")
    model.to(device)

# Not used by the loops below (the model computes its own loss in train_step).
criterion = nn.CrossEntropyLoss()
start_time = time.time()
total_loss = 0

train_filepaths = ['train_seq_5.npy','train_seq_10.npy', 'train_seq_15.npy', 'train_seq_20.npy', 'train_seq_25.npy',
                  'train_seq_30.npy', 'train_seq_35.npy', 'train_seq_40.npy', 'train_seq_50.npy', 'train_seq_100.npy', 'train_seq_150.npy']
validation_files=['validation_seq_5.npy','validation_seq_10.npy', 'validation_seq_15.npy', 'validation_seq_20.npy', 'validation_seq_25.npy',
                  'validation_seq_30.npy', 'validation_seq_35.npy', 'validation_seq_40.npy', 'validation_seq_50.npy', 'validation_seq_100.npy', 'validation_seq_150.npy']


def _batch_to_inputs(batch):
    """Turn one NumpyDataset batch into (aids, types, causal mask) tensors on `device`.

    Each entry of ``batch['items']`` is indexed as ``[0]`` for the item ids and
    ``[1]`` for the event types.
    """
    event_sequences = batch['items']
    aids = torch.from_numpy(np.array([event_sequence[0] for event_sequence in event_sequences])).to(device)
    types = torch.from_numpy(np.array([event_sequence[1] for event_sequence in event_sequences])).to(device)
    causal_mask = generate_encoder_input_mask(len(aids[0]), len(event_sequences), num_heads).to(device)
    return aids, types, causal_mask


# NOTE(review): the model is never switched to eval mode, so dropout stays
# active during the validation pass below - confirm whether model.eval() /
# model.train() calls are wanted here.
for epoch in range(num_epochs):
    # ---- training pass ----
    train_dataset = NumpyDataset(train_filepaths, batch_size)
    # The dataset hands out a generator over all batches via next(iter(...)).
    batches = next(iter(train_dataset))
    total_loss = 0.0
    num_batch = 0
    for batch in batches:
        event_sequences = batch['items']
        if len(event_sequences) == 1:
            # NOTE(review): this stops the whole epoch at the first single-row
            # batch, skipping every remaining file - confirm that `continue`
            # was not intended.
            print("break")
            break
        negative_samples = torch.from_numpy(batch['negatives']).to(device)
        aids_input, types_input, mask = _batch_to_inputs(batch)

        optimizer.zero_grad()
        Loss = model.train_step(aids_input, types_input, negative_samples, mask)
        loss = Loss['loss']
        loss.backward()
        optimizer.step()

        # .item() detaches the value; summing the tensors themselves would keep
        # every batch's autograd graph reachable for the whole run.
        total_loss += loss.item()
        num_batch += 1
    print(f"number of train batches: {num_batch}")
    if num_batch:
        print(f"mean train loss: {total_loss / num_batch}")
    del train_dataset
    gc.collect()

    # Save a checkpoint at the end of every epoch
    checkpoint_path = f'checkpoint_{epoch}.pt'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)

    # ---- validation pass ----
    validation_dataset = NumpyDataset(validation_files, batch_size)
    batches = next(iter(validation_dataset))
    total_correct = 0
    total_dcg = 0
    total_seq = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in batches:
            aids_input, types_input, mask = _batch_to_inputs(batch)
            dcg, correct = model.validation_step(aids_input, types_input, mask)
            total_dcg += dcg
            # int() turns the 0-d count tensor into a plain number.
            total_correct += int(correct)
            total_seq += len(batch['items'])

    # Guard against an empty validation set.
    ndcg = total_dcg / total_seq if total_seq else 0.0
    accuracy = total_correct / total_seq if total_seq else 0.0
    del validation_dataset
    gc.collect()
    print(f"ndcg: {ndcg}")
    print(f"accuracy: {accuracy}")
    # Seconds elapsed since start_time (cumulative over epochs).
    time_duration = time.time() - start_time
    print(time_duration)
