In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import T5Config, T5ForConditionalGeneration, PreTrainedTokenizerBase, AdamW, AddedToken
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import train_test_split
import os
from typing import Dict
import json
from tqdm import tqdm

In [None]:
input_file = "/content/seq2seq_data1.xlsx"

In [None]:
def read_data_from_excel(file_path, separator='|'):
    df = pd.read_excel(file_path)
    if 'EVENTS_SEQ' not in df.columns:
        raise ValueError("Excel file must contain 'EVENTS_SEQ' column.")
    df.reset_index(drop=False, inplace=True)
    df.rename(columns={'index': 'User ID', 'EVENTS_SEQ': 'Sequence of events'}, inplace=True)
    df['Sequence of events'] = df['Sequence of events'].astype(str)
    df['Sequence of events'] = df['Sequence of events'].apply(lambda x: x.split(separator))
    df = df[df['Sequence of events'].apply(len) > 10]
    df = df[['User ID', 'Sequence of events']]
    return df

In [None]:
dataframe = read_data_from_excel(input_file, separator='|')
print(dataframe.head(10))
print(len(dataframe))

     User ID                                 Sequence of events
1          1  [first_visit, session_start, open_chat, open_c...
2          2  [first_visit, session_start, open_chat, open_c...
3          3  [first_visit, session_start, open_chat, open_c...
4          4  [first_visit, session_start, open_chat, open_c...
7          7  [first_visit, session_start, open_chat, close_...
9          9  [first_visit, session_start, open_chat, page_v...
18        18  [first_visit, session_start, open_chat, view_i...
446      446  [first_visit, session_start, page_view, view_i...
454      454  [first_visit, session_start, at_visibility, ge...
467      467  [first_visit, session_start, at_visibility, pa...
2233


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [None]:
print(df['Sequence of events'][4])

['first_visit', 'session_start', 'open_chat', 'open_chat', 'open_chat', 'open_chat', 'open_chat', 'page_view', 'page_view', 'proactive_message_impression', 'proactive_message_impression']


In [None]:
class GlobalDictionary:
    def __init__(self):
        self.event_to_index = {
            "START": 0,
            "END": 1,
            "PAD": 2,
            "UNK": 3,
        }
        self.index_to_event = {
            0: "START",
            1: "END",
            2: "PAD",
            3: "UNK",
        }
        self.counter = 4
        self.fixed_vocab = False
    def update_dictionary(self, sequence):
        for event in sequence:
            if not self.fixed_vocab and event not in self.event_to_index:
                self.event_to_index[event] = self.counter
                self.index_to_event[self.counter] = event
                self.counter += 1
    def convert_sequence_to_indices(self, sequence):
        indices = [self.event_to_index.get(event, self.event_to_index["UNK"]) for event in sequence]
        return indices
    def fix_vocab(self):
        self.fixed_vocab = True

In [None]:
class EventSequenceProcessor:
    def __init__(self, separator='|'):
        self.global_dict = GlobalDictionary()
        self.dataframe = pd.DataFrame(columns=["User ID", "Sequence of events"])
        self.separator = separator
        self.max_length = 0
    def add_data(self, new_data):
        new_df = pd.DataFrame(new_data)
        for sequence in new_df['Sequence of events']:
            self.global_dict.update_dictionary(sequence)
        new_df['Sequence of events'] = new_df['Sequence of events'].apply(lambda x: self.global_dict.convert_sequence_to_indices(x))
        self.dataframe = pd.concat([self.dataframe, new_df], ignore_index=True)
        self.max_length = max(self.max_length, max(new_df['Sequence of events'].apply(len)))
    def fix_vocabulary(self):
        self.global_dict.fix_vocab()
    def get_dataloader(self, batch_size=16):
        dataset = EventSequenceDataset(self.dataframe, self.global_dict)
        vocab_size = len(self.global_dict.event_to_index)
        max_length = self.max_length
        def collate_fn(batch):
            sequences, targets = zip(*batch)
            # sequences = [list(map(int, seq)) for seq in sequences]
            # targets = [list(map(int, tgt)) for tgt in targets]
            # padded_sequences = nn.utils.rnn.pad_sequence([torch.tensor(seq) for seq in sequences], batch_first=True, padding_value=self.global_dict.event_to_index["PAD"])
            # padded_targets = nn.utils.rnn.pad_sequence([torch.tensor(tgt) for tgt in targets], batch_first=True, padding_value=self.global_dict.event_to_index["PAD"])
            sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
            targets = [torch.tensor(tgt, dtype=torch.long) for tgt in targets]
            padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=self.global_dict.event_to_index["PAD"])
            padded_targets = nn.utils.rnn.pad_sequence(targets, batch_first=True, padding_value=self.global_dict.event_to_index["PAD"])
            return padded_sequences, padded_targets
        return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn), vocab_size, max_length
    def get_dictionary(self):
        return self.global_dict.event_to_index
    def get_index_to_event_mapping(self):
        return self.global_dict.index_to_event

In [None]:
class EventSequenceDataset(Dataset):
    def __init__(self, dataframe, global_dict):
        self.dataframe = dataframe
        self.global_dict = global_dict

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        sequence = self.dataframe.iloc[idx, 1]
        input_ids = torch.tensor(self.global_dict.convert_sequence_to_indices(sequence[:-1]), dtype=torch.long)
        target_ids = torch.tensor(self.global_dict.convert_sequence_to_indices(sequence[1:]), dtype=torch.long)
        return input_ids, target_ids

In [None]:
class CustomTokenizer(PreTrainedTokenizerBase):
    def __init__(self, global_dict):
        super().__init__()
        self.global_dict = global_dict
    def _tokenize(self, text):
        tokens = text.split(' ')
        return tokens
    def _convert_token_to_id(self, token):
        return self.global_dict.event_to_index.get(token, self.global_dict.event_to_index["UNK"])
    def _convert_id_to_token(self, index):
        return self.global_dict.index_to_event.get(index, "UNK")
    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)
    def save_vocabulary(self, save_directory, filename_prefix=None):
        vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + 'vocab.json')
        with open(vocab_file, 'w') as f:
            json.dump(self.global_dict.event_to_index, f, indent=2)
        return (vocab_file,)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        return token_ids_0
    def get_vocab(self):
        return self.global_dict.event_to_index
    @property
    def added_tokens_decoder(self):
        return {index: AddedToken(token, lstrip=False, rstrip=False) for token, index in self.get_vocab().items()}
    def convert_tokens_to_ids(self, tokens):
        return [self._convert_token_to_id(token) for token in tokens]

In [None]:
class CustomT5Model(nn.Module):
    def __init__(self, vocab_size, hidden_size=512, num_layers=4, num_heads=4):
        super(CustomT5Model, self).__init__()
        config = T5Config(
            vocab_size=vocab_size,
            d_model=hidden_size,
            num_layers=num_layers,
            num_heads=num_heads,
            d_ff=hidden_size * 4,
            feed_forward_proj='relu',
            is_encoder_decoder=True,
            decoder_start_token_id=0,
            eos_token_id=1,
            # bos_token_id=0,
            pad_token_id=2
        )
        self.model = T5ForConditionalGeneration(config).to(device)
        self.model.config.decoder_start_token_id = config.pad_token_id
    def forward(self, input_ids, attention_mask=None, labels=None):
        input_ids = input_ids.long()
        if labels is not None:
            labels = labels.long()
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits
    def save_pretrained(self, save_directory):
        self.model.save_pretrained(save_directory)

In [None]:
from tqdm import tqdm
import random

def train_and_validate_model(model, train_dataloader, val_dataloader, vocab_size, index_to_event, epochs=8, device=None, teacher_forcing_ratio=0.5):
    optimizer = AdamW(model.parameters(), lr=5e-5)
    scaler = GradScaler()
    criterion = nn.CrossEntropyLoss(ignore_index=2)
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    for epoch in range(epochs):
        model.train()
        train_loss_total = 0
        train_steps = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Training", leave=False)
        for sequences, targets in train_progress_bar:
            sequences, targets = sequences.to(device), targets.to(device)
            optimizer.zero_grad()
            with autocast():
                teacher_force = random.random() < teacher_forcing_ratio
                if teacher_force:
                    teacher_forcing_input = targets[:, :-1].clone().detach()
                    teacher_forcing_input = torch.cat([torch.full((sequences.size(0), 1), vocab_size, dtype=torch.long).to(device), teacher_forcing_input], dim=1)
                    decoder_input = teacher_forcing_input
                else:
                    decoder_input = sequences
                loss, logits = model(input_ids=sequences, labels=targets)
                active_loss = targets.view(-1) != 2
                logits = logits[:, :targets.size(1), :]
                logits = logits.view(-1, logits.size(-1))
                targets = targets.view(-1)
                active_loss = active_loss[:logits.size(0)]
                active_logits = logits[active_loss]
                active_labels = targets[active_loss]
                loss = criterion(active_logits, active_labels)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss_total += loss.item()
            train_steps += 1
            train_progress_bar.set_postfix(loss=loss.item())
        avg_train_loss = train_loss_total / train_steps
        print(f"Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_train_loss:.4f}")

        model.eval()
        val_loss_total = 0
        val_steps = 0
        val_progress_bar = tqdm(val_dataloader, desc=f"Epoch {epoch + 1}/{epochs} - Validation", leave=False)
        with torch.no_grad():
            for sequences, targets in val_progress_bar:
                sequences, targets = sequences.to(device), targets.to(device)
                with autocast():
                    loss, logits = model(input_ids=sequences, labels=targets)
                    active_loss = targets.view(-1) != 2
                    logits = logits[:, :targets.size(1), :]
                    logits = logits.view(-1, logits.size(-1))
                    targets = targets.view(-1)
                    active_loss = active_loss[:logits.size(0)]
                    active_logits = logits[active_loss]
                    active_labels = targets[active_loss]
                    val_loss = criterion(active_logits, active_labels)
                val_loss_total += val_loss.item()
                val_steps += 1
                val_progress_bar.set_postfix(loss=val_loss.item())
        avg_val_loss = val_loss_total / val_steps
        print(f"Epoch {epoch + 1}/{epochs}, Average Validation Loss: {avg_val_loss:.4f}")

In [None]:
processor = EventSequenceProcessor()
processor.add_data(dataframe)
processor.fix_vocabulary()

In [None]:
train_df, val_df = train_test_split(processor.dataframe, test_size=0.2, random_state=42)
train_processor = EventSequenceProcessor()
val_processor = EventSequenceProcessor()
train_processor.add_data(train_df)
val_processor.add_data(val_df)
train_dataloader, _, _ = train_processor.get_dataloader()
val_dataloader, _, _ = val_processor.get_dataloader()

In [None]:
custom_tokenizer = CustomTokenizer(processor.global_dict)
vocab_size = len(custom_tokenizer.get_vocab())
index_to_event = processor.get_index_to_event_mapping()

In [None]:
model = CustomT5Model(vocab_size)
train_and_validate_model(model, train_dataloader, val_dataloader, vocab_size, index_to_event, epochs=8)

  sequences = [torch.tensor(seq, dtype=torch.long) for seq in sequences]
  targets = [torch.tensor(tgt, dtype=torch.long) for tgt in targets]


Epoch 1/8, Average Training Loss: 2.5963




Epoch 1/8, Average Validation Loss: 5.2365




Epoch 2/8, Average Training Loss: 1.8341




Epoch 2/8, Average Validation Loss: 5.0308




Epoch 3/8, Average Training Loss: 1.6673




Epoch 3/8, Average Validation Loss: 5.1338




Epoch 4/8, Average Training Loss: 1.5693




Epoch 4/8, Average Validation Loss: 4.9291




Epoch 5/8, Average Training Loss: 1.5289




Epoch 5/8, Average Validation Loss: 4.9388




Epoch 6/8, Average Training Loss: 1.4562




Epoch 6/8, Average Validation Loss: 5.0033




Epoch 7/8, Average Training Loss: 1.4122




Epoch 7/8, Average Validation Loss: 5.1679




Epoch 8/8, Average Training Loss: 1.3785


                                                                                  

Epoch 8/8, Average Validation Loss: 4.9716




In [None]:
model_path = "/content/custom_t5_model"
model.save_pretrained(model_path)
custom_tokenizer.save_vocabulary(model_path)
print(f"Model saved to {model_path}")

Model saved to /content/custom_t5_model


In [None]:
def generate_sequence(model, tokenizer, input_sequence, prediction_steps=3, num_beams=5):
    input_ids = torch.tensor([tokenizer.global_dict.convert_sequence_to_indices(input_sequence)]).to(device)
    model.eval()
    generated_sequence = input_sequence.copy()

    for _ in range(prediction_steps):
        with torch.no_grad():
            output_ids = model.model.generate(
                input_ids=input_ids,
                max_length=input_ids.size(1) + 1,
                num_beams=num_beams,
                pad_token_id=tokenizer.global_dict.event_to_index["PAD"],
                decoder_start_token_id=tokenizer.global_dict.event_to_index["START"],
                early_stopping=True
            )

        # Get the next predicted token (excluding the input sequence)
        next_token_id = output_ids[0, input_ids.size(1):].item()
        next_token = tokenizer._convert_id_to_token(next_token_id)

        # Append the predicted token to the generated sequence
        generated_sequence.append(next_token)

        # Update the input_ids for the next prediction
        input_ids = torch.tensor([tokenizer.global_dict.convert_sequence_to_indices(generated_sequence)]).to(device)

    return generated_sequence


In [None]:
# Example usage
input_sequence = df['Sequence of events'][454][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][454])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, num_beams=5)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click']
Original Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click', 'general_tab_click', 'page_view', 'user_engagement']
Generated Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click', 'open_chat', 'open_chat', 'open_chat']


In [None]:
# Example usage
input_sequence = df['Sequence of events'][467][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][467])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, num_beams=5)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'at_visibility', 'page_view', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click']
Original Sequence: ['first_visit', 'session_start', 'at_visibility', 'page_view', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'user_engagement', 'page_view', 'banner_button_click', 'banner_button_click', 'user_engagement', 'page_view', 'at_visibility', 'general_tab_click', 'fnb_menu_link_click', 'user_engagement', 'page_view', 'general_link_click', 'page_view', 'view_item_list', 'select_item', 'user_engagement', 'product_learn_more', 'page_view', 'session_start', 'user_engagement', 'session_start', 'page_view', 'general_button_click', 'gnb_menu_link_click', 'user_engagement', 'page_view', 'view_item_list', 'general_link_click', 'page_view', 'vie

In [None]:
# Example usage
input_sequence = df['Sequence of events'][446][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][446])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, num_beams=5)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item']
Original Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item', 'page_view', 'proactive_message_impression', 'view_item', 'page_view']
Generated Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item', 'product_review_visibility', 'product_review_visibility', 'product_review_visibility']


In [None]:
def generate_sequence(model, tokenizer, input_sequence, prediction_steps=3, top_k=50, temperature=1.0):
    input_ids = torch.tensor([tokenizer.global_dict.convert_sequence_to_indices(input_sequence)]).to(device)
    model.eval()
    generated_sequence = input_sequence.copy()

    for _ in range(prediction_steps):
        with torch.no_grad():
            output_ids = model.model.generate(
                input_ids=input_ids,
                max_length=input_ids.size(1) + 1,
                do_sample=True,
                top_k=top_k,
                temperature=temperature,
                pad_token_id=tokenizer.global_dict.event_to_index["PAD"],
                decoder_start_token_id=tokenizer.global_dict.event_to_index["START"],
                early_stopping=True
            )

        # Get the next predicted token (excluding the input sequence)
        next_token_id = output_ids[0, input_ids.size(1):].item()
        next_token = tokenizer._convert_id_to_token(next_token_id)

        # Append the predicted token to the generated sequence
        generated_sequence.append(next_token)

        # Update the input_ids for the next prediction
        input_ids = torch.tensor([tokenizer.global_dict.convert_sequence_to_indices(generated_sequence)]).to(device)

    return generated_sequence

In [None]:
# Example usage
input_sequence = df['Sequence of events'][446][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][446])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, top_k=50, temperature=1.0)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item']
Original Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item', 'page_view', 'proactive_message_impression', 'view_item', 'page_view']
Generated Sequence: ['first_visit', 'session_start', 'page_view', 'view_item', 'page_view', 'proactive_message_impression', 'session_start', 'view_item', 'product_review_visibility', 'product_review_visibility', 'session_start']


In [None]:
# Example usage
input_sequence = df['Sequence of events'][454][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][454])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, top_k=50, temperature=1.0)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click']
Original Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click', 'general_tab_click', 'page_view', 'user_engagement']
Generated Sequence: ['first_visit', 'session_start', 'at_visibility', 'general_tab_click', 'general_tab_click', 'open_chat', 'general_tab_click', 'general_tab_click', 'open_chat', 'open_chat', 'open_chat']


In [None]:
# Example usage
input_sequence = df['Sequence of events'][467][:8]  # Original sequence of length n-3
print("Input Sequence:", input_sequence)
print("Original Sequence:",df['Sequence of events'][467][:11])
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, prediction_steps=3, top_k=50, temperature=1.0)
print("Generated Sequence:", output_sequence)

Input Sequence: ['first_visit', 'session_start', 'at_visibility', 'page_view', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click']
Original Sequence: ['first_visit', 'session_start', 'at_visibility', 'page_view', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click']
Generated Sequence: ['first_visit', 'session_start', 'at_visibility', 'page_view', 'general_button_click', 'general_button_click', 'general_button_click', 'general_button_click', 'page_view', 'page_view', 'page_view']


In [None]:

df['Sequence of events'][467]

NameError: name 'df' is not defined

In [None]:
def generate_sequence(model, tokenizer, input_sequence, max_length=10, num_beams=5):
    input_ids = torch.tensor([tokenizer.global_dict.convert_sequence_to_indices(input_sequence)]).to(device)
    model.eval()
    with torch.no_grad():
        output_ids = model.model.generate(
            input_ids=input_ids,
            max_length=max_length,
            num_beams=num_beams,
            pad_token_id=tokenizer.global_dict.event_to_index["PAD"],
            decoder_start_token_id=tokenizer.global_dict.event_to_index["START"],
            early_stopping=True
        )
    output_sequence = [tokenizer._convert_id_to_token(id.item()) for id in output_ids[0]]
    return output_sequence

In [None]:
input_sequence = dataframe['Sequence of events'][5][:8]
print("Original Input Sequence:", input_sequence)
output_sequence = generate_sequence(model, custom_tokenizer, input_sequence, max_length=10, num_beams=5)
print("Generated Sequence:", output_sequence)

Original Input Sequence: ['first_visit', 'session_start', 'open_chat', 'open_chat', 'open_chat', 'page_view', 'proactive_message_impression']
Generated Sequence: ['START', 'session_start', 'move_to_whatsapp', 'open_chat', 'open_chat', 'open_chat', 'open_chat', 'open_chat', 'open_chat', 'open_chat']
