In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertForMaskedLM, BertConfig, AdamW


In [None]:
class GlobalDictionary:
    """Bidirectional vocabulary between event strings and integer indices.

    Indices 0-4 are reserved for the special tokens START, END, PAD, UNK and
    MASK. Every other event receives the next free index the first time
    ``update_dictionary`` sees it, until the vocabulary is frozen with
    ``fix_vocab``.
    """

    # Order defines the reserved indices (START=0, END=1, PAD=2, UNK=3, MASK=4).
    SPECIAL_TOKENS = ("START", "END", "PAD", "UNK", "MASK")

    def __init__(self):
        self.event_to_index = {token: index for index, token in enumerate(self.SPECIAL_TOKENS)}
        self.index_to_event = {index: token for token, index in self.event_to_index.items()}
        self.counter = len(self.SPECIAL_TOKENS)  # next free index
        self.fixed_vocab = False

    @staticmethod
    def _split_events(sequence, separator):
        """Split ``sequence`` on ``separator`` and drop empty tokens.

        Consecutive, leading or trailing separators make ``str.split`` emit
        empty strings; those are formatting artefacts, not events.
        """
        return [event for event in sequence.split(separator) if event]

    def update_dictionary(self, sequence, separator):
        """Register every unseen event of ``sequence`` in the vocabulary.

        Events are indexed in order of first appearance, so the resulting
        mapping is identical on every run (iterating a ``set`` of strings,
        by contrast, depends on per-process hash randomisation).
        Does nothing once the vocabulary has been frozen.

        Args:
            sequence: Events joined by ``separator``.
            separator: Delimiter between events.
        """
        if self.fixed_vocab:
            return
        for event in self._split_events(sequence, separator):
            if event not in self.event_to_index:
                self.event_to_index[event] = self.counter
                self.index_to_event[self.counter] = event
                self.counter += 1

    def convert_sequence_to_indices(self, sequence, separator):
        """Encode ``sequence`` as ``[START, <event indices...>, END]``.

        Events missing from the vocabulary are encoded as UNK.

        Returns:
            list[int]: The encoded sequence.
        """
        unknown = self.event_to_index["UNK"]
        body = [self.event_to_index.get(event, unknown)
                for event in self._split_events(sequence, separator)]
        return [self.event_to_index["START"]] + body + [self.event_to_index["END"]]

    def get_index_to_event_mapping(self):
        """Return the live index -> event dictionary (not a copy)."""
        return self.index_to_event

    def fix_vocab(self):
        """Freeze the vocabulary; later ``update_dictionary`` calls are no-ops."""
        self.fixed_vocab = True

In [None]:
class EventSequenceProcessor:
    """Collects raw event sequences, builds the vocabulary and serves batches.

    Raw sequences are strings of events joined by ``separator``. ``add_data``
    encodes them to index lists and accumulates them in ``self.dataframe``;
    ``get_dataloader`` wraps that frame in a ``DataLoader`` that pads every
    batch to the longest encoded sequence seen so far.
    """

    def __init__(self, separator=' '):
        self.global_dict = GlobalDictionary()
        # Column order matters: EventSequenceDataset reads the user id and the
        # encoded sequence by position (columns 0 and 1).
        self.dataframe = pd.DataFrame(columns=["User ID", "Sequence of events"])
        self.separator = separator
        self.max_length = 0  # longest encoded sequence, START/END included

    def add_data(self, new_data):
        """Add records, growing the vocabulary unless it has been frozen.

        Args:
            new_data: Anything ``pd.DataFrame`` accepts (e.g. a list of dicts)
                providing the columns "User ID" and "Sequence of events".
                Empty input is accepted and ignored.
        """
        new_df = pd.DataFrame(new_data)
        if new_df.empty:
            # Nothing to encode; previously this raised on the missing column.
            return
        # Defensive copy: guarantees a caller-supplied DataFrame is not modified
        # by the column assignment below.
        new_df = new_df.copy()

        for sequence in new_df['Sequence of events']:
            self.global_dict.update_dictionary(sequence, self.separator)

        new_df['Sequence of events'] = new_df['Sequence of events'].apply(
            lambda raw: self.global_dict.convert_sequence_to_indices(raw, self.separator)
        )

        self.dataframe = pd.concat([self.dataframe, new_df], ignore_index=True)

        longest_new = int(new_df['Sequence of events'].apply(len).max())
        self.max_length = max(self.max_length, longest_new)

    def fix_vocabulary(self):
        """Freeze the vocabulary so later data maps unseen events to UNK."""
        self.global_dict.fix_vocab()

    def get_dataloader(self, batch_size=2, mask_prob=0.15):
        """Build a shuffled ``DataLoader`` over all data added so far.

        Each batch is the tuple ``(user_ids, padded_sequences,
        padded_masked_sequences, attention_mask, masked_positions)`` where the
        padded tensors have width ``max_length`` and ``masked_positions`` is a
        list with one 1-D long tensor per sample.

        Returns:
            tuple: ``(dataloader, vocab_size, max_length)``.
        """
        dataset = EventSequenceDataset(self.dataframe, self.global_dict, mask_prob)
        vocab_size = len(self.global_dict.event_to_index)
        max_length = self.max_length
        pad_index = self.global_dict.event_to_index["PAD"]

        def collate_fn(batch):
            user_ids, sequences, masked_sequences, masked_positions = zip(*batch)
            user_ids = torch.tensor(user_ids)
            # as_tensor accepts both plain lists and tensors; the dataset hands
            # back masked sequences as tensors, and torch.tensor() on a tensor
            # emits a copy-construct UserWarning.
            sequences = [torch.as_tensor(seq, dtype=torch.long) for seq in sequences]
            masked_sequences = [torch.as_tensor(seq, dtype=torch.long) for seq in masked_sequences]
            padded_sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_index)
            padded_masked_sequences = nn.utils.rnn.pad_sequence(masked_sequences, batch_first=True, padding_value=pad_index)

            # pad_sequence only pads to the longest sample in this batch;
            # extend to the global maximum so every batch has the same width.
            shortfall = max_length - padded_sequences.size(1)
            if shortfall > 0:
                padded_sequences = nn.functional.pad(padded_sequences, (0, shortfall), value=pad_index)
                padded_masked_sequences = nn.functional.pad(padded_masked_sequences, (0, shortfall), value=pad_index)

            attention_mask = (padded_sequences != pad_index).long()
            # Explicit dtype: an empty position list would otherwise become a
            # float tensor, which cannot be used as an index.
            padded_masked_positions = [torch.as_tensor(mp, dtype=torch.long) for mp in masked_positions]

            return user_ids, padded_sequences, padded_masked_sequences, attention_mask, padded_masked_positions

        return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn), vocab_size, max_length

    def get_dictionary(self):
        """Return the live event -> index dictionary."""
        return self.global_dict.event_to_index

    def get_index_to_event_mapping(self):
        """Return the live index -> event dictionary."""
        return self.global_dict.get_index_to_event_mapping()

In [None]:
class EventSequenceDataset(Dataset):
    """Dataset yielding encoded sequences together with a randomly masked copy.

    Args:
        dataframe: Frame whose column 0 holds the user id and column 1 the
            encoded sequence (a list of vocabulary indices).
        global_dict: Object exposing ``event_to_index`` with at least the keys
            "START", "END", "PAD" and "MASK".
        mask_prob: Probability of masking each eligible position.
    """

    def __init__(self, dataframe, global_dict, mask_prob=0.15):
        self.dataframe = dataframe
        self.global_dict = global_dict
        self.mask_prob = mask_prob

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(user_id, sequence, masked_sequence, masked_positions)``.

        A fresh random mask is drawn on every access.
        """
        user_id = self.dataframe.iloc[idx, 0]
        sequence = self.dataframe.iloc[idx, 1]
        masked_sequence, masked_positions = self.mask_sequence(sequence)
        return user_id, sequence, masked_sequence, masked_positions

    def mask_sequence(self, sequence):
        """Replace random interior tokens with MASK.

        The first and last positions are never considered, and START, END and
        PAD tokens are skipped wherever they occur.

        Args:
            sequence: Encoded sequence (list of ints or 1-D tensor).

        Returns:
            tuple: ``(masked_sequence, masked_positions)`` where
            ``masked_sequence`` is a tensor copy of the input with MASK
            substituted and ``masked_positions`` is the list of replaced
            indices.
        """
        sequence = torch.as_tensor(sequence)
        masked_sequence = sequence.clone()

        vocab = self.global_dict.event_to_index
        protected = {vocab["START"], vocab["END"], vocab["PAD"]}
        mask_index = vocab["MASK"]

        masked_positions = []
        for position in range(1, len(sequence) - 1):
            # .item() is essential: a 0-dim tensor hashes by identity, so
            # testing the tensor itself against a set of ints never matches
            # and the special-token guard would silently do nothing.
            if sequence[position].item() in protected:
                continue
            if torch.rand(1).item() < self.mask_prob:
                masked_positions.append(position)
                masked_sequence[position] = mask_index

        return masked_sequence, masked_positions



In [None]:
class BERTMLM(nn.Module):
    """Thin wrapper around a randomly initialised ``BertForMaskedLM``.

    Args:
        vocab_size: Number of entries in the vocabulary.
        hidden_size: Transformer hidden width; the feed-forward width is set
            to four times this value.
        num_layers: Number of transformer layers.
        num_heads: Number of attention heads per layer.
        max_position_embeddings: Longest sequence the model can embed.
            Defaults to 512, the value that used to be hard-coded.
        pad_token_id: Vocabulary index of the padding token. Defaults to 2,
            the PAD index used by this notebook's vocabulary. BertConfig's
            own default is 0, which here is the START token.
    """

    def __init__(self, vocab_size, hidden_size, num_layers, num_heads,
                 max_position_embeddings=512, pad_token_id=2):
        super().__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=hidden_size * 4,
            max_position_embeddings=max_position_embeddings,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            output_hidden_states=False,
            # The word-embedding table uses this index as its padding row;
            # leaving it at the library default would treat START as padding.
            pad_token_id=pad_token_id,
        )
        self.model = BertForMaskedLM(config)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the masked-LM head.

        Args:
            input_ids: Token indices, with masked positions already replaced.
            attention_mask: 1 for real tokens, 0 for padding.
            labels: Optional targets; omit for inference.

        Returns:
            tuple: ``(loss, logits)``; ``loss`` is ``None`` when ``labels`` is
            not given.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs.loss, outputs.logits



In [None]:
# Example usage: sample text corpus. It is later split into sentences, each treated as one user's space-separated event sequence.
text = """The woman had died without pain quietly as a woman should whose life had been blameless.  Now she was resting in her bed lying on her back her eyes closed her features calm her long white hair carefully arranged as though she had done it up ten minutes before dying. The whole pale countenance of the dead woman was so collected so calm so resigned that one could feel what a sweet soul had lived in that body what a quiet existence this old soul had led how easy and pure the death of this parent had been. Kneeling beside the bed her son a magistrate with inflexible principles and her daughter Marguerite known as Sister Eulalie were weeping as though their hearts would break. She had from childhood up armed them with a strict moral code teaching them religion without weakness and duty without compromise. He the man had become a judge and handled the law as a weapon with which he smote the weak ones without pity. She the girl influenced by the virtue which had bathed her in this austere family had become the bride of the Church through her loathing for man. They had hardly known their father knowing only that he had made their mother most unhappy without being told any other details. The nun was wildly kissing the dead womans hand an ivory hand as white as the large crucifix lying across the bed. On the other side of the long body the other hand seemed still to be holding the sheet in the death grasp; and the sheet had preserved the little creases as a memory of those last movements which precede eternal immobility. A few light taps on the door caused the two sobbing heads to look up and the priest who had just come from dinner returned. He was red and out of breath from his interrupted digestion for he had made himself a strong mixture of coffee and brandy in order to combat the fatigue of the last few nights and of the wake which was beginning. He looked sad with that assumed sadness of the priest for whom death is a bread winner. 
He crossed himself and approaching with his professional gesture: Well my poor children I have come to help you pass these last sad hours. But Sister Eulalie suddenly arose. Thank you father but my brother and I prefer to remain alone with her. This is our last chance to see her and we wish to be together all three of us as we we used to be when we were small and our poor mo mother. Grief and tears stopped her; she could not continue. Once more serene the priest bowed thinking of his bed. As you wish my children. He kneeled crossed himself prayed arose and went out quietly murmuring: She was a saint. They remained alone the dead woman and her children. The ticking of the clock hidden in the shadow could be heard distinctly and through the open window drifted in the sweet smell of hay and of woods together with the soft moonlight. No other noise could be heard over the land except the occasional croaking of the frog or the chirping of some belated insect. An infinite peace a divine melancholy a silent serenity surrounded this dead woman seemed to be breathed out from her and to appease nature itself. Then the judge still kneeling his head buried in the bed clothes cried in a voice altered by grief and deadened by the sheets and blankets: Mamma mamma mamma And his sister frantically striking her forehead against the woodwork convulsed twitching and trembling as in an epileptic fit moaned: Jesus Jesus mamma Jesus And both of them shaken by a storm of grief gasped and choked. The crisis slowly calmed down and they began to weep quietly just as on the sea when a calm follows a squall. A rather long time passed and they arose and looked at their dead. And the memories those distant memories yesterday so dear today so torturing came to their minds with all the little forgotten details those little intimate familiar details which bring back to life the one who has left. 
They recalled to each other circumstances words smiles intonations of the mother who was no longer to speak to them. They saw her again happy and calm. They remembered things which she had said and a little motion of the hand like beating time which she often used when emphasizing something important. And they loved her as they never had loved her before. They measured the depth of their grief and thus they discovered how lonely they would find themselves. It was their prop their guide their whole youth all the best part of their lives which was disappearing. It was their bond with life their mother their mamma the connecting link with their forefathers which they would thenceforth miss. They now became solitary lonely beings; they could no longer look back. The nun said to her brother: You remember how mamma used always to read her old letters; they are all there in that drawer. Let us in turn read them; let us live her whole life through tonight beside her It would be like a road to the cross like making the acquaintance of her mother of our grandparents whom we never knew but whose letters are there and of whom she so often spoke do you remember. Out of the drawer they took about ten little packages of yellow paper tied with care and arranged one beside the other. They threw these relics on the bed and chose one of them on which the word Father was written. They opened and read it. It was one of those old fashioned letters which one finds in old family desk drawers those epistles which smell of another century. The first one started: My dear another one: My beautiful little girl others: My dear child or: My dear laughter And suddenly the nun began to read aloud to read over to the dead woman her whole history all her tender memories. The judge resting his elbow on the bed was listening with his eyes fastened on his mother. The motionless body seemed happy. 
Sister Eulalie interrupting herself said suddenly:These ought to be put in the grave with her; they ought to be used as a shroud and she ought to be buried in it. She took another package on which no name was written. She began to read in a firm voice: My adored one I love you wildly. Since yesterday I have been suffering the tortures of the damned haunted by our memory. I feel your lips against mine your eyes in mine your breast against mine. I love you I love you. You have driven me mad. My arms open I gasp moved by a wild desire to hold you again. My whole soul and body cries out for you wants you. I have kept in my mouth the taste of your kisses. The judge had straightened himself up. The nun stopped reading. He snatched the letter from her and looked for the signature. There was none but only under the words The man who adores you the name Henry Their father's name was Rene. Therefore this was not from him. The son then quickly rummaged through the package of letters took one out and read: I can no longer live without your caresses Standing erect severe as when sitting on the bench he looked unmoved at the dead woman. The nun straight as a statue tears trembling in the corners of her eyes was watching her brother waiting. Then he crossed the room slowly went to the window and stood there gazing out into the dark night. When he turned around again Sister Eulalie her eyes dry now was still standing near the bed her head bent down. He stepped forward quickly picked up the letters and threw them pell mell back into the drawer. Then he closed the curtains of the bed. When daylight made the candles on the table turn pale the son slowly left his armchair and without looking again at the mother upon whom he had passed sentence severing the tie that united her to son and daughter he said slowly: Let us now retire sister."""

In [None]:
def train_bert_mlm(dataloader, vocab_size, index_to_event, hidden_size=256, num_layers=2, num_heads=4, epochs=3, verbose=True):
    """Train a small BERT masked-language model on event sequences.

    Args:
        dataloader: Yields ``(user_ids, sequences, masked_sequences,
            attention_mask, masked_positions)`` batches, where
            ``masked_positions`` holds one collection of indices per sample.
        vocab_size: Size of the vocabulary.
        index_to_event: Mapping from index to event string, used for printing.
        hidden_size, num_layers, num_heads: Model dimensions.
        epochs: Number of passes over ``dataloader``.
        verbose: When True (the default) print original / masked / predicted
            sequences for every sample of every step.

    Returns:
        The trained ``BERTMLM`` instance.
    """
    ignore_label = -100  # label value the masked-LM loss skips

    model = BERTMLM(vocab_size, hidden_size, num_layers, num_heads)
    model.train()
    # transformers.AdamW is deprecated in favour of the PyTorch optimizer;
    # eps and weight_decay are set to the deprecated class's defaults.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

    for epoch in range(epochs):
        for step, batch in enumerate(dataloader):
            user_ids, sequences, masked_sequences, attention_mask, masked_positions = batch

            # Supervise only the positions that were actually masked. Passing
            # the full sequence as labels would also score unmasked tokens and
            # padding, which the model can simply copy from its input.
            labels = torch.full_like(sequences, ignore_label)
            for row, positions in enumerate(masked_positions):
                positions = torch.as_tensor(positions, dtype=torch.long)
                if positions.numel() > 0:
                    labels[row, positions] = sequences[row, positions]

            if not (labels != ignore_label).any():
                # Random masking selected nothing in this batch; the mean loss
                # over zero targets would be NaN, so skip the update.
                continue

            optimizer.zero_grad()
            loss, logits = model(input_ids=masked_sequences, attention_mask=attention_mask, labels=labels)
            loss.backward()
            optimizer.step()

            if verbose:
                with torch.no_grad():
                    predictions = logits.argmax(dim=-1)
                    for user_id, seq, pred_seq, mask_pos, masked_seq in zip(user_ids, sequences, predictions, masked_positions, masked_sequences):
                        # Show the original with only the masked slots
                        # replaced by the model's predictions.
                        pred_sequence = seq.clone()
                        for pos in mask_pos:
                            pred_sequence[pos] = pred_seq[pos]

                        print(f"User ID: {user_id}")
                        print("Original Sequence (Events):", [index_to_event[idx.item()] for idx in seq])
                        print("Masked Sequence (Events):", [index_to_event[idx.item()] for idx in masked_seq])
                        print("Predicted Sequence (Events):", [index_to_event[idx.item()] for idx in pred_sequence])
                        print()
                        print("Original Sequence (Tensor):", seq)
                        print("Masked Sequence (Tensor):", masked_seq)
                        print("Predicted Sequence (Tensor):", pred_sequence)
                        print()
                        print("Masked Positions:", mask_pos)  # Print the actual masked positions
                        print()
            print(f"Epoch {epoch + 1}, Step {step + 1}, Loss: {loss.item()}")

    return model



In [None]:
# Masking, shuffling and weight initialisation are all random; seed so that a
# fresh "Restart & Run All" reproduces the same run.
torch.manual_seed(0)

# One sentence = one "user"; the words of the sentence are that user's events.
# Collapse every run of whitespace (double spaces, embedded newlines) to one
# space so the space separator never yields empty or newline-prefixed tokens,
# and drop fragments that are empty after normalisation.
sentences = [" ".join(fragment.split()) for fragment in text.split(". ")]
sentences = [sentence for sentence in sentences if sentence]
data = [{"User ID": i+1, "Sequence of events": sentence} for i, sentence in enumerate(sentences)]

# Initialize processor
processor = EventSequenceProcessor(separator=' ')
processor.add_data(data)
processor.fix_vocabulary()

dictionary = processor.get_dictionary()
print("Dictionary (event to index):")
# Preview only: dumping the whole vocabulary floods the cell output.
print(dict(list(dictionary.items())[:20]))
print(f"... ({len(dictionary)} entries in total)")
print()

index_to_event = processor.get_index_to_event_mapping()
dataloader, vocab_size, max_length = processor.get_dataloader(batch_size=2)
# Bind the result so the cell does not echo it (it is None if the trainer
# returns nothing).
training_result = train_bert_mlm(dataloader, vocab_size, index_to_event, hidden_size=256, num_layers=2, num_heads=4, epochs=3)

Dictionary (event to index):
{'START': 0, 'END': 1, 'PAD': 2, 'UNK': 3, 'MASK': 4, 'The': 5, 'a': 6, 'woman': 7, 'died': 8, 'whose': 9, 'quietly': 10, 'been': 11, 'pain': 12, 'as': 13, 'had': 14, 'life': 15, 'without': 16, 'blameless': 17, 'should': 18, '': 19, 'bed': 20, 'arranged': 21, 'features': 22, 'closed': 23, 'eyes': 24, 'done': 25, 'it': 26, 'hair': 27, 'minutes': 28, 'in': 29, 'dying': 30, 'her': 31, 'lying': 32, 'was': 33, 'calm': 34, 'carefully': 35, 'on': 36, 'before': 37, 'resting': 38, 'back': 39, 'ten': 40, 'though': 41, 'Now': 42, 'white': 43, 'she': 44, 'up': 45, 'long': 46, 'lived': 47, 'death': 48, 'that': 49, 'countenance': 50, 'so': 51, 'whole': 52, 'one': 53, 'resigned': 54, 'old': 55, 'of': 56, 'pale': 57, 'sweet': 58, 'the': 59, 'could': 60, 'easy': 61, 'pure': 62, 'dead': 63, 'this': 64, 'existence': 65, 'collected': 66, 'how': 67, 'quiet': 68, 'and': 69, 'body': 70, 'soul': 71, 'led': 72, 'what': 73, 'parent': 74, 'feel': 75, 'would': 76, 'magistrate': 77, 'K

  masked_sequences = [torch.tensor(seq) for seq in masked_sequences]


User ID: 17
Original Sequence (Events): ['START', 'This', 'is', 'our', 'last', 'chance', 'to', 'see', 'her', 'and', 'we', 'wish', 'to', 'be', 'together', 'all', 'three', 'of', 'us', 'as', 'we', 'we', 'used', 'to', 'be', 'when', 'we', 'were', 'small', 'and', 'our', 'poor', 'mo', 'mother', 'END', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
Masked Sequence (Events): ['START', 'This', 'is', 'our', 'last', 'chance', 'MASK', 'see', 'her', 'and', 'we', 'MASK', 'to', 'be', 'together', 'all', 'three', 'of', 'us', 'as', 'we', 'we', 'MASK', 'to', 'be', 'when', 'we', 'MASK', 'MASK', 'MASK', 'MASK', 'poor', 'mo', 'mother', 'END', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD',

In [None]:
!apt-get install git


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.11).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
git

# New Section