In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Load the tokenizer and the masked-LM model from the same pretrained
# checkpoint name.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForMaskedLM.from_pretrained(checkpoint)

In [None]:
import os

# Collect the lines of every regular file in the input directory.
# Lines are accumulated across files: rebinding `text` inside the loop would
# keep only the file visited last and leave `text` undefined when the
# directory holds no files.
directory = '/kaggle/input/ps2training'
text = []
# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
for filename in sorted(os.listdir(directory)):
    path = os.path.join(directory, filename)
    if os.path.isfile(path):
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(path, 'r', encoding='utf-8') as file:
            text.extend(file.readlines())


In [None]:
# Preview the first five entries as read from disk (fewer if the list is shorter).
text[:5]

In [None]:
# Drop the trailing newline character(s) that readlines() leaves on each entry.
text = [line.rstrip('\n') for line in text]

In [None]:
# Preview the first five entries again to check the current contents of `text`.
text[:5]

In [None]:
# Longest entry measured in characters, plus a margin of 10
# (presumably headroom for the special tokens the tokenizer adds — confirm).
longest_entry = max(len(word) for word in text)
# Never request more positions than the tokenizer reports the model supports;
# a larger max_length would pad sequences beyond what the model can embed.
max_length = min(longest_entry + 10, tokenizer.model_max_length)


In [None]:
# from transformers import BertTokenizer
# import torch

# # Initialize the BertTokenizer with the character-level option
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, tokenize_chinese_chars=False)

# # Define the maximum sequence length  # Adjust according to your requirements

# # Encode each letter separately
# encoded_tokens = [tokenizer.convert_tokens_to_ids(token) for token in text]

# # Apply padding and truncation
# padded_tokens = encoded_tokens[:max_length] + [tokenizer.pad_token_id] * (max_length - len(encoded_tokens))
# truncated_tokens = encoded_tokens[:max_length]

# # Convert to PyTorch tensors
# padded_tokens_tensor = torch.tensor(padded_tokens)
# truncated_tokens_tensor = torch.tensor(truncated_tokens)

# print("Padded tokens:", padded_tokens_tensor)
# print("Truncated tokens:", truncated_tokens_tensor)



In [None]:
# Tokenize every entry in a single call: each sequence is truncated or padded
# to `max_length` and the result is returned as PyTorch tensors.
inputs = tokenizer(
    text,
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_tensors='pt',
)

In [None]:
# Inspect the first encoded example.
# NOTE(review): slicing a BatchEncoding built by a Python (non-fast) tokenizer
# appears to be supported only by newer transformers releases — confirm
# against the installed version.
inputs[:1]

In [None]:
# Keep an independent copy of the token ids to serve as training targets.
# NOTE(review): the copy covers every position (padding included), so all of
# them contribute to the loss; Hugging Face models skip positions whose label
# is -100 — confirm that training on all positions is intended.
inputs['labels'] = inputs['input_ids'].clone().detach()

In [None]:
# Inspect the first encoded example again, now that a 'labels' entry exists.
inputs[:1]

In [None]:
# List the field names currently stored in the encoding.
inputs.keys()

In [None]:
# create random array of floats with equal dimensions to input_ids tensor
rand = torch.rand(inputs.input_ids.shape)
# create mask array: pick positions whose draw is below 0.30, but never the
# [CLS], [SEP] or [PAD] tokens. The ids are read from the tokenizer instead of
# being hard-coded (101 / 102 / 0 for bert-base-uncased) so the cell stays
# correct if a different checkpoint is loaded.
mask_arr = (
    (rand < 0.30)
    & (inputs.input_ids != tokenizer.cls_token_id)
    & (inputs.input_ids != tokenizer.sep_token_id)
    & (inputs.input_ids != tokenizer.pad_token_id)
)

In [None]:
# Display the random draws that the mask was derived from.
rand

In [None]:
# Show the mask row for the example at index 1 (True marks a position chosen for masking).
mask_arr[1]

In [None]:
# For every example, list the column positions that mask_arr flags as True.
selection = [
    torch.flatten(row_mask.nonzero()).tolist()
    for row_mask in mask_arr
]


In [None]:
# Overwrite the selected positions of each example with the tokenizer's
# [MASK] id (103 for bert-base-uncased) rather than a hard-coded literal,
# so the cell stays correct if a different checkpoint is loaded.
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = tokenizer.mask_token_id

In [None]:
# Display the token ids after the selected positions were overwritten.
inputs.input_ids

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a tokenizer encoding.

    Args:
        encodings: mapping from field name to per-example data. It must
            contain 'input_ids'; every value must be indexable by example
            (a tensor or a nested list). Both a tokenizer BatchEncoding and
            a plain dict are accepted.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors that own their data."""
        return {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # Key lookup (not attribute access) so plain dicts work as well.
        return len(self.encodings['input_ids'])

    @staticmethod
    def _as_tensor(value):
        """Copy `value` into a new tensor without the copy-construct warning."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) warns; clone().detach() is the
            # recommended equivalent and still yields an independent copy.
            return value.clone().detach()
        return torch.tensor(value)

In [None]:
# Wrap the tokenizer output so it can be indexed one example at a time.
dataset = MeditationsDataset(inputs)

In [None]:
# Serve the dataset in shuffled mini-batches of 16 examples.
loader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=16,
)

In [None]:
# Use the GPU when CUDA reports one as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model to the selected device (as the last expression, the cell
# also echoes the returned model).
model.to(device)

In [None]:
# The AdamW shipped with transformers is deprecated and missing from recent
# releases; PyTorch's implementation is the supported replacement.
from torch.optim import AdamW

# activate training mode
model.train()
# initialize optimizer; eps and weight_decay are spelled out to match the
# defaults of the transformers AdamW used previously (1e-6 and 0.0), because
# torch.optim.AdamW defaults to 1e-8 and 0.01.
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 11

for epoch in range(epochs):
    # wrap the dataloader so every epoch gets its own progress bar
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # clear gradients left over from the previous step
        optim.zero_grad()
        # move the three tensors the model consumes onto the training device
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        # forward pass; the loss is read from the returned output object
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        # backpropagate, then apply the parameter update
        loss.backward()
        optim.step()
        # show the current epoch and the latest loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
# Testing function
def test(model, dataloader, tokenizer):
    model.eval()
    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)
            predicted_indices = torch.argmax(outputs, dim=-1).tolist()
            predictions = [tokenizer.decode(indices) for indices in predicted_indices]
            print(predictions)


In [None]:
# Save the model: only its state_dict is written, to the relative path
# 'modelKaggle2.pth'.
torch.save(model.state_dict(), 'modelKaggle2.pth')