### Preparing Dataset 

In [18]:
import torch
from transformers import BertTokenizer, BertForMaskedLM, Trainer, TrainingArguments
from torch.utils.data import Dataset

class LetterDataset(Dataset):
    def __init__(self, words, tokenizer, max_length=31):
        self.words = words
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.words)
    
    def __getitem__(self, idx):
        word = self.words[idx]
        letters = " ".join(list(word))
        encoded = self.tokenizer(
            letters, 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors='pt'
        )
        input_ids = encoded.input_ids.squeeze()
        attention_mask = encoded.attention_mask.squeeze()
        labels = input_ids.clone()

        rand = torch.rand(input_ids.shape)
        mask_arr = (rand < 0.15) * (input_ids != self.tokenizer.pad_token_id) * \
                   (input_ids != self.tokenizer.cls_token_id) * (input_ids != self.tokenizer.sep_token_id)
        selection = torch.flatten(mask_arr.nonzero()).tolist()
        input_ids[selection] = self.tokenizer.mask_token_id

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

def preprocess_words(file_path):
    with open(file_path, 'r') as file:
        words = [line.strip() for line in file.readlines()]
    return words

# Loading the words
words = preprocess_words('words_250000_train.txt')

# Initializing the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Creating the dataset
dataset = LetterDataset(words, tokenizer)


In [12]:
tokenizer.mask_token_id


103

In [16]:
max(words,key=len)

'cyclotrimethylenetrinitramine'

In [19]:
tokenizer.decode(dataset[45222]['input_ids'])

'[CLS] c y c l o t r [MASK] [MASK] e [MASK] h y l e n e t r i [MASK] i t [MASK] a m i n e [SEP]'

In [20]:
len(max(words,key=len))

29

In [21]:
#getting the index of the maximun length word in the words list
max_len_word_index = words.index(max(words,key=len))
print(max_len_word_index)

45222


### Finetuning bert-base-uncased

In [29]:
from torch.utils.data import random_split

# Loading the pre-trained BERT model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Training 
training_args = TrainingArguments(
    output_dir='./results', 
    num_train_epochs=5,  
    per_device_train_batch_size=32, 
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",  
    save_strategy="epoch",  
    save_total_limit=2,
    logging_dir='./logs', 
    logging_steps=500,  
    load_best_model_at_end=True,  
)

# Initializing the Trainer
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Providing the validation dataset
)

# Fine-tuning the model
trainer.train()

# Evaluating the model
evaluation_results = trainer.evaluate()
print(evaluation_results)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss
1,0.0791,0.077556
2,0.0743,0.070027
3,0.0687,0.06669
4,0.0642,0.062969
5,0.0615,0.061639


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


{'eval_loss': 0.0617235042154789, 'eval_runtime': 55.6527, 'eval_samples_per_second': 816.852, 'eval_steps_per_second': 12.776, 'epoch': 5.0}


In [32]:
tokenizer.save_pretrained('hangman_model_finetuned_epoch5')
model.save_pretrained('hangman_model_finetuned_epoch5')

In [33]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import torch.nn.functional as F
import string
import random

def predict_non_guessed_letter(masked_word, guessed_letters):
    # Loading pre-trained BERT model and tokenizer
    model_name = 'hangman_model_finetuned_epoch5'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForMaskedLM.from_pretrained(model_name)

    # Tokenizing input
    input_ids = tokenizer.encode(masked_word, return_tensors='pt')

    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs.logits

    # List to hold the top 10 predictions for each masked position
    top_predictions = []

    # Get top 10 predictions for each masked token
    masked_positions = (input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    for masked_index in masked_positions:
        probs = F.softmax(predictions[0, masked_index], dim=-1)
        top_probs, top_indices = torch.topk(probs, 10)
        for prob, idx in zip(top_probs, top_indices):
            token = tokenizer.decode([idx.item()]).strip()
            top_predictions.append((token, prob.item()))

    # Sort the predictions based on probability in descending order
    top_predictions.sort(key=lambda x: x[1], reverse=True)
    print(top_predictions)
    # Find the first letter not in guessed letters
    for token, _ in top_predictions:
        if token not in guessed_letters:
            return token

    #random guess if no letter found in top predictions that is not in guessed letters
    all_letters = string.ascii_lowercase
    remaining_letters = [letter for letter in all_letters if letter not in guessed_letters]
    if remaining_letters:
        return random.choice(remaining_letters)

    return None  

masked_word = "c [MASK] t [MASK]"
guessed_letters = []
predicted_letter = predict_non_guessed_letter(masked_word, guessed_letters)
print(f"Predicted letter: {predicted_letter}")




[('a', 0.2978498935699463), ('u', 0.21998746693134308), ('o', 0.21487948298454285), ('s', 0.16822588443756104), ('e', 0.14847537875175476), ('a', 0.14816780388355255), ('i', 0.10130421072244644), ('t', 0.07874676585197449), ('o', 0.07848888635635376), ('e', 0.07282354682683945), ('h', 0.06299653649330139), ('i', 0.059931814670562744), ('y', 0.057597316801548004), ('y', 0.03955281525850296), ('u', 0.03754061833024025), ('r', 0.024258708581328392), ('h', 0.012607723474502563), ('s', 0.006888314615935087), ('r', 0.00618574908003211), ('t', 0.006039596162736416)]
Predicted letter: a
