In [23]:

import pandas as pd
import torch
import numpy as np

In [24]:
import string

def read_book(file_path):
    """Read the content of the book from the file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def write_unique_words(words, output_path):
    """Write each unique word to the output file, one word per line."""
    with open(output_path, 'w', encoding='utf-8') as file:
        for word in words:
            file.write(word + '\n')

def process_text(text):
    """Process the text to normalize and extract unique words."""
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Split text into words
    words = text.split()
    # Get unique words
    unique_words = set(words)
    return unique_words

def main(input_file, output_file):
    """Main function to read, process, and write the text."""
    # Read the book
    text = read_book(input_file)
    # Process the text to get unique words
    unique_words = process_text(text)
    # Write unique words to the output file
    write_unique_words(unique_words, output_file)

input_file = './book.txt'
output_file = './words.txt'
main(input_file, output_file)

In [25]:

TOKENIZE_LAMBDA = lambda x : list(x)
VOCAB = {c : i+2 for i, c in enumerate(list('abcdefghijklmnopqrstuvwxyz'))}
VOCAB['_'] = 1 # mask
VOCAB['$'] = 0 # pad
VOCAB['%'] = len(VOCAB) # sep
PAD_IDX = VOCAB['$']

VOCAB_TRANSFORM_LAMBDA = lambda toks : torch.tensor(np.vectorize(lambda x : VOCAB[x])(toks) )
TOKENIZE_LAMBDA = lambda x : list(x)
TEXT_TRANSFORM_LAMBDA = lambda x : VOCAB_TRANSFORM_LAMBDA(TOKENIZE_LAMBDA(x))

In [26]:
TXT_FILENAME = output_file
import re
import numpy as np
import random as rand
import math
class DataTools:

  @staticmethod
  def mask_characters(word, num_chars_to_mask=-1, to_mask=False):
      if type(word) == float:
          print(word)
          word = str(word)
      if not to_mask:
          to_mask = rand.random()*0.9
      if num_chars_to_mask == -1:
          num_chars_to_mask = int(max([1, int(to_mask*len(set(word)))]))

      # Ensure num_chars_to_mask is within the valid range
      if num_chars_to_mask < 1 or num_chars_to_mask > len(set(word)):
          raise ValueError("Number of characters to mask is out of range")

      # Select unique characters from the word
      unique_chars = list(set(word))

      # Randomly choose num_chars_to_mask characters to mask
      chars_to_mask = rand.sample(unique_chars, num_chars_to_mask)

      # Create the masked word
      masked_word = ''.join(['_' if char in chars_to_mask else char for char in word])

      cannot_guess = sorted(list(set("abcdefghijklmnop") - (set(word))))

      random_guessed_letters = rand.sample(cannot_guess, min(6, int(rand.random()*len(cannot_guess))+1))

      label = DataTools.get_label(word, masked_word, to_mask)
      for c in random_guessed_letters:
          label[VOCAB[c]] = 0#-1

      return masked_word + "%" + ''.join(sorted(random_guessed_letters)), label


  @staticmethod
  def get_dictionary_df(txt_filename=TXT_FILENAME):
      return pd.read_csv(filepath_or_buffer=txt_filename, encoding="utf8", names=['words'])
  
  @staticmethod
  def get_label(word, masked, to_mask=False):
      guessable = set(word) - (set(masked))
      freqs = [0]*len(VOCAB)

      for c in word:
          freqs[VOCAB[c]] += 1 + (freqs[VOCAB[c]]) if c in guessable else 0  

      for c in range(len(freqs)):
          if freqs[c] == 0:
              freqs[c] = -torch.inf

      return torch.softmax(torch.tensor(freqs), dim=0)*math.sqrt(1-to_mask)*2


  @staticmethod
  def get_dataset_from_df(df):
      df['transformed'] = df['words'].apply(DataTools.mask_characters)
      df['src'] = df['transformed'].apply(lambda x : x[0])
      df['tgt'] = df['transformed'].apply(lambda x : x[1])

      src, tgt = list(df['src']), list(df['tgt'])
      return src, tgt


class DictionaryDataset(torch.utils.data.Dataset):
    def __init__(self, txt_filename=TXT_FILENAME, truncate=False) -> None:
        # load data + preprocess
        df = DataTools.get_dictionary_df(txt_filename)[:truncate] if truncate else DataTools.get_dictionary_df(txt_filename)
        self.src, self.tgt = DataTools.get_dataset_from_df(df)
        self.src = list(self.src)
        self.tgt = self.tgt

    def __getitem__(self, idx) -> torch.Tensor:
        return self.src[idx], self.tgt[idx]

    def __len__(self):
        return len(self.src)

In [27]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer, TransformerEncoder, TransformerEncoderLayer
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size


    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

class MaskedLmTransformer(nn.Module):
    def __init__(self,
                 num_encoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(MaskedLmTransformer, self).__init__()
        self.encoder_layer = TransformerEncoderLayer(d_model=emb_size,
                                       nhead=nhead,
                                       dim_feedforward=dim_feedforward,
                                       dropout=dropout)
        self.encoder = TransformerEncoder(encoder_layer=self.encoder_layer, num_layers=num_encoder_layers)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)
        self.src_tok_emb = TokenEmbedding(vocab_size, emb_size)
        self.generator = nn.Linear(emb_size, vocab_size)

    def forward(self, word):
        embedded = self.src_tok_emb(word)
        emb_1 = self.positional_encoding(embedded)

        encoded = self.encoder(emb_1)
        encoded = encoded.mean(1)

        return self.generator(encoded)

In [28]:
from torch.utils.data import DataLoader
from timeit import default_timer as timer
from torch.nn.utils.rnn import pad_sequence
NUM_EPOCHS = 15


class Trainer:
  @staticmethod
  def get_train_test_iter():
    total_data_iter = DictionaryDataset()#truncate=100)
    train_size = int(0.8 * len(total_data_iter))
    test_size = len(total_data_iter) - train_size
    train_iter, test_iter = torch.utils.data.random_split(total_data_iter, [train_size, test_size])
    return train_iter, test_iter

  @staticmethod
  def train_epoch(model, optimizer, loss_fn,train_iter, batch_size):
      model.train()
      losses = 0
      train_dataloader = DataLoader(train_iter, batch_size=batch_size, collate_fn=Trainer.collate_fn)

      for src, tgt in train_dataloader:
          src = src.to(DEVICE)
          tgt = tgt.to(DEVICE)#, dtype=torch.float)

          optimizer.zero_grad()

          logits = model(src)

          loss = loss_fn(logits, tgt)
          loss.backward()

          optimizer.step()
          losses += loss.item()

      return losses/len(list(train_dataloader))

  @staticmethod
  def evaluate(model, test_iter, loss_fn,batch_size):
      model.eval()
      losses = 0

      val_dataloader = DataLoader(test_iter, batch_size=batch_size, collate_fn=Trainer.collate_fn)


      for src, tgt in val_dataloader:
          src = src.to(DEVICE)
          tgt = tgt.to(DEVICE)#, dtype=torch.float)

          logits = model(src)

          loss = loss_fn(logits, tgt)
          losses += loss.item()

      return losses / len(list(val_dataloader))
  
  @staticmethod
  def train(model, optimizer, loss_fn,num_epochs, batch_size):
    train_iter, test_iter = Trainer.get_train_test_iter()
    for epoch in range(1, num_epochs+1):
        start_time = timer()
        train_loss = Trainer.train_epoch(model, optimizer, loss_fn, train_iter, batch_size)
        end_time = timer()
        val_loss = Trainer.evaluate(model, test_iter, loss_fn, batch_size)
        print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))
  
  @staticmethod
  def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(TEXT_TRANSFORM_LAMBDA(src_sample))
        tgt_batch.append((list(tgt_sample)))
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)
    return src_batch, torch.tensor(tgt_batch)

In [29]:
torch.manual_seed(0)

SRC_VOCAB_SIZE = len(VOCAB)
EMB_SIZE = 26
NHEAD = 2
FFN_HID_DIM = 26
BATCH_SIZE = 128
NUM_ENCODER_LAYERS = 10

transformer = MaskedLmTransformer(
    num_encoder_layers=NUM_ENCODER_LAYERS,
    emb_size=EMB_SIZE,
    nhead=NHEAD,
    vocab_size=SRC_VOCAB_SIZE,
    dim_feedforward=FFN_HID_DIM,
    dropout= 0.1
)

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss()


optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [30]:
# The model was already trained over Google Colab

# Uncomment this code if you wish to train the model again 
# Trainer.train(transformer, optimizer, loss_fn, NUM_EPOCHS, BATCH_SIZE)

# Uncomment this code to save the model into a file after training
# torch.save(transformer, f="/content/drive/My Drive/saved_model")

In [31]:
def get_liks(word, transformer):
    prediction = transformer(torch.tensor([list(TEXT_TRANSFORM_LAMBDA(word))]))
    return prediction[0]

def guess_letter(masked, excluded, transformer):
    print(masked)
    liks = get_liks(masked, transformer=transformer)
    reverse_voca_lookup = {VOCAB[k]:k for k in VOCAB}
    pairs = {reverse_voca_lookup[i]:liks[i] for i in range(len(liks))}
    for c in excluded:
        pairs.pop(c)
    pairs = dict(sorted(pairs.items(), key=lambda item: item[1], reverse=True))
    lets = [p[0] for p in pairs]
    return lets[0], lets, pairs


def play_game(model, word):
    if type(model) == str:
        model = torch.load(model)
    masked = '_'*len(word)

    wrongs = 0
    guessed = set()
    wrong_guesses = set()

    while(wrongs < 6):
        in_word = set(masked) - {'_'}
        guess, _, _ = guess_letter(masked, excluded=guessed.union(in_word), transformer=model)
        masked = ''.join([word[i] if word[i] == guess else masked[i] for i in range(len(word))]) + "%" + ''.join(sorted(list(wrong_guesses)))
        print(guess, masked)

        guessed.add(guess)
        if guess not in set(word):
            wrongs+=1
            wrong_guesses.add(guess)

        if masked[0:masked.find("%")]==word:
            print("YOU WON!")
            print(wrongs, " wrong guesses")
            break



In [34]:
play_game("rose", "laceration")


__________
e ___e______%
___e______%
i ___e___i__%
___e___i__%
o ___e___io_%
___e___io_%
h ___e___io_%
___e___io_%
a _a_e_a_io_%h
_a_e_a_io_%h
r _a_era_io_%h
_a_era_io_%h
l la_era_io_%h
la_era_io_%h
s la_era_io_%h
la_era_io_%h
m la_era_io_%hs
la_era_io_%hs
t la_eratio_%hms
la_eratio_%hms
n la_eration%hms
la_eration%hms
p la_eration%hms
la_eration%hms
f la_eration%hmps
la_eration%hmps
c laceration%fhmps
YOU WON!
5  wrong guesses
