In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import random
import numpy as np
import tqdm
import math
from torch.autograd import Variable

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def build_dictionary(dictionary_file_location):
        text_file = open(dictionary_file_location,"r")
        full_dictionary = text_file.read().splitlines()
        text_file.close()
        return full_dictionary
    
full_dictionary_location = "words_250000_train.txt"
full_dictionary = build_dictionary(full_dictionary_location)

In [4]:
class Dict_Dataset(Dataset):
    def __init__(self, dictionary):
        self.words = dictionary
        # Create a mapping from each letter to its corresponding index (0-25)
        self.alphabets = 'abcdefghijklmnopqrstuvwxyz'
        self.CHAR_TO_INDEX = {char: idx for idx, char in enumerate(self.alphabets)}

        self.letter_weight = {}
        for i in dictionary:
            for l in self.alphabets:
                if l in i:
                    if self.letter_weight.get(l):
                        self.letter_weight[l] += 1
                    else:
                        self.letter_weight[l] = 1

    def __len__(self):
        return len(self.words)
    
    def cnt_to_guesses(self,char_set,cnt):
        lst = list(char_set)
        return list(np.random.choice(lst, cnt, p=[self.letter_weight[i] for i in lst]/np.sum([self.letter_weight[i] for i in lst]), replace=False))
    
    def one_hot_encode(self,char):
        """Convert a character to a one-hot vector."""
        vec = torch.zeros(28)
        vec[self.CHAR_TO_INDEX[char]] = 1.0
        return vec

    def word_to_matrix(self,word, correct_guesses,wrong_guesses):
        word = word.lower()  # Ensure the word is lowercase
        matrix = torch.zeros(38, 28)  # Initialize a (38, 28) matrix with zeros 27,28 digits for depicting blanks and wrong vector respect.
        # matrix[:len(word),26]=1  ## depicting word length in our sequence
        # print(matrix)
        for i, char in enumerate(word):
            if char in correct_guesses:
                # print(char)
                matrix[i] = self.one_hot_encode(char)
        # print('rssa')
        for i, char in enumerate(wrong_guesses):
            # print(char)
            matrix[32+i] = self.one_hot_encode(char)  
            matrix[32+i,27]=1
        matrix[:len(word),26]=1  ## depicting word length in our sequence
        return matrix

    def multi_encode(self,set_char):
        string = ''.join(set_char)
        vec = torch.zeros(26)
        for char in string:
            vec[self.CHAR_TO_INDEX[char]] = 1.0
        return vec
    
    def __getitem__(self, idx):
        wrd = self.words[idx]
        set_alpha = set(wrd)

        if len(set_alpha)==1:
            return self.word_to_matrix( wrd, '', '' ), self.multi_encode(set_alpha)
        
        if 'e' in set_alpha:
            cnt_correct_guess = np.random.randint(len(set_alpha)-1) ## -1 for e, -1 for atleast one unguessed
            cnt_incorrect_guess = np.random.randint(6) ## 0 to 5
            correct_guesses = self.cnt_to_guesses(set_alpha-set('e'),cnt_correct_guess)
            wrong_guesses = self.cnt_to_guesses(set(self.alphabets)-set_alpha,cnt_incorrect_guess)
            return self.word_to_matrix( wrd, ''.join(correct_guesses)+'e', ''.join(wrong_guesses) ), self.multi_encode(set_alpha - set(''.join(correct_guesses)+'e'))
        
        elif 'a' in set_alpha:
            cnt_correct_guess = np.random.randint(len(set_alpha)-1) ## -1 for a, -1 for atleast one unguessed
            cnt_incorrect_guess = np.random.randint(5) ## 0 to 4 , one for 'e'
            correct_guesses = self.cnt_to_guesses(set_alpha-set('a'),cnt_correct_guess)
            wrong_guesses = self.cnt_to_guesses(set(self.alphabets)-set_alpha-set('e'),cnt_incorrect_guess)
            return self.word_to_matrix( wrd, ''.join(correct_guesses)+'a', ''.join(wrong_guesses)+'e' ), self.multi_encode(set_alpha - set(''.join(correct_guesses)+'a'))
        
        elif 'i' in set_alpha:
            cnt_correct_guess = np.random.randint(len(set_alpha)-1) ## -1 for i, -1 for atleast one unguessed
            cnt_incorrect_guess = np.random.randint(4) ## 0 to 3 , two for 'e','a'
            correct_guesses = self.cnt_to_guesses(set_alpha-set('i'),cnt_correct_guess)
            wrong_guesses = self.cnt_to_guesses(set(self.alphabets)-set_alpha-set('ea'),cnt_incorrect_guess)
            return self.word_to_matrix( wrd, ''.join(correct_guesses)+'i', ''.join(wrong_guesses)+'ea' ), self.multi_encode(set_alpha - set(''.join(correct_guesses)+'i'))
        
        elif 'o' in set_alpha:
            cnt_correct_guess = np.random.randint(len(set_alpha)-1) ## -1 for o, -1 for atleast one unguessed
            cnt_incorrect_guess = np.random.randint(3) ## 0 to 2 , three for 'e','a','i'
            correct_guesses = self.cnt_to_guesses(set_alpha-set('o'),cnt_correct_guess)
            wrong_guesses = self.cnt_to_guesses(set(self.alphabets)-set_alpha-set('eai'),cnt_incorrect_guess)
            return self.word_to_matrix( wrd, ''.join(correct_guesses)+'o', ''.join(wrong_guesses)+'eai' ), self.multi_encode(set_alpha - set(''.join(correct_guesses)+'o'))
        
        else:
            return self.word_to_matrix( wrd, '', 'eaio' ), self.multi_encode(set_alpha)


In [5]:
dataset = Dict_Dataset(full_dictionary)

In [6]:
class BiLSTMClassifier(nn.Module):
	def __init__(self, batch_size, output_size, hidden_size, vocab_size, embedding_length,device):
		super(BiLSTMClassifier, self).__init__()
		self.batch_size = batch_size
		self.output_size = output_size
		self.hidden_size = hidden_size
		self.vocab_size = vocab_size
		self.embedding_length = embedding_length
		self.device = device
		
		self.word_embeddings = nn.Linear(vocab_size, embedding_length)
		# self.word_embeddings.weight = nn.Parameter(weights, requires_grad=False) 
		self.lstm = nn.LSTM(embedding_length, hidden_size, bidirectional=True) # Our main hero for this tutorial
		self.label = nn.Linear(hidden_size, output_size)
		
	def forward(self, input_sentence, batch_size=None):
		input = self.word_embeddings(input_sentence) 
		input = input.permute(1, 0, 2) 
		if batch_size is None:
			h_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).to(self.device)) 
			c_0 = Variable(torch.zeros(2, self.batch_size, self.hidden_size).to(self.device)) 
		else:
			h_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).to(self.device))
			c_0 = Variable(torch.zeros(2, batch_size, self.hidden_size).to(self.device))
		output, (final_hidden_state, final_cell_state) = self.lstm(input, (h_0, c_0))
		final_output = self.label(final_hidden_state[-1]) 
		
		return final_output

In [7]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

In [21]:
# Hyperparameters
input_dim = 28  # Input size of each sequence element
seq_len = 38    # Sequence length
num_classes = 26  # Number of classes
embed_size = 48  # Embedding size
num_heads = 4    # Number of heads in multi-head attention
hidden_dim = 196  # Hidden dimension size in the feedforward layer
num_layers = 4  # Number of Transformer Encoder layers
dropout = 0.1    # Dropout rate

# Create a dataset and data loader
dataset = Dict_Dataset(full_dictionary)
train_set, val_set = torch.utils.data.random_split(dataset, [0.8, 0.2])
batch_size = 64
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,num_workers=6,drop_last=True)
val_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=True,num_workers=4,drop_last=True)
device = 'cuda:2'
# Initialize the model, loss function, and optimizer
model = BiLSTMClassifier(batch_size, num_classes, hidden_dim, input_dim, embed_size,device)
# model.load_state_dict(torch.load('models/bilstm_best'))

model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
max_epoch_loss = 99
num_epochs = 150
flag=0
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in tqdm.tqdm(train_dataloader):
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        # print(outputs)
        # stop
        loss = criterion(outputs.cpu(), labels)
        loss.backward()
        clip_gradient(model, 1e-1)
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(dataset)
    if epoch_loss < max_epoch_loss:
        flag = 1
        max_epoch_loss = epoch_loss
        torch.save(model.state_dict(), f'models/bilstm_{epoch}')
        torch.save(model.state_dict(), f'models/bilstm_best')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm.tqdm(val_dataloader):
            outputs = model(inputs.to(device)).cpu()
            # print(torch.argmax(outputs.data, 1))
            # print(labels)
            predicted = torch.argmax(outputs.data, 1)
            total += labels.size(0)
            # print(torch.sum(torch.Tensor([labels[ind,i] for ind,i in enumerate(predicted)])))
            correct += torch.sum(torch.Tensor([labels[ind,i] for ind,i in enumerate(predicted)])).item()
    
    accuracy = 100 * correct / total
    if flag==1:
        flag=0
        print(f'Accuracy: {accuracy:.2f}%')


print("Training complete.")


100%|██████████| 3551/3551 [01:12<00:00, 49.13it/s]


Epoch [1/150], Loss: 0.3284


100%|██████████| 710/710 [00:09<00:00, 72.13it/s]


Accuracy: 53.19%


100%|██████████| 3551/3551 [01:12<00:00, 49.21it/s]


Epoch [2/150], Loss: 0.2855


100%|██████████| 710/710 [00:09<00:00, 78.36it/s] 


Accuracy: 55.57%


100%|██████████| 3551/3551 [01:12<00:00, 49.06it/s]


Epoch [3/150], Loss: 0.2719


100%|██████████| 710/710 [00:09<00:00, 72.04it/s]


Accuracy: 60.13%


100%|██████████| 3551/3551 [01:15<00:00, 47.13it/s]


Epoch [4/150], Loss: 0.2647


100%|██████████| 710/710 [00:08<00:00, 79.18it/s] 


Accuracy: 62.00%


100%|██████████| 3551/3551 [01:13<00:00, 48.28it/s]


Epoch [5/150], Loss: 0.2599


100%|██████████| 710/710 [00:09<00:00, 71.97it/s] 


Accuracy: 63.07%


100%|██████████| 3551/3551 [01:11<00:00, 49.34it/s]


Epoch [6/150], Loss: 0.2576


100%|██████████| 710/710 [00:08<00:00, 87.76it/s] 


Accuracy: 64.23%


100%|██████████| 3551/3551 [01:10<00:00, 50.14it/s]


Epoch [7/150], Loss: 0.2555


100%|██████████| 710/710 [00:07<00:00, 93.50it/s] 


Accuracy: 65.07%


100%|██████████| 3551/3551 [01:16<00:00, 46.66it/s]


Epoch [8/150], Loss: 0.2534


100%|██████████| 710/710 [00:09<00:00, 74.92it/s]


Accuracy: 65.10%


100%|██████████| 3551/3551 [01:16<00:00, 46.50it/s]


Epoch [9/150], Loss: 0.2523


100%|██████████| 710/710 [00:07<00:00, 88.77it/s] 


Accuracy: 65.52%


100%|██████████| 3551/3551 [01:15<00:00, 47.34it/s]


Epoch [10/150], Loss: 0.2512


100%|██████████| 710/710 [00:09<00:00, 72.68it/s]


Accuracy: 65.45%


100%|██████████| 3551/3551 [00:53<00:00, 66.90it/s] 


Epoch [11/150], Loss: 0.2502


100%|██████████| 710/710 [00:11<00:00, 62.48it/s]


Accuracy: 66.17%


100%|██████████| 3551/3551 [00:33<00:00, 106.19it/s]


Epoch [12/150], Loss: 0.2495


100%|██████████| 710/710 [00:05<00:00, 137.93it/s]


Accuracy: 66.05%


100%|██████████| 3551/3551 [00:33<00:00, 105.36it/s]


Epoch [13/150], Loss: 0.2483


100%|██████████| 710/710 [00:04<00:00, 143.91it/s]


Accuracy: 66.51%


100%|██████████| 3551/3551 [00:33<00:00, 105.54it/s]


Epoch [14/150], Loss: 0.2478


100%|██████████| 710/710 [00:05<00:00, 136.71it/s]


Accuracy: 66.90%


100%|██████████| 3551/3551 [00:33<00:00, 104.53it/s]


Epoch [15/150], Loss: 0.2473


  5%|▍         | 34/710 [00:00<00:08, 80.99it/s]

In [9]:
# Hyperparameters
input_dim = 28  # Input size of each sequence element
seq_len = 38    # Sequence length
num_classes = 26  # Number of classes
embed_size = 48  # Embedding size
num_heads = 4    # Number of heads in multi-head attention
hidden_dim = 196  # Hidden dimension size in the feedforward layer
num_layers = 4  # Number of Transformer Encoder layers
dropout = 0.1    # Dropout rate

# Create a dataset and data loader
dataset = Dict_Dataset(full_dictionary)
train_set, val_set = torch.utils.data.random_split(dataset, [0.8, 0.2])
batch_size = 64
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,num_workers=6,drop_last=True)
val_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=True,num_workers=4,drop_last=True)
device = 'cuda:2'
# Initialize the model, loss function, and optimizer
model = BiLSTMClassifier(batch_size, num_classes, hidden_dim, input_dim, embed_size,device)
model.load_state_dict(torch.load('models/bilstm_best'))

model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
max_epoch_loss = 99
num_epochs = 150
flag=0
for epoch in range(1):
    # model.train()
    # running_loss = 0.0
    # for inputs, labels in tqdm.tqdm(train_dataloader):
    #     optimizer.zero_grad()
    #     outputs = model(inputs.to(device))
    #     # print(outputs)
    #     # stop
    #     loss = criterion(outputs.cpu(), labels)
    #     loss.backward()
    #     clip_gradient(model, 1e-1)
    #     optimizer.step()
        
    #     running_loss += loss.item() * inputs.size(0)

    # epoch_loss = running_loss / len(dataset)
    # if epoch_loss < max_epoch_loss:
    #     flag = 1
    #     max_epoch_loss = epoch_loss
    #     torch.save(model.state_dict(), f'models/bilstm_{epoch}')
    #     torch.save(model.state_dict(), f'models/bilstm_best')
    #     print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm.tqdm(val_dataloader):
            outputs = model(inputs.to(device)).cpu()
            # print(torch.argmax(outputs.data, 1))
            # print(labels)
            predicted = torch.argmax(outputs.data, 1)
            total += labels.size(0)
            # print(torch.sum(torch.Tensor([labels[ind,i] for ind,i in enumerate(predicted)])))
            correct += torch.sum(torch.Tensor([labels[ind,i] for ind,i in enumerate(predicted)])).item()
    
    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')


print("Training complete.")


100%|██████████| 710/710 [00:06<00:00, 116.70it/s]

Accuracy: 70.64%
Training complete.



