In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer
from sklearn.model_selection import train_test_split
from torch.optim import AdamW


def load_data(lines):
    dataset_dict = {}
    for line in lines:
        line = line.strip()

        words = line.split()
        if len(words) > 2: 
            features = words[:-1]
            features = " ".join(features)
            label = " ".join(words[-2:])
            label = label.strip()
            dataset_dict[features] = label  

        
    return dataset_dict

with open("text.txt", "r") as file:
    lines = file.readlines()



  from .autonotebook import tqdm as notebook_tqdm


In [6]:

class TextDataset(Dataset):
    def __init__(self, data_dict, tokenizer, max_lenght= 50):

        self.data = list(data_dict.items())
        self.tokenizer = tokenizer
        self.max_length = max_lenght

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        features, label = self.data[idx]
        feature_encoding = self.tokenizer(features, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')['input_ids'].squeeze()
        label_encoding = self.tokenizer(label, truncation=True, padding='max_length', max_length=10, return_tensors='pt')['input_ids'].squeeze()
    
        return feature_encoding, label_encoding
    
def collate_fn(batch):

    feature_encoding = []
    label_encoding = []

    for feature, label in batch:
        feature_encoding.append(feature)
        label_encoding.append(label)

    features = torch.stack(feature_encoding)
    labels = torch.stack(label_encoding)

    return features, labels


In [7]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token 

dataset = TextDataset(load_data(lines), tokenizer)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)

In [26]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.version.cuda)

class TextGenerationModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_size, output_size):
        super(TextGenerationModel, self).__init__() 
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_size, batch_first=True, num_layers= 2)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2) 
        self.fc2 = nn.Linear(hidden_size // 2, hidden_size // 4)   
        self.fc3 = nn.Linear(hidden_size // 4, output_size) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.emb(x)  
        x, _ = self.lstm(x)
        x = self.relu(self.fc1(x))
        x = self.dropout(x) 
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x  
    

model = TextGenerationModel(vocab_size=tokenizer.vocab_size, emb_dim=512, hidden_size=256, output_size=tokenizer.vocab_size).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = AdamW(model.parameters(), lr=0.001)

num_epochs = 20

for epoch in range(num_epochs):
    model.train()

    for i , (features, labels) in enumerate(dataloader):
        features, labels = features.to(device), labels.to(device)
        
        outputs = model(features)
        outputs = outputs[:, -10:, :] 

        print("Outputs shape:", outputs.shape)  # Should be [batch_size, seq_len, vocab_size]
        print("Labels shape:", labels.shape)      # Should be [batch_size, seq_len
        print("Features shape:", features.shape)  # Should be [batch_size, seq_len]

        outputs = outputs.reshape(-1, tokenizer.vocab_size)  # [batch_size*seq_len, vocab_size]
        labels = labels.reshape(-1)    

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print("Epoch finished {}/{}. Loss: {:.4f}".format(epoch+1, num_epochs, loss.item()))

None
Outputs shape: torch.Size([32, 10, 50257])
Labels shape: torch.Size([32, 10])
Features shape: torch.Size([32, 50])
Outputs shape: torch.Size([32, 10, 50257])
Labels shape: torch.Size([32, 10])
Features shape: torch.Size([32, 50])
Outputs shape: torch.Size([32, 10, 50257])
Labels shape: torch.Size([32, 10])
Features shape: torch.Size([32, 50])
Outputs shape: torch.Size([32, 10, 50257])
Labels shape: torch.Size([32, 10])
Features shape: torch.Size([32, 50])
Outputs shape: torch.Size([32, 10, 50257])
Labels shape: torch.Size([32, 10])
Features shape: torch.Size([32, 50])


KeyboardInterrupt: 

In [None]:
def eval_encoding(eval_line):
    eval_dict = []

    words = eval_line.split(" ")

    if len(words) > 1:
        features = words
        eval_dict.append(" ".join(features))

    return eval_dict


class InputData():
    def __init__(self, text, tokenizer, max_length=50):
        self.text = text
        self.tokenizer = tokenizer

        self.max_length = max_length

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        text = self.text[idx]
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')['input_ids'].squeeze()

        return {'input_ids': encoding}
    
def eval_collate_fn(batch):

    input_ids = torch.stack([item['input_ids'] for item in batch])
    return input_ids



model.eval()

with open("eval.txt", "r") as file:
    eval_line = file.read()


testdataset = InputData(eval_encoding(eval_line), tokenizer)
test_loader = DataLoader(testdataset, batch_size=32, collate_fn=eval_collate_fn, shuffle=False)

with torch.no_grad():
    for i , (features) in enumerate(test_loader):
        features = features.to(device)
        outputs = model(features)
        class_preds = torch.argmax(outputs, dim=1)
        print(GPT2Tokenizer.from_pretrained('gpt2').convert_ids_to_tokens(class_preds))
        print(class_preds)

['s']
tensor([82], device='cuda:0')
