In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


Find a dataset about Python(or similar; not too big) - you guys can search code parrot by hugging face....(https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text) they have dataset for python - don’t take everything - it will be too big

In [3]:
import datasets 
train = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="train")
test  = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="test")
print(train, test)

Found cached dataset parquet (C:/Users/anhng/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (C:/Users/anhng/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
}) Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


In [4]:
#Split New line Sentence
train_split = [split for text in train['content'] for split in text.split('\n') if split != ""]
test_split = [split for text in test['content'] for split in text.split('\n') if split != ""]

In [5]:
len(train_split), len(test_split)

(11367363, 2875424)

In [6]:
from torchtext.data.utils import get_tokenizer
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

tokenized_dataset_train = yield_tokens(train_split[:int(len(train_split)/100)])
tokenized_dataset_test = yield_tokens(test_split[:int(len(test_split)/100)])

In [7]:
import re
import string

def preprocessing(text):
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Split the text into words
    words = text.split()
    
    # Remove stop words
    stop_words = {'a', 'an', 'and', 'the', 'this', 'that', 'is'}
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    lemmatized_words = []
    for word in words:
        if word.endswith('s'):
            word = word[:-1]  # remove plural 's'
        lemmatized_words.append(word)
    
    # Join the words back into a string
    text = ' '.join(lemmatized_words)
    
    return text

In [8]:
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(preprocessing(text))

vocab = build_vocab_from_iterator(yield_tokens(train_split[:int(len(test_split)/100)]), min_freq=5) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print('Vocab Size',len(vocab))                         
print(vocab.get_itos()[:10])     

Vocab Size 2956
['<unk>', '<eos>', 'of', 'explanation', 'to', 'in', 'for', 'end', 'a', 'value']


In [9]:
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for item in vocab.get_itos():
        f.write("%s\n" % item)
    print('Done')

v = [line.rstrip() for line in open('vocab.txt', mode = 'r', encoding='utf-8')]
print('Vocab Size check', len(v))

Done
Vocab Size check 2956


In [10]:
import pickle
# Store data (serialize)
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize)
with open('vocab.pickle', 'rb') as handle:
    check_vocab = pickle.load(handle)
check_vocab #Good

Vocab()

In [11]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:       
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>') #end of sentence
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)        
    return data

In [12]:
batch_size = 128
train_data = get_data(tokenized_dataset_train, vocab, batch_size)
valid_data = get_data(tokenized_dataset_test, vocab, batch_size)

In [13]:
train_data.shape #[batch_size, all the next length]

torch.Size([128, 8296])

In [14]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size,emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                                        dropout = dropout_rate, batch_first = True)
        self.dropout = nn.Dropout(dropout_rate)
        #when you do LM, you look forward, so it does not make sense to do bidirectional
        self.fc = nn.Linear(hid_dim,vocab_size)

    def init_hidden(self, batch_size, device):
        #this function gonna be run in the beginning of the epoch
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)

        return hidden, cell #return as tuple

    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach() #removing this hidden from gradients graph
        cell =  cell.detach() #removing this hidden from gradients graph
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch_size, seq_len]

        #embed 
        embedded = self.embedding(src)
        #embed : [batch_size, seq_len, emb_dim]

        #send this to the lstm
        #we want to put hidden here... because we want to reset hidden .....
        output, hidden = self.lstm(embedded, hidden)
        #output : [batch_size, seq_len, hid_dim] ==> all hidden states
        #hidden : [batch_size, seq_len, hid_dim] ==> last hidden states from each layer

        output = self.dropout(output)
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab_size]
        return prediction, hidden

In [15]:
vocab_size = len(vocab)
emb_dim = 1024                
hid_dim = 1024               
num_layers = 2              
dropout_rate = 0.65              
lr = 1e-3     

In [16]:
model = LSTM(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 22,850,444 trainable parameters


In [17]:
def get_batch(data, seq_len, idx):
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [18]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #prevent gradient explosion - clip is basically 
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [19]:
def evaluate(model, data, criterion, batch_size, seq_len, device):
    
    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches


In [20]:
n_epochs = 30
seq_len = 50
clip = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'LSTM.pt')

    print(f'Epoch: {epoch+1}')
    print(f'Train Perplexity: {math.exp(train_loss):.3f}')
    print(f'Valid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

Epoch: 1
Train Perplexity: 9.812
Valid Perplexity: 8.015


                                                           

Epoch: 2
Train Perplexity: 6.820
Valid Perplexity: 7.195


                                                           

Epoch: 3
Train Perplexity: 6.179
Valid Perplexity: 6.904


                                                           

Epoch: 4
Train Perplexity: 5.823
Valid Perplexity: 6.703


                                                           

Epoch: 5
Train Perplexity: 5.574
Valid Perplexity: 6.569


                                                           

Epoch: 6
Train Perplexity: 5.362
Valid Perplexity: 6.465


                                                           

Epoch: 7
Train Perplexity: 5.170
Valid Perplexity: 6.373


                                                           

Epoch: 8
Train Perplexity: 5.016
Valid Perplexity: 6.323


                                                           

Epoch: 9
Train Perplexity: 4.868
Valid Perplexity: 6.306


                                                           

Epoch: 10
Train Perplexity: 4.732
Valid Perplexity: 6.271


Training:  95%|█████████▍| 156/165 [13:06<00:46,  5.17s/it]