In [1]:
import torch, torchdata, torchtext
from torch import nn
import torch.nn.functional as F

import random, math, time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

  from .autonotebook import tqdm as notebook_tqdm


cpu


In [2]:
torch.__version__

'1.13.1+cpu'

In [3]:
torchtext.__version__

'0.14.1'

Find a dataset about Python(or similar; not too big) - you guys can search code parrot by hugging face....(https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text) they have dataset for python - don’t take everything - it will be too big

# 1. Load data - Wiki Text

In [4]:
import datasets 
train = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="train")
test  = datasets.load_dataset("codeparrot/github-jupyter-code-to-text", split="test")
print(train, test)

Found cached dataset parquet (C:/Users/anhng/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Found cached dataset parquet (C:/Users/anhng/.cache/huggingface/datasets/codeparrot___parquet/codeparrot--github-jupyter-code-to-text-cf9b56d996fd17e1/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 47452
}) Dataset({
    features: ['repo_name', 'path', 'license', 'content'],
    num_rows: 11864
})


In [5]:
#Split New line Sentence
train_split = [split for text in train['content'] for split in text.split('\n') if split != ""]
test_split = [split for text in test['content'] for split in text.split('\n') if split != ""]

In [6]:
len(train_split), len(test_split)

(11367363, 2875424)

# 2. Preprocessing

In [7]:
from torchtext.data.utils import get_tokenizer
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

tokenized_dataset_train = yield_tokens(train_split[:int(len(train_split)/100)])
tokenized_dataset_test = yield_tokens(test_split[:int(len(test_split)/100)])

In [8]:
import re
import string

def preprocessing(text):
    # Remove HTML tags
    text = re.sub('<.*?>', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Split the text into words
    words = text.split()
    
    # Remove stop words
    stop_words = {'a', 'an', 'and', 'the', 'this', 'that', 'is'}
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    lemmatized_words = []
    for word in words:
        if word.endswith('s'):
            word = word[:-1]  # remove plural 's'
        lemmatized_words.append(word)
    
    # Join the words back into a string
    text = ' '.join(lemmatized_words)
    
    return text

In [9]:
from torchtext.vocab import build_vocab_from_iterator
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(preprocessing(text))

vocab = build_vocab_from_iterator(yield_tokens(train_split[:int(len(test_split)/100)]), min_freq=5) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print('Vocab Size',len(vocab))                         
print(vocab.get_itos()[:10])     

Vocab Size 2956
['<unk>', '<eos>', 'of', 'explanation', 'to', 'in', 'for', 'end', 'a', 'value']


In [10]:
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for item in vocab.get_itos():
        f.write("%s\n" % item)
    print('Done')

v = [line.rstrip() for line in open('vocab.txt', mode = 'r', encoding='utf-8')]
print('Vocab Size check', len(v))

Done
Vocab Size check 2956


In [11]:
import pickle
# Store data (serialize)
with open('vocab.pickle', 'wb') as handle:
    pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Load data (deserialize)
with open('vocab.pickle', 'rb') as handle:
    check_vocab = pickle.load(handle)
check_vocab #Good

Vocab()

# 3. Prepare the batch loader

In [12]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:       
        #appends eos so we know it ends....so model learn how to end...                             
        tokens = example.append('<eos>') #end of sentence
        #numericalize          
        tokens = [vocab[token] for token in example] 
        data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size 
    data = data[:num_batches * batch_size]                       
    data = data.view(batch_size, num_batches)        
    return data

In [13]:
batch_size = 64
train_data = get_data(tokenized_dataset_train, vocab, batch_size)
valid_data = get_data(tokenized_dataset_test, vocab, batch_size)

In [14]:
train_data.shape #[batch_size, all the next length]

torch.Size([64, 16592])

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.hid_dim = hid_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_encoding = PositionalEncoding(emb_dim, max_len=512)
        self.decoder_layers = nn.ModuleList([DecoderLayer(hid_dim, dropout_rate) for _ in range(num_layers)])
        self.fc = nn.Linear(hid_dim, vocab_size)

    def init_hidden(self, batch_size, device):
        # There is no hidden state in the Transformer, so we return None
        return None

    def forward(self, src, tgt, src_mask, tgt_mask):
        # src: [batch_size, src_len]
        # tgt: [batch_size, tgt_len]
        # src_mask: [batch_size, 1, 1, src_len]
        # tgt_mask: [batch_size, 1, tgt_len, tgt_len]

        embedded_tgt = self.embedding(tgt)
        # embedded_tgt: [batch_size, tgt_len, emb_dim]

        # Apply positional encoding
        embedded_tgt = self.positional_encoding(embedded_tgt)

        # Apply the decoder layers
        for layer in self.decoder_layers:
            embedded_tgt = layer(embedded_tgt, src_mask, tgt_mask)

        # Apply the final linear layer to get the predicted next word
        prediction = self.fc(embedded_tgt)
        # prediction: [batch_size, tgt_len, vocab_size]

        return prediction, None
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, dropout_rate):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(hid_dim, num_heads=8, dropout=dropout_rate)
        self.feed_forward = nn.Sequential(
            nn.Linear(hid_dim, 4*hid_dim),
            nn.ReLU(),
            nn.Linear(4*hid_dim, hid_dim),
            nn.Dropout(dropout_rate)
        )
        self.layer_norm1 = nn.LayerNorm(hid_dim)
        self.layer_norm2 = nn.LayerNorm(hid_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, tgt, memory, tgt_mask):
        # tgt: [batch_size, tgt_len, hid_dim]
        # memory: [batch_size, src_len, hid_dim]
        # tgt_mask: [batch_size, tgt_len, tgt_len]

        # Apply self-attention with dropout and residual connection
        tgt2, _ = self.self_attn(tgt, tgt, tgt, attn_mask=tgt_mask)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.layer_norm1(tgt)

        # Apply encoder-decoder attention with dropout and residual connection
        tgt2, _ = nn.MultiheadAttention(hid_dim, num_heads=8, dropout=dropout_rate)(tgt, memory, memory, attn_mask=None)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.layer_norm2(tgt)

        # Apply feedforward with dropout and residual connection
        tgt2 = self.feed_forward(tgt)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.layer_norm2(tgt)

        return tgt

# 5. Training

In [16]:
vocab_size = len(vocab)
emb_dim = 512                
hid_dim = 2048               
num_layers = 6              
dropout_rate = 0.1              
lr = 1e-4 
import torch.optim as optim
decoder = TransformerDecoder(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(decoder.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in decoder.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 309,719,948 trainable parameters


In [17]:
def get_batch(data, seq_len, idx):
    src = data[:, idx:idx+seq_len]
    target = data[:, idx+1:idx+seq_len+1]
    return src, target

In [18]:
def train(model, loader, optimizer, criterion, batch_size, seq_len, clip, device):
    
    model.train()
    epoch_loss = 0
    
    for src, trg in loader:
        
        src = src.to(device)
        trg = trg.to(device)
        
        optimizer.zero_grad()
        
        output, _ = model(src, trg[:,:-1])
        
        #trg    = [batch size, trg len]
        #output = [batch size, trg len - 1, output dim]
        output_dim = output.shape[-1]
        
        #the loss function only works on 2d inputs with 1d targets thus we need to flatten each of them
        output = output.reshape(-1, output_dim)
        trg    = trg[:,1:].reshape(-1)
        #trg    = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        #clip the gradients to prevent them from exploding (a common issue in RNNs)
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(loader)

In [19]:
def evaluate(model, loader, criterion, batch_size, seq_len, device):
    
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
    
        for src, trg in loader:

            src = src.to(device)
            trg = trg.to(device)

            output, _ = model(src, trg[:,:-1])

            #trg    = [batch size, trg len]
            #output = [batch size, trg len - 1, output dim]
            output_dim = output.shape[-1]

            #the loss function only works on 2d inputs with 1d targets thus we need to flatten each of them
            output = output.reshape(-1, output_dim)
            trg    = trg[:,1:].reshape(-1)
            #trg    = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(loader)

In [20]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [21]:
best_valid_loss = float('inf')
num_epochs = 5
clip = 1

save_path = f'{decoder.__class__.__name__}_general.pt'

train_losses = []
valid_losses = []

for epoch in range(num_epochs):

    start_time = time.time()

    train_loss = train(decoder, train_data, optimizer, criterion, batch_size, seq_len, clip, device)
    valid_loss = evaluate(decoder, valid_data, criterion, batch_size, seq_len, device)

    # for plotting
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(decoder.state_dict(), save_path)

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\tVal. Loss: {valid_loss:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')


NameError: name 'seq_len' is not defined

In [None]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [None]:
prompt = 'deep learning model'
max_seq_len = 30
seed = 0
            #superdiverse   more diverse
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0] 
#sample from this distribution higher probability will get more change
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
deep learning model will be used to open a model that can be used to classify all of these data to a model that will be used to calculate a dataset in a

0.7
deep learning model should be able to use a pretty good department or a particular model that would be used to optimize your model that you need to recover a small distribution of

0.75
deep learning model should be able to use one form which can be used by a powerful department or just more or more to 1 on a small group of individual science or

0.8
deep learning model should be able to use open first movie or equal than more or low than 1 or more likely than one sample with every pair of a line is being

1.0
deep learning model should be able to use open 70 to not free efficiently division dimension or high or white estimation

