<a href="https://colab.research.google.com/github/EliyaKaheni/LLM-Playground/blob/main/LSTM_CodeGen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
!pip install -U datasets



In [41]:
from datasets import load_dataset

# Load the Python configuration of the CodeSearchNet dataset.
dataset = load_dataset("code_search_net", "python")

Using the latest cached version of the dataset since code_search_net couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'python' at /root/.cache/huggingface/datasets/code_search_net/python/1.0.0/8f2524e6b62f65af5f5d65c53715c654db7b08dc93e0b7bcce2ab2f286a75be1 (last modified on Thu Jul 10 14:53:23 2025).


In [42]:
# Display the loaded object to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 412178
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 22176
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 23107
    })
})

In [43]:
# Bind each split of the loaded dataset to its own name.
train_data, validation_data, test_data = (
  dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [44]:
# Build the training corpus from the first 1000 training examples: every function's
# full source text, each terminated by a newline.
# A single str.join avoids re-copying the ever-growing string on each `+=`.
code_corpus = ''.join(
  example['whole_func_string'] + '\n'
  for example in train_data.select(range(1000))
)

In [45]:
import re

# Simple whitespace tokenizer for the code corpus.
text = code_corpus.strip()

# Keep line breaks as an explicit <NL> token so they survive str.split().
text = text.replace('\n', ' <NL> ')

# Surround the listed punctuation marks with spaces so each becomes its own token.
punctuations = ['(', ')', ':', ',', '%', '!', '*', '+', '=']
for p in punctuations:
  text = text.replace(p, f' {p} ')

tokens = text.split()

# Show only a short preview: printing the whole token list floods the cell output
# and bloats the saved notebook.
print(f'{len(tokens)} tokens; first 50: {tokens[:50]}')



In [46]:
# Vocabulary: the distinct tokens in sorted order, plus lookup tables in both directions.
vocab = sorted(set(tokens))
vocab_size = len(vocab)

# Index -> token comes straight from the sorted position; token -> index is its inverse.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

print(f"Vocabulary size: {vocab_size}")
print(f"Sample: {vocab[:10]}")

Vocabulary size: 18204
Sample: ['!', '"', '""', '"""', '"""<', '"""Activate', '"""Actually', '"""Add', '"""Adds', '"""Aggregates']


In [47]:
# Turn the token stream into (context window -> next token) training pairs.
seq_length = 10
tokens_indices = [word_to_idx[w] for w in tokens]

# Number of windows that still have a following token to predict.
num_windows = len(tokens_indices) - seq_length

# Each input is `seq_length` consecutive token ids ...
inputs = [tokens_indices[start: start + seq_length] for start in range(num_windows)]
# ... and its target is the id of the token immediately after that window.
targets = [tokens_indices[start + seq_length] for start in range(num_windows)]


In [48]:
import torch

# Convert the Python lists of token ids into int64 tensors (the dtype expected by
# nn.Embedding indices and by CrossEntropyLoss class targets).
inputs, targets = (
  torch.tensor(values, dtype=torch.long) for values in (inputs, targets)
)

print("Training samples:", inputs.size(0))

Training samples: 203065


In [49]:
from torch.utils.data import TensorDataset, DataLoader

# Pair every context window with its target token and serve them in shuffled mini-batches.
# NOTE(review): `dataset` rebinds the name that held the loaded CodeSearchNet splits
# earlier in the notebook, so the earlier `dataset[...]` cells no longer work if they
# are re-run after this one.
batch_size = 32
dataset = TensorDataset(inputs, targets)
loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)

In [72]:
from torch import nn

class CodeLSTMModel(nn.Module):
  """Next-token model: token embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct tokens; sizes the embedding table and the output layer.
    embedding_dim: width of each token embedding.
    hidden_size: width of the LSTM hidden and cell states.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers=1):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, hidden=None):
    """Score the token that follows each input sequence.

    Args:
      x: integer token ids of shape (batch, seq_len).
      hidden: optional ``(h_0, c_0)`` tuple, each of shape
        (num_layers, batch, hidden_size). ``None`` starts from an all-zero state.

    Returns:
      ``(logits, hidden)``: ``logits`` has shape (batch, vocab_size) and is computed
      from the last time step only; ``hidden`` is the LSTM's final ``(h_n, c_n)``.
    """
    embeds = self.embedding(x)

    # nn.LSTM zero-initialises the state itself when given None, using the input's
    # dtype and device. Building the zeros by hand with torch.zeros() pinned them to
    # the default dtype and broke the model after .double() / .half().
    out, hidden = self.lstm(embeds, hidden)

    # Only the output at the final time step is used to predict the next token.
    last_out = out[:, -1, :]
    logits = self.fc(last_out)
    return logits, hidden



# Model hyperparameters.
embedding_dim, hidden_size, num_layers = 100, 128, 1

# Instantiate the network for the vocabulary built above and show its layer summary.
model = CodeLSTMModel(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  hidden_size=hidden_size,
  num_layers=num_layers,
)
print(model)


CodeLSTMModel(
  (embedding): Embedding(18204, 100)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=18204, bias=True)
)


In [73]:
# Use the first CUDA GPU when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")

# Move the model's parameters to the selected device.
model = model.to(device)

In [74]:
import torch.optim as optim

# Training configuration: number of passes over the data, the classification loss
# over vocabulary logits, and the Adam optimizer for all model parameters.
num_epochs = 20

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [77]:
# Train the model for `num_epochs` passes over `loader`, reporting the mean
# per-sample training loss after every epoch.
model.train()
for epoch in range(1, num_epochs+1):
  total_loss = 0.0
  num_samples = 0
  for batch_inputs, batch_targets in loader:
    batch_inputs = batch_inputs.to(device)
    batch_targets = batch_targets.to(device)

    optimizer.zero_grad()
    # The returned hidden state is discarded: every batch starts from a fresh state.
    logits, _ = model(batch_inputs)
    loss = criterion(logits, batch_targets)

    loss.backward()
    optimizer.step()

    # `criterion` returns the mean over the batch, so weight it by the batch size;
    # averaging the batch means directly would over-weight a smaller final batch.
    samples_in_batch = batch_targets.size(0)
    total_loss += loss.item() * samples_in_batch
    num_samples += samples_in_batch

  avg_loss = total_loss / num_samples
  print(f'Epoch {epoch}/{num_epochs} - Training loss: {avg_loss:.4f}')

Epoch 1/20 - Training loss: 4.7727
Epoch 2/20 - Training loss: 3.6668
Epoch 3/20 - Training loss: 3.1329
Epoch 4/20 - Training loss: 2.7423
Epoch 5/20 - Training loss: 2.4222
Epoch 6/20 - Training loss: 2.1515
Epoch 7/20 - Training loss: 1.9194
Epoch 8/20 - Training loss: 1.7164
Epoch 9/20 - Training loss: 1.5387
Epoch 10/20 - Training loss: 1.3822
Epoch 11/20 - Training loss: 1.2465
Epoch 12/20 - Training loss: 1.1273
Epoch 13/20 - Training loss: 1.0251
Epoch 14/20 - Training loss: 0.9369
Epoch 15/20 - Training loss: 0.8557
Epoch 16/20 - Training loss: 0.7902
Epoch 17/20 - Training loss: 0.7280
Epoch 18/20 - Training loss: 0.6773
Epoch 19/20 - Training loss: 0.6285
Epoch 20/20 - Training loss: 0.5892
