In [2]:
# Install required libraries
!pip install sentencepiece torch tqdm pandas

# Import libraries
import torch
import torch.nn as nn
import torch.optim as optim
import sentencepiece as spm
import pandas as pd
import os
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Enable detailed CUDA error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Load Dataset
df = pd.read_csv("spoc-train.csv")
df = df.dropna()

# Save pseudocode & C++ pairs for tokenizer training
pseudo_file = "pseudocode.txt"
cpp_file = "cpp.txt"
df["text"].to_csv(pseudo_file, index=False, header=False)
df["code"].to_csv(cpp_file, index=False, header=False)

# Train BPE Tokenizer
vocab_size = 10000
spm.SentencePieceTrainer.train(input=f"{pseudo_file},{cpp_file}", model_prefix="bpe", vocab_size=vocab_size)

# Load Tokenizer
sp = spm.SentencePieceProcessor()
sp.load("bpe.model")

# Dataset Class
class CodeDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudo = self.data.iloc[idx]["text"]
        cpp = self.data.iloc[idx]["code"]
        pseudo_ids = self.tokenizer.encode(pseudo, out_type=int)
        cpp_ids = self.tokenizer.encode(cpp, out_type=int)
        return torch.tensor(pseudo_ids), torch.tensor(cpp_ids)

# Create DataLoader
dataset = CodeDataset(df, sp)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: x)

# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=6):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).permute(1, 0, 2)
        tgt = self.embedding(tgt).permute(1, 0, 2)
        output = self.transformer(src, tgt)
        return self.fc_out(output).permute(1, 0, 2)

# Training Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

vocab_size = 10000
model = TransformerModel(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

def train_model(model, dataloader, epochs=3):
    model.train()
    for epoch in range(epochs):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        for batch in loop:
            pseudo_batch, cpp_batch = zip(*batch)
            pseudo_batch = nn.utils.rnn.pad_sequence(pseudo_batch, batch_first=True).long().to(device)
            cpp_batch = nn.utils.rnn.pad_sequence(cpp_batch, batch_first=True).long().to(device)

            optimizer.zero_grad()
            output = model(pseudo_batch, cpp_batch[:, :-1])
            loss = criterion(output.reshape(-1, vocab_size), cpp_batch[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            loop.set_postfix(loss=loss.item())

train_model(model, dataloader)

# Save Model & Tokenizer
torch.save(model.state_dict(), "transformer_model.pth")

# Save the tokenizer model
with open("bpe.model", "wb") as f:
    f.write(sp.serialized_model_proto())

# Testing
def generate(model, tokenizer, pseudo):
    model.eval()
    with torch.no_grad():
        pseudo_ids = tokenizer.encode(pseudo, out_type=int)
        pseudo_tensor = torch.tensor(pseudo_ids).unsqueeze(0).to(device)
        output = model(pseudo_tensor, pseudo_tensor)
        predicted_ids = torch.argmax(output, dim=-1).squeeze().tolist()
        return tokenizer.decode(predicted_ids)

sample_pseudo = "Sort the array using quicksort"
predicted_cpp = generate(model, sp, sample_pseudo)
print("Generated C++ Code:\n", predicted_cpp)

Using device: cuda


Epoch 1: 100%|██████████| 6758/6758 [11:00<00:00, 10.23it/s, loss=0.341]
Epoch 2: 100%|██████████| 6758/6758 [10:53<00:00, 10.34it/s, loss=0.299]
Epoch 3: 100%|██████████| 6758/6758 [10:56<00:00, 10.29it/s, loss=0.135]


Generated C++ Code:
 my  G0; mmendend[12]


In [12]:
sample_pseudo = "create n"
predicted_cpp = generate(model, sp, sample_pseudo)
print("Generated C++ Code:\n", predicted_cpp)

Generated C++ Code:
 nma
