In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import sentencepiece as spm
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Load Dataset
df = pd.read_csv("spoc-train.csv")
df = df.dropna()

# Save C++ & pseudocode pairs for tokenizer training
cpp_file = "cpp.txt"
pseudo_file = "pseudocode.txt"
df["code"].to_csv(cpp_file, index=False, header=False)
df["text"].to_csv(pseudo_file, index=False, header=False)

# Train BPE Tokenizer with a moderate vocabulary size
spm.SentencePieceTrainer.train(input=f"{cpp_file},{pseudo_file}", model_prefix="bpe", vocab_size=8000)

# Load Tokenizer
sp = spm.SentencePieceProcessor()
sp.load("bpe.model")

# Dataset Class
class CodeDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.data = df
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        cpp = self.data.iloc[idx]["code"]
        pseudo = self.data.iloc[idx]["text"]
        cpp_ids = self.tokenizer.encode(cpp, out_type=int)
        pseudo_ids = self.tokenizer.encode(pseudo, out_type=int)
        return torch.tensor(cpp_ids), torch.tensor(pseudo_ids)

# Create DataLoader
dataset = CodeDataset(df, sp)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: x)

# Transformer Model with moderate capacity
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).permute(1, 0, 2)
        tgt = self.embedding(tgt).permute(1, 0, 2)
        output = self.transformer(src, tgt)
        return self.fc_out(output).permute(1, 0, 2)

# Training Setup
vocab_size = 16000
model = TransformerModel(vocab_size).to("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = ReduceLROnPlateau(optimizer, mode="min", factor=0.5, patience=2, verbose=True)

def train_model(model, dataloader, epochs=8):
    model.train()
    for epoch in range(epochs):
        loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
        epoch_loss = 0
        for batch in loop:
            cpp_batch, pseudo_batch = zip(*batch)
            cpp_batch = nn.utils.rnn.pad_sequence(cpp_batch, batch_first=True).long().to("cuda")
            pseudo_batch = nn.utils.rnn.pad_sequence(pseudo_batch, batch_first=True).long().to("cuda")

            optimizer.zero_grad()
            output = model(cpp_batch, pseudo_batch[:, :-1])
            loss = criterion(output.reshape(-1, vocab_size), pseudo_batch[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        # Adjust learning rate based on epoch loss
        scheduler.step(epoch_loss / len(dataloader))

train_model(model, dataloader)

# Save Model & Tokenizer
torch.save(model.state_dict(), "transformer_model.pth")

# Save the tokenizer model
with open("bpe.model", "wb") as f:
    f.write(sp.serialized_model_proto())

# Testing
def generate(model, tokenizer, code):
    model.eval()
    with torch.no_grad():
        code_ids = tokenizer.encode(code, out_type=int)
        code_tensor = torch.tensor(code_ids).unsqueeze(0).to("cuda")
        output = model(code_tensor, code_tensor)
        predicted_ids = torch.argmax(output, dim=-1).squeeze().tolist()
        return tokenizer.decode(predicted_ids)

# Example: Generate pseudocode from C++ code
sample_code = """
        quicksort(arr, pi + 1, high);
"""
predicted_pseudo = generate(model, sp, sample_code)
print("Generated Pseudocode:\n", predicted_pseudo)

Epoch 1: 100%|██████████| 6758/6758 [05:18<00:00, 21.19it/s, loss=0.612]
Epoch 2: 100%|██████████| 6758/6758 [05:20<00:00, 21.11it/s, loss=0.802]
Epoch 3: 100%|██████████| 6758/6758 [05:18<00:00, 21.24it/s, loss=nan]
Epoch 4: 100%|██████████| 6758/6758 [05:16<00:00, 21.35it/s, loss=4.31]
Epoch 5: 100%|██████████| 6758/6758 [05:17<00:00, 21.31it/s, loss=2.16]
Epoch 6: 100%|██████████| 6758/6758 [05:17<00:00, 21.31it/s, loss=2.14]
Epoch 7: 100%|██████████| 6758/6758 [05:16<00:00, 21.36it/s, loss=nan]
Epoch 8: 100%|██████████| 6758/6758 [05:16<00:00, 21.36it/s, loss=1.32]


Generated Pseudocode:
 ((arri high,, 1, high)(


In [10]:
# Example: Generate pseudocode from C++ code
sample_code = """
        int x;
        if (n >= 5)
        {
          cout"number is greater";
        }
"""
predicted_pseudo = generate(model, sp, sample_code)
print("Generated Pseudocode:\n", predicted_pseudo)

Generated Pseudocode:
  ⁇  is ⁇  (n" 5);" (; is greater than; ⁇ n
