In [3]:
import torch
import torch.nn as nn
import sentencepiece as spm
import pandas as pd

# Load Dataset & Train BPE Tokenizer
df = pd.read_csv("spoc-train.csv").dropna()

cpp_file = "cpp.txt"
pseudo_file = "pseudocode.txt"
df["code"].to_csv(cpp_file, index=False, header=False)
df["text"].to_csv(pseudo_file, index=False, header=False)

spm.SentencePieceTrainer.train(input=f"{cpp_file},{pseudo_file}", model_prefix="bpe", vocab_size=8000)

# Load Trained Tokenizer
sp = spm.SentencePieceProcessor()
sp.load("bpe.model")

# Define Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=8, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.Transformer(d_model=d_model, nhead=nhead, num_encoder_layers=num_layers, num_decoder_layers=num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).permute(1, 0, 2)
        tgt = self.embedding(tgt).permute(1, 0, 2)
        output = self.transformer(src, tgt)
        return self.fc_out(output).permute(1, 0, 2)

# Load Pretrained Transformer Model
vocab_size = 16000
device = "cuda" if torch.cuda.is_available() else "cpu"

model = TransformerModel(vocab_size).to(device)
model.load_state_dict(torch.load("transformer_model.pth", map_location=device))
model.eval()

# Inference Function
def generate(model, tokenizer, code):
    with torch.no_grad():
        code_ids = tokenizer.encode(code, out_type=int)
        code_tensor = torch.tensor(code_ids).unsqueeze(0).to(device)
        output = model(code_tensor, code_tensor)
        predicted_ids = torch.argmax(output, dim=-1).squeeze().tolist()
        return tokenizer.decode(predicted_ids)

# Run Inference
sample_code = """
        int x;
"""
predicted_pseudo = generate(model, sp, sample_code)
print("Generated Pseudocode:\n", predicted_pseudo)


  model.load_state_dict(torch.load("transformer_model.pth", map_location=device))


Generated Pseudocode:
 x; x


In [18]:
# Run Inference
sample_code = """
        char x, y,z= 5
"""
predicted_pseudo = generate(model, sp, sample_code)
print("Generated Pseudocode:\n", predicted_pseudo)


Generated Pseudocode:
 x, y= y, 5 char
