<a href="https://colab.research.google.com/github/AlishbaNazir/Computer-Network-Project/blob/main/pToCpp_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch torchvision torchaudio
!pip install sentencepiece
!pip install transformers
!pip install datasets
!pip install streamlit


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from google.colab import drive
drive.mount('/content/drive')

data upload

In [3]:
import pandas as pd

from google.colab import files
uploaded = files.upload()


df = pd.read_csv("spoc-train.tsv", sep="\t")

df = df[["text", "code"]]

# Display sample data
print(df.head())


Saving spoc-train.tsv to spoc-train.tsv
                                              text  \
0                in the function gcd(a,b=integers)   
1  if b=1 return a, else call function gcd(b, a%b)   
2                                              NaN   
3                                              NaN   
4               n , nn, ans = integers with ans =0   

                             code  
0         int gcd(int a, int b) {  
1  return !b ? a : gcd(b, a % b);  
2                               }  
3                    int main() {  
4             int n, nn, ans = 0;  


preprocesss

In [4]:
import pandas as pd

# Load the dataset (assuming it's in Colab's local storage)
df = pd.read_csv("spoc-train.tsv", sep="\t").dropna(subset=['text', 'code'])

# Group by 'subid' to reconstruct full pseudocode-C++ pairs
grouped_data = df.groupby("subid").agg({"text": " ".join, "code": " ".join}).reset_index()

# Display some examples
print(grouped_data.head())


     subid                                               text  \
0   795859  function get_ref with long long argument a tha...   
1  1497213  let x, y, z, t, k, m, a, b, c and n be long lo...   
2  1608000  create integer arrays p of size 101 within an ...   
3  1646095  a = integer array of size 100001 n = integer r...   
4  1646579  n, i, a, ans, d = integers with ans = 0, a, d ...   

                                                code  
0  long long get_ref(long long a) { long long ans...  
1  long long x = 1, y = 1, z = 1, t, k, m, a, b, ...  
2  int p[101][101], ai[1000], bi[1000]; unsigned ...  
3  int a[100001]; int n; cin >> n; for (int i = 1...  
4  int n, i, a[1111111], ans = 0, d[1111111]; cin...  


tokenization

In [27]:
import sentencepiece as spm

# Save the training data for tokenization
with open("pseudocode.txt", "w") as f:
    f.write("\n".join(grouped_data["text"].tolist()))

with open("cpp_code.txt", "w") as f:
    f.write("\n".join(grouped_data["code"].tolist()))

# Train SentencePiece tokenizer for both pseudocode and C++ code
# Train SentencePiece tokenizer for both pseudocode and C++ code
spm.SentencePieceTrainer.train(input="pseudocode.txt", model_prefix="pseudocode", vocab_size=6000)
spm.SentencePieceTrainer.train(input="cpp_code.txt", model_prefix="cpp_code", vocab_size=6000)



# Load trained tokenizers
pseudocode_tokenizer = spm.SentencePieceProcessor(model_file="pseudocode.model")
cpp_tokenizer = spm.SentencePieceProcessor(model_file="cpp_code.model")

# Test tokenization
print(pseudocode_tokenizer.encode("function gcd(a,b) return a if b=0 else gcd(b, a%b)", out_type=str))
print(cpp_tokenizer.encode("int gcd(int a, int b) { return b ? gcd(b, a % b) : a; }", out_type=str))


['▁function', '▁gcd', '(', 'a', ',', 'b', ')', '▁return', '▁a', '▁if', '▁b', '=0', '▁else', '▁gcd', '(', 'b', ',', '▁a', '%', 'b', ')']
['▁int', '▁gcd', '(', 'int', '▁a', ',', '▁int', '▁b', ')', '▁{', '▁return', '▁b', '▁', '?', '▁gcd', '(', 'b', ',', '▁a', '▁%', '▁b', ')', '▁:', '▁a', ';', '▁}']


data loader

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader

class PseudoCodeDataset(Dataset):
    def __init__(self, data, pseudo_tokenizer, cpp_tokenizer, max_length=128):
        self.pseudo_tokenizer = pseudo_tokenizer
        self.cpp_tokenizer = cpp_tokenizer
        self.max_length = max_length
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pseudo = self.data.iloc[idx]["text"]
        cpp = self.data.iloc[idx]["code"]

        pseudo_ids = self.pseudo_tokenizer.encode(pseudo, out_type=int)
        cpp_ids = self.cpp_tokenizer.encode(cpp, out_type=int)

        # Padding
        pseudo_ids = pseudo_ids[:self.max_length] + [0] * (self.max_length - len(pseudo_ids))
        cpp_ids = cpp_ids[:self.max_length] + [0] * (self.max_length - len(cpp_ids))

        return torch.tensor(pseudo_ids), torch.tensor(cpp_ids)

# Create dataset
dataset = PseudoCodeDataset(grouped_data, pseudocode_tokenizer, cpp_tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Test the DataLoader
for pseudo_batch, cpp_batch in dataloader:
    print("Pseudocode Batch:", pseudo_batch.shape)
    print("C++ Code Batch:", cpp_batch.shape)
    break


Pseudocode Batch: torch.Size([32, 128])
C++ Code Batch: torch.Size([32, 128])


transformer

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=128):  # Reduced from 5000 to 128
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

class TransformerSeq2Seq(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=6):
        super(TransformerSeq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads)
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads)
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)
        self.output_layer = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src_emb = self.embedding(src)
        tgt_emb = self.embedding(tgt)
        memory = self.encoder(src_emb)
        decoder_output = self.decoder(tgt_emb, memory)
        logits = self.output_layer(decoder_output)
        return logits

# Define model
model = TransformerSeq2Seq(vocab_size=8000, d_model=512).to(device)

# Updated Loss & Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0005)  # Increased LR
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)




In [None]:
for i, (pseudo_batch, cpp_batch) in enumerate(dataloader):
    pseudo_batch, cpp_batch = pseudo_batch.to(device), cpp_batch.to(device)  # Move to GPU

    optimizer.zero_grad()
    output = model(pseudo_batch, cpp_batch)

    loss = criterion(output.view(-1, 8000), cpp_batch.view(-1))  # Ensure same device
    loss.backward()
    optimizer.step()
    scheduler.step()  # Reduce learning rate every few epochs to avoid overfitting



train model

In [8]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
num_epochs = 50  # Start with 50 epochs
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for src, tgt in dataloader:  # Load data batch-by-batch
        src, tgt = src.to(device), tgt.to(device)

        optimizer.zero_grad()

        output = model(src, tgt)  # Forward pass
        output = output.view(-1, output.shape[-1])  # (batch_size * seq_length, vocab_size)
        tgt = tgt.view(-1)  # (batch_size * seq_length)

        loss = criterion(output, tgt)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update weights

        total_loss += loss.item()

    scheduler.step()  # Adjust learning rate

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")


Epoch [1/50], Loss: 4.9543
Epoch [2/50], Loss: 4.8890
Epoch [3/50], Loss: 4.8851
Epoch [4/50], Loss: 4.8815
Epoch [5/50], Loss: 4.8792
Epoch [6/50], Loss: 4.8720
Epoch [7/50], Loss: 4.8714
Epoch [8/50], Loss: 4.8712
Epoch [9/50], Loss: 4.8710
Epoch [10/50], Loss: 4.8706
Epoch [11/50], Loss: 4.8674
Epoch [12/50], Loss: 4.8675
Epoch [13/50], Loss: 4.8668
Epoch [14/50], Loss: 4.8671
Epoch [15/50], Loss: 4.8670
Epoch [16/50], Loss: 4.8652
Epoch [17/50], Loss: 4.8656
Epoch [18/50], Loss: 4.8653
Epoch [19/50], Loss: 4.8651
Epoch [20/50], Loss: 4.8651
Epoch [21/50], Loss: 4.8645
Epoch [22/50], Loss: 4.8640
Epoch [23/50], Loss: 4.8640
Epoch [24/50], Loss: 4.8639
Epoch [25/50], Loss: 4.8639
Epoch [26/50], Loss: 4.8633
Epoch [27/50], Loss: 4.8634
Epoch [28/50], Loss: 4.8631
Epoch [29/50], Loss: 4.8630
Epoch [30/50], Loss: 4.8631
Epoch [31/50], Loss: 4.8629
Epoch [32/50], Loss: 4.8632
Epoch [33/50], Loss: 4.8633
Epoch [34/50], Loss: 4.8628
Epoch [35/50], Loss: 4.8629
Epoch [36/50], Loss: 4.8631
E

dataset/ data loader

In [33]:
from torch.utils.data import Dataset, DataLoader

class CodeDataset(Dataset):
    def __init__(self, pseudocode_texts, cpp_texts, tokenizer, max_length=128):
        self.pseudocode_texts = pseudocode_texts
        self.cpp_texts = cpp_texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pseudocode_texts)

    def __getitem__(self, idx):
        src = self.tokenizer.encode(self.pseudocode_texts[idx], out_type=int)
        tgt = self.tokenizer.encode(self.cpp_texts[idx], out_type=int)

        # Pad sequences
        src = src[:self.max_length] + [0] * (self.max_length - len(src))
        tgt = tgt[:self.max_length] + [0] * (self.max_length - len(tgt))

        return torch.tensor(src), torch.tensor(tgt)


In [38]:
# Example pseudo-code and C++ pairs (Replace with real data)
pseudocode_samples = ["function gcd(a,b) return a if b=0 else gcd(b, a%b)"]
cpp_samples = ["int gcd(int a, int b) { return b ? gcd(b, a % b) : a; }"]

# Load tokenizers
pseudocode_tokenizer = spm.SentencePieceProcessor(model_file="pseudocode.model")
cpp_tokenizer = spm.SentencePieceProcessor(model_file="cpp_code.model")

# Create dataset
dataset = CodeDataset(pseudocode_samples, cpp_samples, pseudocode_tokenizer)

# Create DataLoader
def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    src_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in src_batch], batch_first=True, padding_value=0)
    tgt_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in tgt_batch], batch_first=True, padding_value=0)

    return src_batch.to(device), tgt_batch.to(device)

train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)



In [39]:
for batch in train_dataloader:
    src, tgt = batch
    print("Source Shape:", src.shape)
    print("Target Shape:", tgt.shape)
    break  # Only print one batch


Source Shape: torch.Size([1, 128])
Target Shape: torch.Size([1, 128])


  src_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in src_batch], batch_first=True, padding_value=0)
  tgt_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in tgt_batch], batch_first=True, padding_value=0)


In [40]:
import torch
import torch.nn.functional as F

def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_dataloader:
            src, tgt = batch
            src, tgt = src.to(device), tgt.to(device)

            optimizer.zero_grad()
            output = model(src, tgt[:, :-1])  # Shift target left for teacher forcing
            loss = criterion(output.view(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))  # Ignore <START>

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")


In [41]:
num_epochs = 50
train_model(model, train_dataloader, val_dataloader, optimizer, criterion, num_epochs)


  src_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in src_batch], batch_first=True, padding_value=0)
  tgt_batch = torch.nn.utils.rnn.pad_sequence([torch.tensor(x) for x in tgt_batch], batch_first=True, padding_value=0)


RuntimeError: shape '[1, 1016, 64]' is invalid for input of size 65536

In [9]:
torch.save(model.state_dict(), "transformer_model.pth")
print("Model saved successfully!")


Model saved successfully!


In [10]:
# Load trained model
model.load_state_dict(torch.load("transformer_model.pth", map_location=device))
model.eval()  # Set to evaluation mode
print("Model loaded successfully!")


Model loaded successfully!


  model.load_state_dict(torch.load("transformer_model.pth", map_location=device))


In [11]:
print("Model vocab size:", 8000)  # Should match training vocab size
print("Tokenizer vocab size:", cpp_tokenizer.get_piece_size())  # Check tokenizer vocab size


Model vocab size: 8000
Tokenizer vocab size: 6000


test token

In [23]:
test_sentence = "function gcd(a,b) return a if b=0 else gcd(b, a%b)"
print("Tokenized:", pseudocode_tokenizer.encode(test_sentence, out_type=int))
print("Decoded:", pseudocode_tokenizer.decode(pseudocode_tokenizer.encode(test_sentence, out_type=int)))


Tokenized: [102, 217, 84, 81, 5, 122, 55, 68, 7, 9, 25, 109, 30, 217, 84, 122, 5, 7, 1145, 122, 55]
Decoded: function gcd(a,b) return a if b=0 else gcd(b, a%b)


testing

In [28]:
def beam_search_decode(model, src, src_mask, tokenizer, max_length=100, beam_width=5):
    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        src_emb = model.embedding(src)
        memory = model.encoder(src_emb)

        start_token = tokenizer.piece_to_id('<s>')  # Start token
        end_token = tokenizer.piece_to_id('</s>')  # End token

        sequences = [[start_token]]
        scores = [0]

        for _ in range(max_length):
            all_candidates = []
            for i, seq in enumerate(sequences):
                if seq[-1] == end_token:
                    all_candidates.append((seq, scores[i]))  # End sequence
                    continue

                seq_tensor = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
                tgt_emb = model.embedding(seq_tensor)
                output = model.decoder(tgt_emb, memory)

                logits = model.output_layer(output[:, -1, :])
                probs = torch.nn.functional.log_softmax(logits, dim=-1)

                top_k_probs, top_k_indices = probs.topk(beam_width)

                for j in range(beam_width):
                    new_seq = seq + [top_k_indices[0, j].item()]
                    new_score = scores[i] + top_k_probs[0, j].item()
                    all_candidates.append((new_seq, new_score))

            sequences = sorted(all_candidates, key=lambda x: x[1], reverse=True)[:beam_width]

        best_seq = sequences[0][0]
        decoded_text = tokenizer.decode(best_seq)

        return decoded_text.replace('<s>', '').replace('</s>', '').strip()




In [29]:
src_sentence = "function gcd(a,b) return a if b=0 else gcd(b, a%b)"
src_tokens = pseudocode_tokenizer.encode(src_sentence)  # Convert to token IDs
src_tensor = torch.tensor([src_tokens], dtype=torch.long).to(device)

generated_code = beam_search_decode(model, src_tensor, None, cpp_tokenizer)
print("Generated C++ Code:\n", generated_code)



AttributeError: 'TransformerSeq2Seq' object has no attribute 'output_layer'

In [None]:
src_sentence = "function gcd(a,b) return a if b=0 else gcd(b, a%b)"

# Convert pseudocode into tokenized format using SentencePiece
src_tokens = pseudocode_tokenizer.encode(src_sentence, out_type=int)

# Convert to tensor and move to device
src_tensor = torch.tensor([src_tokens], dtype=torch.long).to(device)

# Generate the C++ code using beam search
generated_code = beam_search_decode(model, src_tensor, None, pseudocode_tokenizer, max_length=150, beam_width=5)


print("Generated C++ Code:\n", generated_code)


In [13]:
test_pseudocode = "function gcd(a,b) return a if b=0 else gcd(b, a%b)"
generated_cpp = generate_cpp_code(test_pseudocode, model, pseudocode_tokenizer, cpp_tokenizer)
print("Generated C++ Code:\n", generated_cpp)


Generated C++ Code:
 = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
