In [None]:
import pandas as pd
from pathlib import Path

# Define file paths (adjust these to your actual paths if needed)
base_path = Path("./")  # Change to your dataset directory if not current dir
train_file = base_path / "spoc-train.tsv"
testp_file = base_path / "spoc-testp.tsv"
testw_file = base_path / "spoc-testw.tsv"
split_dir = base_path / "train/split"  # For split details, if available

# 1. Row Counts
def get_row_counts():
    train_df = pd.read_csv(train_file, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'])
    testp_df = pd.read_csv(testp_file, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'])
    testw_df = pd.read_csv(testw_file, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'])

    print(f"Rows in spoc-train.tsv: {len(train_df)}")
    print(f"Rows in spoc-testp.tsv: {len(testp_df)}")
    print(f"Rows in spoc-testw.tsv: {len(testw_df)}")
    return train_df, testp_df, testw_df

# 2. Unique Programs in spoc-train.tsv
def get_unique_programs(train_df):
    unique_programs = train_df.groupby(['probid', 'subid']).ngroups
    print(f"Unique probid/subid combinations in spoc-train.tsv: {unique_programs}")

# 3. Split Details (if split directory exists)
def get_split_details():
    if split_dir.exists():
        split_files = list(split_dir.glob("*.tsv"))
        for split_file in split_files:
            split_df = pd.read_csv(split_file, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'])
            print(f"Rows in {split_file.name}: {len(split_df)}")
    else:
        print("Split directory not found. Please provide path or confirm its absence.")

# 4. Sample Lines from spoc-train.tsv
def get_sample_lines(train_df, num_samples=3):
    samples = train_df.sample(n=num_samples)
    print("\nSample rows from spoc-train.tsv:")
    for idx, row in samples.iterrows():
        print(f"Row {idx}: text='{row['text']}', code='{row['code']}', workerid={row['workerid']}, "
              f"probid={row['probid']}, subid={row['subid']}, line={row['line']}, indent={row['indent']}")

# Run everything
print("Extracting dataset info...\n")
train_df, testp_df, testw_df = get_row_counts()
get_unique_programs(train_df)
get_split_details()
get_sample_lines(train_df)


Extracting dataset info...



  train_df = pd.read_csv(train_file, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'])


Rows in spoc-train.tsv: 272989
Rows in spoc-testp.tsv: 52058
Rows in spoc-testw.tsv: 34899
Unique probid/subid combinations in spoc-train.tsv: 13454
Split directory not found. Please provide path or confirm its absence.

Sample rows from spoc-train.tsv:
Row 203791: text='nan', code='}', workerid=54, probid=86A, subid=18134121, line=8, indent=0
Row 250891: text='nan', code='int main() {', workerid=5, probid=630A, subid=48425743, line=0, indent=0
Row 100883: text='ans=1', code='long long ans = 1;', workerid=31, probid=553A, subid=40995002, line=27, indent=2


In [9]:
import pandas as pd
import torch
from collections import Counter
from pathlib import Path
import re

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_path = Path("./")
train_file = base_path / "spoc-train.tsv"

def load_spoc_tsv(file_path):
    df = pd.read_csv(file_path, sep='\t', names=['text', 'code', 'workerid', 'probid', 'subid', 'line', 'indent'],
                     low_memory=False, skiprows=1)
    df['text'] = df['text'].fillna('')
    print(f"Loaded {len(df)} rows from {file_path}")
    return df

def group_into_programs(df):
    programs = []
    grouped = df.groupby(['probid', 'subid'])
    for (probid, subid), group in grouped:
        group = group.sort_values('line')
        pseudo_lines = group['text'].tolist()  # Pseudocode as source
        cpp_lines = group['code'].tolist()     # C++ as target
        indents = group['indent'].astype(int).tolist()
        pseudo_program = '\n'.join(f"{'  ' * indent}{line}" for indent, line in zip(indents, pseudo_lines))
        cpp_program = '\n'.join(f"{'  ' * indent}{line}" for indent, line in zip(indents, cpp_lines))
        programs.append((pseudo_program, cpp_program))  # (src, tgt)
    print(f"Grouped into {len(programs)} full programs")
    return programs

def tokenize(text):
    tokens = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    return tokens

def build_vocab(programs, min_freq=2):
    pseudo_counter = Counter()  # Source
    cpp_counter = Counter()     # Target
    for pseudo, cpp in programs:
        pseudo_counter.update(tokenize(pseudo))
        cpp_counter.update(tokenize(cpp))
    specials = ['<pad>', '<sos>', '<eos>', '<unk>']
    pseudo_vocab = specials + [word for word, freq in pseudo_counter.items() if freq >= min_freq]
    cpp_vocab = specials + [word for word, freq in cpp_counter.items() if freq >= min_freq]
    pseudo2idx = {word: idx for idx, word in enumerate(pseudo_vocab)}
    idx2pseudo = {idx: word for word, idx in pseudo2idx.items()}
    cpp2idx = {word: idx for idx, word in enumerate(cpp_vocab)}
    idx2cpp = {idx: word for word, idx in cpp2idx.items()}
    print(f"Pseudocode vocab size: {len(pseudo_vocab)}, C++ vocab size: {len(cpp_vocab)}")
    return pseudo2idx, idx2pseudo, cpp2idx, idx2cpp

def text_to_tensor(text, vocab, tokenizer, max_len=200):
    tokens = ['<sos>'] + tokenizer(text)[:max_len - 2] + ['<eos>']
    tensor = [vocab.get(token, vocab['<unk>']) for token in tokens]
    return torch.tensor(tensor, dtype=torch.long)

def get_batches(programs, batch_size=32, max_len=200):
    for i in range(0, len(programs), batch_size):
        batch = programs[i:i + batch_size]
        src_batch = [text_to_tensor(pseudo, pseudo2idx, tokenize, max_len) for pseudo, _ in batch]
        tgt_batch = [text_to_tensor(cpp, cpp2idx, tokenize, max_len) for _, cpp in batch]
        src_batch = torch.nn.utils.rnn.pad_sequence(src_batch, padding_value=pseudo2idx['<pad>'], batch_first=True).to(device)
        tgt_batch = torch.nn.utils.rnn.pad_sequence(tgt_batch, padding_value=cpp2idx['<pad>'], batch_first=True).to(device)
        yield src_batch, tgt_batch

train_df = load_spoc_tsv(train_file)
train_programs = group_into_programs(train_df)
pseudo2idx, idx2pseudo, cpp2idx, idx2cpp = build_vocab(train_programs)
torch.save(pseudo2idx, "pseudo2idx.pt")
torch.save(idx2pseudo, "idx2pseudo.pt")
torch.save(cpp2idx, "cpp2idx.pt")
torch.save(idx2cpp, "idx2cpp.pt")
print("Chunk 1 completed: Data preprocessed and vocab saved.")

Loaded 293854 rows from spoc-train.tsv
Grouped into 14548 full programs
Pseudocode vocab size: 6217, C++ vocab size: 5643
Chunk 1 completed: Data preprocessed and vocab saved.


In [10]:
import math
import torch.nn as nn

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=200):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.mha(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        attn1_output = self.mha1(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn1_output))
        attn2_output = self.mha2(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn2_output))
        ff_output = self.ff(x)
        return self.norm3(x + self.dropout(ff_output))

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=8, num_layers=3, d_ff=1024, max_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != pseudo2idx['<pad>']).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != cpp2idx['<pad>']).unsqueeze(1).unsqueeze(3)
        seq_len = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        return self.fc_out(dec_output)

print("Chunk 2 completed: Transformer model defined.")

Chunk 2 completed: Transformer model defined.


In [11]:
import torch.optim as optim

model = Transformer(
    src_vocab_size=len(pseudo2idx),
    tgt_vocab_size=len(cpp2idx),
    d_model=256,
    num_heads=8,
    num_layers=3,
    d_ff=1024,
    max_len=200,
    dropout=0.1
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=cpp2idx['<pad>'])
optimizer = optim.Adam(model.parameters(), lr=0.0001)

def train(model, programs, epochs=35, batch_size=32):  # Increased to 50 epochs
    model.train()
    total_batches = len(programs) // batch_size + (1 if len(programs) % batch_size else 0)
    print(f"Total batches per epoch: {total_batches}")
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (src_batch, tgt_batch) in enumerate(get_batches(programs, batch_size), 1):
            if batch_idx % 50 == 0:
                print(f"Epoch {epoch + 1}, Batch {batch_idx}/{total_batches}")
            optimizer.zero_grad()
            output = model(src_batch, tgt_batch[:, :-1])
            loss = criterion(output.reshape(-1, len(cpp2idx)), tgt_batch[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / total_batches
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")
    torch.save(model.state_dict(), "pseudo2cpp_model.pt")
    print("Model saved to pseudo2cpp_model.pt")

def translate(model, pseudo_program, max_len=200):
    model.eval()
    with torch.no_grad():
        src = text_to_tensor(pseudo_program, pseudo2idx, tokenize, max_len).unsqueeze(0).to(device)
        tgt = torch.tensor([cpp2idx['<sos>']], dtype=torch.long).unsqueeze(0).to(device)
        for _ in range(max_len):
            output = model(src, tgt)
            next_token = output[:, -1, :].argmax(dim=-1).item()
            if next_token == cpp2idx['<eos>']:
                break
            tgt = torch.cat([tgt, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)
        translated = [idx2cpp[idx.item()] for idx in tgt[0] if idx.item() in idx2cpp]
        cpp = ' '.join(translated[1:])
        # Improved post-processing for C++ syntax
        cpp = cpp.replace('int main', '#include <iostream>\nusing namespace std;\n\nint main')
        cpp = re.sub(r'(\bint\b|\bfor\b|\bwhile\b|\bif\b|\belse\b|\{|\})', r'\n\1', cpp)
        cpp = re.sub(r'input (\w+)', r'cin >> \1;', cpp)
        cpp = re.sub(r'mod', '%', cpp)
        cpp = re.sub(r'(\w+) = (\w+)', r'\1 == \2', cpp, count=1)  # Fix first '=' to '==' in 'if'
        cpp = re.sub(r'(\w+) = ', r'\1 = ', cpp)  # Preserve other assignments
        cpp = re.sub(r'to', '<=', cpp)
        cpp = re.sub(r'then', '', cpp)
        lines = cpp.split('\n')
        indented = []
        indent_level = 0
        for line in lines:
            line = line.strip()
            if line:
                if '}' in line:
                    indent_level = max(0, indent_level - 1)
                indented.append('  ' * indent_level + line + (';' if line[-1] not in '{;}' else ''))
                if '{' in line and '}' not in line:
                    indent_level += 1
        return '\n'.join(indented).strip()

print("Starting training...")
train(model, train_programs, epochs=35)  # ~10-20h GPU
print("Training completed.")

test_pseudo = """
declare gcd(a, b)
  if b = 0 then return a
  else return gcd(b, a mod b)
declare main
  declare n, nn, ans = 0
  input n
  for i = 2 to n - 1
    nn = n
    while nn > 0
      ans = ans + (nn mod i)
      nn = nn / i
"""
translated = translate(model, test_pseudo)
print(f"Input Pseudocode:\n{test_pseudo}")
print(f"Translated C++:\n{translated}")

print("Chunk 3 completed: Model trained and saved.")

Starting training...
Total batches per epoch: 455
Epoch 1, Batch 50/455
Epoch 1, Batch 100/455
Epoch 1, Batch 150/455
Epoch 1, Batch 200/455
Epoch 1, Batch 250/455
Epoch 1, Batch 300/455
Epoch 1, Batch 350/455
Epoch 1, Batch 400/455
Epoch 1, Batch 450/455
Epoch 1/35, Loss: 3.3352
Epoch 2, Batch 50/455
Epoch 2, Batch 100/455
Epoch 2, Batch 150/455
Epoch 2, Batch 200/455
Epoch 2, Batch 250/455
Epoch 2, Batch 300/455
Epoch 2, Batch 350/455
Epoch 2, Batch 400/455
Epoch 2, Batch 450/455
Epoch 2/35, Loss: 2.1442
Epoch 3, Batch 50/455
Epoch 3, Batch 100/455
Epoch 3, Batch 150/455
Epoch 3, Batch 200/455
Epoch 3, Batch 250/455
Epoch 3, Batch 300/455
Epoch 3, Batch 350/455
Epoch 3, Batch 400/455
Epoch 3, Batch 450/455
Epoch 3/35, Loss: 1.8764
Epoch 4, Batch 50/455
Epoch 4, Batch 100/455
Epoch 4, Batch 150/455
Epoch 4, Batch 200/455
Epoch 4, Batch 250/455
Epoch 4, Batch 300/455
Epoch 4, Batch 350/455
Epoch 4, Batch 400/455
Epoch 4, Batch 450/455
Epoch 4/35, Loss: 1.7056
Epoch 5, Batch 50/455
Epoc

In [None]:
# Test complex pseudocode lines
test_lines = [
    "in the function gcd(a,b=integers)",
    "if b=1 return a, else call function gcd(b, a%b)",
    "n , nn, ans = integers with ans =0",
    "Read n",
    "for i=2 to n-1 execute",
    "set nn to n",
    "while nn is not equal to 0, set ans to ans + nn%i, and also set nn= nn/i"
]

print("Testing complex pseudocode lines:")
for pseudo in test_lines:
    cpp = translate(model, pseudo)
    print(f"Input: {pseudo}")
    print(f"Translated: {cpp}\n")

Testing complex pseudocode lines:
Input: in the function gcd(a,b=integers)
Translated: void <unk> {

Input: if b=1 return a, else call function gcd(b, a%b)
Translated: return gcd(b, a < b ? gcd(b, a : gcd(b, a - b);

Input: n , nn, ans = integers with ans =0
Translated: int n, ans = 0;

Input: Read n
Translated: cin >> n;

Input: for i=2 to n-1 execute
Translated: for (int i = 2; i <= n - 2; i++) {

Input: set nn to n
Translated: nn = n;

Input: while nn is not equal to 0, set ans to ans + nn%i, and also set nn= nn/i
Translated: while <unk> != 0) { ans = <unk> + <unk> }



In [6]:
pip install gradio

Collecting gradio
  Downloading gradio-5.20.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.2 (from gradio)
  Downloading gradio_client-1.7.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3

In [12]:
import gradio as gr
import torch
import re

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load vocab
pseudo2idx = torch.load("pseudo2idx.pt")
idx2pseudo = torch.load("idx2pseudo.pt")
cpp2idx = torch.load("cpp2idx.pt")
idx2cpp = torch.load("idx2cpp.pt")

def tokenize(text):
    return re.findall(r'\w+|[^\w\s]', text, re.UNICODE)

def text_to_tensor(text, vocab, tokenizer, max_len=200):
    tokens = ['<sos>'] + tokenizer(text)[:max_len - 2] + ['<eos>']
    tensor = [vocab.get(token, vocab['<unk>']) for token in tokens]
    return torch.tensor(tensor, dtype=torch.long)

# Transformer model (from Chunk 2)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=200):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :]

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        return torch.matmul(attn, V)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        Q = self.W_q(Q).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(K).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(V).view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)
        output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        return self.W_o(output)

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.linear2(self.relu(self.linear1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.mha(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super().__init__()
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        attn1_output = self.mha1(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn1_output))
        attn2_output = self.mha2(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn2_output))
        ff_output = self.ff(x)
        return self.norm3(x + self.dropout(ff_output))

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, num_heads=8, num_layers=3, d_ff=1024, max_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != pseudo2idx['<pad>']).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != cpp2idx['<pad>']).unsqueeze(1).unsqueeze(3)
        seq_len = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_len, seq_len), diagonal=1)).bool().to(device)
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.pos_encoding(self.src_embedding(src) * math.sqrt(self.d_model)))
        tgt_embedded = self.dropout(self.pos_encoding(self.tgt_embedding(tgt) * math.sqrt(self.d_model)))
        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)
        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)
        return self.fc_out(dec_output)

# Load model
model = Transformer(src_vocab_size=len(pseudo2idx), tgt_vocab_size=len(cpp2idx)).to(device)
model.load_state_dict(torch.load("pseudo2cpp_model.pt"))
model.eval()

# Translate function
def translate(pseudo_program):
    with torch.no_grad():
        src = text_to_tensor(pseudo_program, pseudo2idx, tokenize).unsqueeze(0).to(device)
        tgt = torch.tensor([cpp2idx['<sos>']], dtype=torch.long).unsqueeze(0).to(device)
        for _ in range(200):
            output = model(src, tgt)
            next_token = output[:, -1, :].argmax(dim=-1).item()
            if next_token == cpp2idx['<eos>']:
                break
            tgt = torch.cat([tgt, torch.tensor([[next_token]], dtype=torch.long).to(device)], dim=1)
        translated = [idx2cpp[idx.item()] for idx in tgt[0] if idx.item() in idx2cpp]
        cpp = ' '.join(translated[1:])
        cpp = re.sub(r'(\bint\b|\bfor\b|\bwhile\b|\bif\b|\belse\b|\{|\})', r'\n\1', cpp)
        lines = cpp.split('\n')
        indented = []
        indent_level = 0
        for line in lines:
            line = line.strip()
            if line:
                if '}' in line:
                    indent_level = max(0, indent_level - 1)
                indented.append('  ' * indent_level + line)
                if '{' in line and '}' not in line:
                    indent_level += 1
        return '\n'.join(indented).strip()

# Gradio interface
interface = gr.Interface(
    fn=translate,
    inputs=gr.Textbox(lines=10, placeholder="Enter pseudocode (e.g., declare gcd(a, b)..."),
    outputs=gr.Textbox(lines=10),
    title="Pseudocode to C++ Translator",
    description="Converts multi-line pseudocode to C++ using a pre-trained Transformer."
)
interface.launch()  # No share=True, for HF Spaces
print("Chunk 4 completed: Gradio deployed with loaded model.")

  pseudo2idx = torch.load("pseudo2idx.pt")
  idx2pseudo = torch.load("idx2pseudo.pt")
  cpp2idx = torch.load("cpp2idx.pt")
  idx2cpp = torch.load("idx2cpp.pt")
  model.load_state_dict(torch.load("pseudo2cpp_model.pt"))


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7766f489a6a2e40f6e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Chunk 4 completed: Gradio deployed with loaded model.
