In [None]:
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a secret in the notebook.
# The token is taken from the HF_TOKEN environment variable; when that variable is
# unset or empty, None is passed so login() falls back to asking for the token.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Load dataset
def load_data(file_path):
    """Read a tab-separated file and return its pseudocode/code pairs.

    Args:
        file_path: Path to a TSV file with a header row that contains at
            least the columns ``text`` and ``code``.

    Returns:
        A tuple ``(texts, codes)`` of two equally long lists holding the
        ``text`` and ``code`` values of every usable row, in file order.
    """
    df = pd.read_csv(file_path, sep='\t')
    # Only a missing 'text' or 'code' makes a row unusable; a gap in any other
    # column must not discard an otherwise valid pair.
    pairs = df.dropna(subset=['text', 'code'])
    return pairs['text'].tolist(), pairs['code'].tolist()

# Define dataset class
class PseudoCodeDataset(Dataset):
    """Map-style dataset of tokenized (source text, target text) pairs.

    Each item is tokenized on access, padded/truncated to ``max_length`` and
    returned as a dict with ``input_ids``, ``attention_mask`` and ``labels``.

    Args:
        inputs: Sequence of source strings.
        targets: Sequence of target strings, aligned with ``inputs`` by index.
        tokenizer: Callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors of shape
            ``(1, max_length)``. Its ``pad_token_id`` attribute is used to
            mask padding in the labels.
        max_length: Fixed token length of every encoded sequence.
    """

    # Label value the loss function skips (the cross-entropy ignore index).
    IGNORE_INDEX = -100

    def __init__(self, inputs, targets, tokenizer, max_length=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.inputs)

    def _encode(self, text):
        """Tokenize one string to a fixed-length ``(1, max_length)`` encoding."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the encoded pair at ``idx`` as 1-D tensors of length ``max_length``."""
        input_encoding = self._encode(self.inputs[idx])
        target_encoding = self._encode(self.targets[idx])

        # squeeze(0) removes only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_length is 1.
        labels = target_encoding['input_ids'].squeeze(0)

        # Padding positions must not contribute to the loss. Left as pad ids,
        # the model is rewarded for predicting padding, which dilutes the loss.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            labels = labels.masked_fill(labels == pad_id, self.IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels
        }

# Pretrained checkpoint shared by the tokenizer and the seq2seq model
checkpoint = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(checkpoint, legacy=False)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

# Read the training pairs and wrap them in the tokenizing dataset
train_inputs, train_targets = load_data("/content/spoc-train-train.tsv")
train_dataset = PseudoCodeDataset(
    inputs=train_inputs,
    targets=train_targets,
    tokenizer=tokenizer,
)

# Training arguments for faster training
training_args = TrainingArguments(
    output_dir="pseudo_to_cpp_model",
    per_device_train_batch_size=32,  # Increased batch size for efficiency
    num_train_epochs=1,  # Reduce epochs to save time
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # this cell fail on a CPU-only runtime, so enable it only when a GPU exists.
    fp16=torch.cuda.is_available(),
    save_total_limit=2,  # Keep at most two checkpoints on disk
    save_steps=500,  # Checkpoint interval, in optimizer steps
    logging_steps=100,  # Loss logging interval, in optimizer steps
    report_to="none",  # Disable logging to external services
    optim="adamw_torch"
)

# Wire the model, the training arguments and the training data into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Run the training loop
trainer.train()

# Write the model and the tokenizer files into the same directory
save_dir = "pseudo_to_cpp_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Step,Training Loss
100,3.423
200,0.3432
300,0.1842
400,0.1295
500,0.1064
600,0.0932
700,0.0875
800,0.0799
900,0.0719
1000,0.0667


Step,Training Loss
100,3.423
200,0.3432
300,0.1842
400,0.1295
500,0.1064
600,0.0932
700,0.0875
800,0.0799
900,0.0719
1000,0.0667


('pseudo_to_cpp_model/tokenizer_config.json',
 'pseudo_to_cpp_model/special_tokens_map.json',
 'pseudo_to_cpp_model/spiece.model',
 'pseudo_to_cpp_model/added_tokens.json')

In [None]:
# Read the test split and wrap it in the same tokenizing dataset as the training data
test_path = "/content/spoc-testp.tsv"
test_inputs, test_targets = load_data(test_path)
test_dataset = PseudoCodeDataset(
    inputs=test_inputs,
    targets=test_targets,
    tokenizer=tokenizer,
)


In [None]:
# Score the trained model on the test dataset and report the resulting loss
results = trainer.evaluate(eval_dataset=test_dataset)
test_loss = results['eval_loss']
print("Test Loss: " + str(test_loss))


Test Loss: 0.030760500580072403


In [None]:
# Interactive spot check: translate the first `num` test pseudocode lines and print them.
model.eval()
print("Enter number of predications you want to generate :")
# Read the requested count from the user. Passing int(num) to input() would
# only use 0 as the prompt text and discard whatever was typed.
num = int(input())
# Clamp to the available samples so the loop never indexes past the test set;
# a negative request produces no predictions.
num = max(0, min(num, len(test_inputs)))
for i in range(num):
    input_text = test_inputs[i]
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)

    # Without an explicit limit, generate() stops after a short default length
    # and cuts longer statements off mid-line. 128 mirrors the default
    # max_length used when the training targets were tokenized.
    output_ids = model.generate(input_ids, max_new_tokens=128)
    predicted_code = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"📝 Pseudocode: {input_text}")
    print(f"✅ Predicted Code: {predicted_code}\n")


Enter number of predications you want to generate :
020
📝 Pseudocode: declare string variable s
✅ Predicted Code: string s;

📝 Pseudocode: declare short int sz
✅ Predicted Code: short int sz;

📝 Pseudocode: declare boolean called flag = false
✅ Predicted Code: bool flag = false;

📝 Pseudocode: read s
✅ Predicted Code: cin >> s;

📝 Pseudocode: for integer i = length of s - 1 to 0 inclusive counting down
✅ Predicted Code: for (int i = s.length() - 1; i

📝 Pseudocode: if s[i] != '/'
✅ Predicted Code: if (s[i]!= '/')

📝 Pseudocode: sz = i
✅ Predicted Code: sz = i;

📝 Pseudocode: break the loop
✅ Predicted Code: break;

📝 Pseudocode: for i = 0 to sz inclusive
✅ Predicted Code: for (int i = 0; i = sz;

📝 Pseudocode: if flag = false and s[i] = '/'
✅ Predicted Code: if (flag == false && s[i] == '

📝 Pseudocode: set flag to true
✅ Predicted Code: flag = true;

📝 Pseudocode: print s[i]
✅ Predicted Code: cout  s[i]  endl;

📝 Pseudocode: else if s[i] != '/'
✅ Predicted Code: else if (s[i]!= '/')

