In [1]:
import pandas as pd
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.optim as optim
import os
import math
from torch.utils.data import Dataset, DataLoader

# Enable CUDA debugging for better error tracking
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


In [4]:
# Load datasets
train_path = "/content/spoc-train-train.tsv"
val_path = "/content/spoc-train-eval.tsv"
test_path = "/content/spoc-train-test.tsv"

train_df = pd.read_csv(train_path, sep="\t")
val_df = pd.read_csv(val_path, sep="\t")
test_df = pd.read_csv(test_path, sep="\t")

# Drop rows where pseudocode ('text') is missing
train_df = train_df.dropna(subset=["text"])
val_df = val_df.dropna(subset=["text"])
test_df = test_df.dropna(subset=["text"])

# Combine lines of pseudocode and code per problem and submission
def group_data(df):
    return df.groupby(["probid", "subid"]).agg({
        "text": lambda x: "\n".join(x),
        "code": lambda x: "\n".join(x)
    }).reset_index()

train_df = group_data(train_df)
val_df = group_data(val_df)
test_df = group_data(test_df)

print("Preprocessing complete. Data loaded and structured.")


Preprocessing complete. Data loaded and structured.


In [5]:
#Tokenizing
# Prepare training data for tokenizer
all_text = list(train_df["text"]) + list(train_df["code"])

# Train SentencePiece tokenizer directly from in-memory data
spm.SentencePieceTrainer.train(
    sentence_iterator=iter(all_text),
    model_prefix="/content/spoc_tokenizer",
    vocab_size=24000,
    character_coverage=1.0,
    model_type="bpe"
)

# Load trained tokenizer
sp = spm.SentencePieceProcessor(model_file="/content/spoc_tokenizer.model")

# Special tokens
sos_token = sp.piece_to_id("<s>")
eos_token = sp.piece_to_id("</s>")
pad_token = 23999  # Set padding token to 0 or another valid index

# Tokenize datasets
def tokenize_data(df, sp):
    df["text_tokenized"] = df["text"].apply(lambda x: sp.encode(x, out_type=int))
    df["code_tokenized"] = df["code"].apply(lambda x: sp.encode(x, out_type=int))
    return df

train_df = tokenize_data(train_df, sp)
val_df = tokenize_data(val_df, sp)
test_df = tokenize_data(test_df, sp)


In [6]:
# Replace invalid tokens (-1) with the padding token
def replace_invalid_tokens(sequences, pad_token):
    return [[pad_token if token == -1 else token for token in seq] for seq in sequences]

train_df["text_tokenized"] = replace_invalid_tokens(train_df["text_tokenized"], pad_token)
train_df["code_tokenized"] = replace_invalid_tokens(train_df["code_tokenized"], pad_token)
val_df["text_tokenized"] = replace_invalid_tokens(val_df["text_tokenized"], pad_token)
val_df["code_tokenized"] = replace_invalid_tokens(val_df["code_tokenized"], pad_token)
test_df["text_tokenized"] = replace_invalid_tokens(test_df["text_tokenized"], pad_token)
test_df["code_tokenized"] = replace_invalid_tokens(test_df["code_tokenized"], pad_token)

# Pad sequences to max length
def pad_sequences(sequences, max_length, pad_value):
    return [seq[:max_length] + [pad_value] * max(0, max_length - len(seq)) for seq in sequences]

max_length = 256  # Define max sequence length
train_df["text_tokenized"] = pad_sequences(train_df["text_tokenized"], max_length, pad_token)
train_df["code_tokenized"] = pad_sequences(train_df["code_tokenized"], max_length, pad_token)
val_df["text_tokenized"] = pad_sequences(val_df["text_tokenized"], max_length, pad_token)
val_df["code_tokenized"] = pad_sequences(val_df["code_tokenized"], max_length, pad_token)
test_df["text_tokenized"] = pad_sequences(test_df["text_tokenized"], max_length, pad_token)
test_df["code_tokenized"] = pad_sequences(test_df["code_tokenized"], max_length, pad_token)

# PyTorch Dataset class
class PseudocodeDataset(Dataset):
    def __init__(self, df):
        self.inputs = torch.tensor(df["text_tokenized"].tolist(), dtype=torch.long)
        self.targets = torch.tensor(df["code_tokenized"].tolist(), dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

# Create DataLoaders
batch_size = 4  # Reduce batch size
train_dataset = PseudocodeDataset(train_df)
val_dataset = PseudocodeDataset(val_df)
test_dataset = PseudocodeDataset(test_df)



train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define vocab_size
vocab_size = 24000
# Function to check for invalid indices and NaN/Inf values
def check_target_range(tgt):
    print("Target min index:", tgt.min().item())
    print("Target max index:", tgt.max().item())
    print("Vocab size:", vocab_size)
    print("Padding token index:", pad_token)

def check_for_nan_inf(tensor):
    if torch.isnan(tensor).any():
        print("NaN values found!")
    if torch.isinf(tensor).any():
        print("Inf values found!")

# Check the range of token indices in your dataset
for src, tgt in train_loader:
    check_target_range(tgt)
    check_for_nan_inf(src)
    check_for_nan_inf(tgt)
    break

Target min index: 3
Target max index: 23999
Vocab size: 24000
Padding token index: 23999


In [7]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [9]:
# Transformer Model
class TransformerSeq2Seq(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=8, dim_feedforward=4096, dropout=0.1):
        super(TransformerSeq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)  # Add positional encoding
        self.dropout = nn.Dropout(dropout)  # Add dropout
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src = self.embedding(src).permute(1, 0, 2)
        src = self.pos_encoder(src)  # Apply positional encoding
        src = self.dropout(src)  # Apply dropout
        tgt = self.embedding(tgt).permute(1, 0, 2)
        tgt = self.pos_encoder(tgt)  # Apply positional encoding
        tgt = self.dropout(tgt)  # Apply dropout
        output = self.transformer(src, tgt)
        return self.fc_out(output.permute(1, 0, 2))

# Model Initialization
vocab_size = 24000
model = TransformerSeq2Seq(vocab_size).to(device)

# Loss and Optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)
# Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for src, tgt in train_loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        try:
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        except RuntimeError as e:
            print("Error during training:", e)
            print("Skipping batch...")
            continue
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

    # Validation Loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src, tgt in val_loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, tgt[:, :-1])
            loss = criterion(output.reshape(-1, vocab_size), tgt[:, 1:].reshape(-1))
            val_loss += loss.item()
    print(f"Validation Loss: {val_loss / len(val_loader)}")

# Save trained model
model_path = "/content/transformer_seq2seq.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved at {model_path}")

print("Model training complete.")




Epoch 1, Loss: 3.0628425938482504
Validation Loss: 2.095234539456472
Epoch 2, Loss: 2.112791581256451
Validation Loss: 1.8497987571590675
Epoch 3, Loss: 1.8977895653098924
Validation Loss: 1.7664432168720725
Epoch 4, Loss: 1.7670134185793331
Validation Loss: 1.7060394243684833
Epoch 5, Loss: 1.6729491220258492
Validation Loss: 1.6683010728416328
Epoch 6, Loss: 1.596191001798774
Validation Loss: 1.6618177847353046
Epoch 7, Loss: 1.5361280166217637
Validation Loss: 1.6475590344912516
Epoch 8, Loss: 1.488094523538739
Validation Loss: 1.6356678132168547
Epoch 9, Loss: 1.443554959803468
Validation Loss: 1.6355549616371086
Epoch 10, Loss: 1.4027465101664964
Validation Loss: 1.634308675924937
Model saved at /content/transformer_seq2seq.pth
Model training complete.
