In [7]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import matplotlib.pyplot as plt

In [8]:
# device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using {device}")

Using mps


In [9]:
# dataset
dataset = load_dataset('csv', data_files={'json': 'data3.csv'})

In [10]:
# Tokenizer, pretrained 
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def tokenize_function(examples):
    code_snippet = examples['code_snippet']
    return tokenizer(code_snippet, padding="max_length", truncation=True, return_tensors="pt")

# Tokenize the dataset
tokenized_datasets = dataset['json'].map(tokenize_function, batched=True)

# tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [11]:
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=10)
model.to(device) # move model to mps or cpu
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    # eval_strategy = "epoch", 
    num_train_epochs=10,          # Number of epochs
    per_device_train_batch_size=8,  # Batch size (adjust based on memory)
    per_device_eval_batch_size=16,  # Batch size for evaluation
    logging_strategy="steps",     # Log training loss every few steps
    logging_steps=50,             # Log every 50 steps
    logging_dir="./logs",         # Auxiliary for tensorboard
    save_strategy="epoch",        # Save model after every epoch
    load_best_model_at_end=True,  # Load best model when finished training
    report_to="tensorboard",             # Report to tensorboard for visualization
    
)

# Split the tokenized dataset into train (80%) and test (20%) sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, shuffle=True)


# Assign train and test datasets
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# train from Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
results = trainer.evaluate()
print(results)

Epoch,Training Loss,Validation Loss
1,No log,0.877914
2,1.688100,0.282122
3,0.473300,0.132105
4,0.137400,0.144223
5,0.072300,0.138844
6,0.094200,0.114797
7,0.045900,0.085547
8,0.049400,0.068472
9,0.049400,0.069048
10,0.030100,0.071957


{'eval_loss': 0.06847164779901505, 'eval_runtime': 1.7396, 'eval_samples_per_second': 50.011, 'eval_steps_per_second': 3.449, 'epoch': 10.0}


### run the following command in terminal for visualization
% tensorboard --logdir ./logs

In [8]:
print("Launch TensorBoard with: tensorboard --logdir ./logs")

Launch TensorBoard with: tensorboard --logdir ./logs


# get attention matrix

In [13]:
code_snippet = "def add(a, b): return a + b"
inputs = tokenizer(code_snippet, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
# Get attention outputs
with torch.no_grad():
    outputs = model(**inputs)
attentions = outputs.attentions  # Attention weights
print(attentions)

None


In [16]:
# Labels for prediction
labels = ["MaxCut", "MIS", "TSP", "Clique", "KColor", "Factor", "ADD", "MUL", "SUB", "VC"]

# Helper: max sequence length for this model
try:
    MAX_LEN = tokenizer.model_max_length
    if MAX_LEN is None or MAX_LEN <= 0 or MAX_LEN > 100000:
        # fall back if tokenizer.model_max_length is something weird like 1000000000000
        MAX_LEN = model.config.max_position_embeddings
except Exception:
    MAX_LEN = 512  # safe default

def predict(code: str):
    # Do NOT send to device yet – first check length
    inputs = tokenizer(code, return_tensors="pt")
    seq_len = inputs["input_ids"].shape[-1]

    if seq_len > MAX_LEN:
        # Raise a clear error; we'll catch it in the loop and report the index
        raise ValueError(
            f"Sequence length {seq_len} exceeds model max length {MAX_LEN}"
        )

    # Now safe to send to device and run the model
    inputs = inputs.to(device)
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)
    max_prob, prediction = torch.max(probabilities, dim=-1)

    return labels[prediction.item()]

# Train/test split (you had train_test_split_1 but used train_test_split below)
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, shuffle=False)
test_dataset = train_test_split['test']

# Run prediction on the test set and calculate accuracy manually
correct = 0
n_too_long = 0
too_long_indices = []

for i in range(len(test_dataset)):
    code = test_dataset[i]['code_snippet']
    true_label = test_dataset[i]['labels']

    try:
        predicted_label = predict(code)
    except ValueError as e:
        # Here we report the index that caused the length problem
        print(f"[LONG SEQUENCE] index={i}, error={e}")
        n_too_long += 1
        too_long_indices.append(i)
        continue  # skip this sample for accuracy

    if predicted_label == labels[true_label]:
        correct += 1

    print(f"Index {i} | True: {labels[true_label]}, Predicted: {predicted_label}")

# Calculate accuracy over the samples that were actually evaluated
evaluated = len(test_dataset) - n_too_long
if evaluated > 0:
    accuracy = correct / evaluated
    print(f"\nManual Accuracy (excluding too-long sequences): {accuracy * 100:.2f}%")
else:
    print("\nNo sequences were evaluated (all were too long).")

print(f"Total too-long sequences: {n_too_long}")
print(f"Indices of too-long sequences: {too_long_indices}")

Index 0 | True: ADD, Predicted: ADD
Index 1 | True: ADD, Predicted: ADD
Index 2 | True: SUB, Predicted: SUB
Index 3 | True: SUB, Predicted: SUB
Index 4 | True: SUB, Predicted: SUB
Index 5 | True: SUB, Predicted: SUB
Index 6 | True: MUL, Predicted: MUL
Index 7 | True: MUL, Predicted: MUL
Index 8 | True: MUL, Predicted: MUL
Index 9 | True: MUL, Predicted: MUL
Index 10 | True: MaxCut, Predicted: MaxCut
Index 11 | True: MIS, Predicted: MIS
Index 12 | True: Clique, Predicted: Clique
Index 13 | True: TSP, Predicted: TSP
Index 14 | True: KColor, Predicted: KColor
Index 15 | True: KColor, Predicted: KColor
Index 16 | True: MIS, Predicted: MIS
Index 17 | True: MaxCut, Predicted: MaxCut
Index 18 | True: KColor, Predicted: KColor
Index 19 | True: ADD, Predicted: ADD
Index 20 | True: ADD, Predicted: ADD
Index 21 | True: ADD, Predicted: ADD
Index 22 | True: ADD, Predicted: ADD
Index 23 | True: SUB, Predicted: SUB
Index 24 | True: SUB, Predicted: SUB
Index 25 | True: SUB, Predicted: SUB
Index 26 | T

In [15]:
from sklearn.metrics import classification_report
from tqdm import tqdm

# Labels
labels = ["MaxCut", "MIS", "TSP", "Clique", "KColor", "Factor", "ADD", "MUL", "SUB", "VC"]

# Prediction function
def predict(code: str):
    inputs = tokenizer(code, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)
    _, prediction = torch.max(probabilities, dim=-1)
    return prediction.item()

# Evaluate on the test set
y_true = []
y_pred = []

for i in tqdm(range(len(tokenized_datasets))):
    code = tokenized_datasets[i]["code_snippet"]
    true_label = tokenized_datasets[i]["labels"]
    pred_label = predict(code)

    y_true.append(true_label)
    y_pred.append(pred_label)

# Print evaluation report
print("\nClassification Report:")
print(classification_report(y_true, y_pred, target_names=labels, digits=3))


100%|██████████| 434/434 [01:05<00:00,  6.66it/s]


Classification Report:
              precision    recall  f1-score   support

      MaxCut      0.944     1.000     0.971        68
         MIS      1.000     0.943     0.971        70
         TSP      1.000     1.000     1.000        44
      Clique      1.000     1.000     1.000        33
      KColor      1.000     1.000     1.000        58
      Factor      1.000     1.000     1.000        31
         ADD      1.000     1.000     1.000        28
         MUL      1.000     1.000     1.000        27
         SUB      1.000     1.000     1.000        28
          VC      1.000     1.000     1.000        47

    accuracy                          0.991       434
   macro avg      0.994     0.994     0.994       434
weighted avg      0.991     0.991     0.991       434






In [9]:
# save trained model
# Save the model and tokenizer locally
# Takes some time to store it...
model_output_dir = "./saved_models_2025_12"
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
print(f"Model and tokenizer saved to {model_output_dir}")

Model and tokenizer saved to ./saved_models


# another code cell, manually train

In [21]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def tokenize_function(examples):
    code_snippet = examples['code_snippet']
    return tokenizer(code_snippet, padding="max_length", truncation=True, return_tensors="pt")

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Specify the fields in the dataset that need to be included
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define DataLoader
train_loader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True)
eval_loader = DataLoader(tokenized_datasets['test'], batch_size=8)

# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=4)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    for batch in train_loader:
        # Move input and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation loop
model.eval()  # Set model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predictions
        _, predicted = torch.max(outputs.logits, dim=-1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# Calculate accuracy
accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: Target 4 is out of bounds.

In [1]:
import networkx as nx

G = nx.Graph()
G.add_edges_from([(2, 3), (3, 4), (4, 1)])  # Add multiple edges at onc

EdgeView([(2, 3), (3, 4), (4, 1)])

  r = _umath_linalg.det(a, signature=signature)
  r = _umath_linalg.det(a, signature=signature)
  plot_bloch_multivector(state).show()
