In [133]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import matplotlib.pyplot as plt

In [118]:
# device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using {device}")

Using mps


In [125]:
# dataset
dataset = load_dataset('csv', data_files={'data': 'data.csv'})

Generating data split: 0 examples [00:00, ? examples/s]

In [126]:
# Tokenizer, pretrained 
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def tokenize_function(examples):
    code_snippet = examples['code_snippet']
    return tokenizer(code_snippet, padding="max_length", truncation=True, return_tensors="pt")

# Tokenize the dataset
tokenized_datasets = dataset['data'].map(tokenize_function, batched=True)

# tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



Map:   0%|          | 0/387 [00:00<?, ? examples/s]

In [127]:
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=9)
model.to(device) # move model to mps or cpu
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    num_train_epochs=10,          # Number of epochs
    per_device_train_batch_size=8,  # Batch size (adjust based on memory)
    per_device_eval_batch_size=16,  # Batch size for evaluation
    logging_strategy="steps",     # Log training loss every few steps
    logging_steps=50,             # Log every 50 steps
    logging_dir="./logs",         # Auxiliary for tensorboard
    save_strategy="epoch",        # Save model after every epoch
    load_best_model_at_end=True,  # Load best model when finished training
    report_to="tensorboard",             # Report to tensorboard for visualization
    
)

# Split the tokenized dataset into train (80%) and test (20%) sets
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, shuffle=True)

# Assign train and test datasets
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [128]:
# train from Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()
results = trainer.evaluate()
print(results)

Epoch,Training Loss,Validation Loss
1,No log,0.763739
2,1.489200,0.188712
3,0.208100,0.159397
4,0.102500,0.173352
5,0.102500,0.179891
6,0.063500,0.197007
7,0.061100,0.264439
8,0.041700,0.211927
9,0.018100,0.206464
10,0.018100,0.183931


{'eval_loss': 0.15939690172672272, 'eval_runtime': 2.0602, 'eval_samples_per_second': 37.861, 'eval_steps_per_second': 2.427, 'epoch': 10.0}


### run the following command in terminal for visualization
% tensorboard --logdir ./logs

In [101]:
print("Launch TensorBoard with: tensorboard --logdir ./logs")

Launch TensorBoard with: tensorboard --logdir ./logs


# get attention matrix

In [139]:
code_snippet = "def add(a, b): return a + b"
inputs = tokenizer(code_snippet, return_tensors="pt", padding="max_length", truncation=True).to(model.device)
# Get attention outputs
with torch.no_grad():
    outputs = model(**inputs)
attentions = outputs.attentions  # Attention weights
print(attentions)

(tensor([[[[9.2127e-01, 1.0307e-02, 6.3960e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [5.5693e-01, 1.2368e-01, 4.4172e-02,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [1.5428e-01, 5.0276e-01, 8.2012e-02,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [9.9707e-01, 8.5430e-04, 1.7891e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [9.9707e-01, 8.5430e-04, 1.7891e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [9.9707e-01, 8.5430e-04, 1.7891e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]],

         [[8.1138e-01, 2.0101e-02, 1.6594e-02,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.3425e-02, 7.0841e-02, 1.3837e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [9.9893e-03, 1.3761e-01, 1.1056e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [7.4790e-01, 6.0576e-02, 2.2615e-02,  ..., 0.000

In [129]:
# Labels for prediction
labels = ["MaxCut", "MIS", "TSP", "Clique", "KColor", "Factor","ADD", "MUL", "SUB", "Unknown"]
# Prediction function
def predict(code: str):
    inputs = tokenizer(code, return_tensors="pt").to(device)
    outputs = model(**inputs)
    probabilities = torch.softmax(outputs.logits, dim=-1)  # Get probabilities
    max_prob, prediction = torch.max(probabilities, dim=-1)

    return labels[prediction.item()]


# Run prediction on the test set and calculate accuracy manually for illustration
correct = 0
for i in range(len(eval_dataset)):
    code = eval_dataset[i]['code_snippet']
    true_label = eval_dataset[i]['labels']
    predicted_label = predict(code)
    if predicted_label == labels[true_label]:
        correct += 1
    print(f"True: {labels[true_label]}, Predicted: {predicted_label}")

# Calculate and print accuracy manually
accuracy = correct / len(eval_dataset)
print(f"Manual Accuracy: {accuracy * 100:.2f}%")

True: SUB, Predicted: SUB
True: MIS, Predicted: MaxCut
True: MIS, Predicted: MIS
True: MIS, Predicted: MIS
True: MIS, Predicted: MIS
True: MaxCut, Predicted: MaxCut
True: TSP, Predicted: TSP
True: MUL, Predicted: MUL
True: TSP, Predicted: TSP
True: Clique, Predicted: Clique
True: MaxCut, Predicted: MaxCut
True: MIS, Predicted: MIS
True: MIS, Predicted: MIS
True: MaxCut, Predicted: MaxCut
True: KColor, Predicted: KColor
True: MaxCut, Predicted: MaxCut
True: MIS, Predicted: MIS
True: KColor, Predicted: KColor
True: MaxCut, Predicted: MaxCut
True: KColor, Predicted: KColor
True: Clique, Predicted: Clique
True: MaxCut, Predicted: MaxCut
True: ADD, Predicted: ADD
True: ADD, Predicted: ADD
True: KColor, Predicted: KColor
True: KColor, Predicted: KColor
True: MaxCut, Predicted: MaxCut
True: TSP, Predicted: TSP


Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors


True: Factor, Predicted: Factor
True: Factor, Predicted: Factor
True: KColor, Predicted: KColor
True: KColor, Predicted: KColor
True: MaxCut, Predicted: MaxCut
True: Clique, Predicted: Clique
True: MUL, Predicted: MUL
True: MaxCut, Predicted: MaxCut
True: MaxCut, Predicted: MaxCut
True: MIS, Predicted: MIS
True: Factor, Predicted: Factor
True: ADD, Predicted: ADD
True: MaxCut, Predicted: MaxCut
True: ADD, Predicted: ADD
True: Clique, Predicted: Clique
True: TSP, Predicted: TSP
True: MIS, Predicted: MIS
True: MIS, Predicted: MIS
True: Factor, Predicted: Factor
True: MIS, Predicted: MIS
True: MaxCut, Predicted: MaxCut
True: Factor, Predicted: Factor
True: TSP, Predicted: TSP
True: KColor, Predicted: KColor
True: Factor, Predicted: Factor
True: MUL, Predicted: MUL
True: MIS, Predicted: MIS
True: MUL, Predicted: MUL
True: KColor, Predicted: KColor
True: MUL, Predicted: MUL
True: KColor, Predicted: KColor
True: KColor, Predicted: KColor
True: Factor, Predicted: Factor
True: TSP, Predicted: 

In [130]:
# save trained model
# Save the model and tokenizer locally
# Takes some time to store it...
model_output_dir = "./saved_models"
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
print(f"Model and tokenizer saved to {model_output_dir}")

Model and tokenizer saved to ./saved_models


# another code cell, manually train

In [21]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset

# Load the dataset
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})

# Tokenizer
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")

# Tokenization function
def tokenize_function(examples):
    code_snippet = examples['code_snippet']
    return tokenizer(code_snippet, padding="max_length", truncation=True, return_tensors="pt")

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Specify the fields in the dataset that need to be included
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define DataLoader
train_loader = DataLoader(tokenized_datasets['train'], batch_size=8, shuffle=True)
eval_loader = DataLoader(tokenized_datasets['test'], batch_size=8)

# Initialize the model
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=4)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    for batch in train_loader:
        # Move input and labels to the device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation loop
model.eval()  # Set model to evaluation mode
correct = 0
total = 0
with torch.no_grad():
    for batch in eval_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predictions
        _, predicted = torch.max(outputs.logits, dim=-1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

# Calculate accuracy
accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")

Map:   0%|          | 0/107 [00:00<?, ? examples/s]

Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: Target 4 is out of bounds.

In [1]:
import networkx as nx

G = nx.Graph()
G.add_edges_from([(2, 3), (3, 4), (4, 1)])  # Add multiple edges at onc

EdgeView([(2, 3), (3, 4), (4, 1)])