In [2]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset

In [19]:
# Data preparation: read the CSV splits, peek at one record, parse two sample
# snippets with the standard-library AST parser, then load the tokenizer.
import ast

csv_splits = {'train': 'train.csv', 'test': 'test.csv'}
dataset = load_dataset('csv', data_files=csv_splits)

train_split = dataset['train']
print(train_split[12])  # show one raw training record

# Parse the snippets at indices 1 and 35; `module` is rebound on every pass,
# so after the loop it holds the tree of the last snippet (index 35).
for sample_index in (1, 35):
    module = ast.parse(train_split[sample_index].get('code_snippet'))

# Tokenizer loaded from the "microsoft/codebert-base" checkpoint
checkpoint_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``code_snippet`` entry of ``examples``.

    The snippets are handed to the module-level ``tokenizer`` with the
    ``"max_length"`` padding strategy and truncation switched on.

    Args:
        examples: mapping that contains a ``'code_snippet'`` key.

    Returns:
        Whatever the tokenizer call returns for those snippets.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples['code_snippet'], **tokenizer_options)

# Tokenize dataset: run tokenize_function over `dataset` through its `map`
# method. With batched=True the function is called with batches of examples
# rather than one example at a time (per the `datasets` map API), and the
# mapped result is kept in `tokenized_datasets`.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

{'code_snippet': 'import networkx as nx\\nimport random\\n\\ndef randomized_maximal_independent_set(G):\\n    independent_set = set()\\n    nodes = list(G.nodes())\\n    random.shuffle(nodes)\\n    while nodes:\\n        node = nodes.pop(0)\\n        independent_set.add(node)\\n        for neighbor in G.neighbors(node):\\n            if neighbor in nodes:\\n                nodes.remove(neighbor)\\n    return independent_set\\n\\nG = nx.Graph()\\nG.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 0), (0, 2)])\\nindependent_set = randomized_maximal_independent_set(G)\\nprint(f"Maximal Independent Set: {independent_set}")', 'labels': 1}


In [20]:
# Token span to extract. Both positions are inclusive indices into the
# tokenized sequence (special tokens added by the tokenizer count as positions).
start_position = 12
end_position = 25

# Sample code snippet
code_snippet = """
import networkx as nx

def kcoloring_networkx_greedy(G, k):
    colors = {}
    for node in G.nodes():
        available_colors = set(range(k)) - {colors[neighbor] for neighbor in G.neighbors(node) if neighbor in colors}
        if available_colors:
            colors[node] = min(available_colors)
    return colors

G = nx.Graph()
G.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 0)])
kcoloring_networkx_greedy(G, 3)
"""


def extract_token_span(tokenizer, code, start, end):
    """Return the text covered by tokens ``start``..``end`` (inclusive) of ``code``.

    Args:
        tokenizer: object used to encode ``code``. It must be callable and
            provide ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.
        code: source text to tokenize.
        start: index of the first token to keep.
        end: index of the last token to keep (inclusive).

    Returns:
        The string rebuilt from the selected tokens.

    Raises:
        ValueError: if ``start`` is negative, ``end`` is smaller than
            ``start``, or ``end`` lies past the last token of ``code``.
    """
    if start < 0 or end < start:
        raise ValueError(
            f"invalid token span: start={start}, end={end}"
        )

    # No padding here: a single snippet does not need it, and padding would let
    # an out-of-range span silently return padding tokens instead of failing.
    # Plain Python id lists are used because that is what
    # convert_ids_to_tokens is documented to accept.
    token_ids = tokenizer(code, truncation=True)['input_ids']
    if end >= len(token_ids):
        raise ValueError(
            f"token span end={end} is out of range for {len(token_ids)} tokens"
        )

    # end is inclusive, hence the +1 on the slice bound.
    span_tokens = tokenizer.convert_ids_to_tokens(token_ids[start:end + 1])
    return tokenizer.convert_tokens_to_string(span_tokens)


extracted_code = extract_token_span(tokenizer, code_snippet, start_position, end_position)

# Output the extracted code
print(f"Extracted portion of the code: {extracted_code}")

Extracted portion of the code: coloring_networkx_greedy(G, k):



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'list' object has no attribute 'to'

In [18]:
import numpy as np
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from datasets import load_dataset

# Read the train/test CSV files into a dataset object
csv_files = {'train': 'train.csv', 'test': 'test.csv'}
dataset = load_dataset('csv', data_files=csv_files)

# Tokenizer loaded from the "microsoft/codebert-base" checkpoint
tokenizer_checkpoint = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_checkpoint)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``code_snippet`` entry of ``examples`` as PyTorch tensors.

    The snippets are handed to the module-level ``tokenizer`` with the
    ``"max_length"`` padding strategy, truncation switched on and
    ``return_tensors="pt"``.

    Args:
        examples: mapping that contains a ``'code_snippet'`` key.

    Returns:
        Whatever the tokenizer call returns for those snippets.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(examples['code_snippet'], **tokenizer_options)

# Run settings, gathered in one place so they are easy to find and tune.
SEED = 42
BATCH_SIZE = 8
NUM_LABELS = 4
LEARNING_RATE = 2e-5

# Seed PyTorch's random number generator before anything random happens, so
# the shuffled batch order and the newly initialised classifier weights are
# repeatable from run to run (as far as PyTorch's RNG is concerned).
torch.manual_seed(SEED)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose only the columns the training/evaluation loops read, as torch tensors
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Batch iterators: the training split is shuffled, the test split is not
train_loader = DataLoader(tokenized_datasets['train'], batch_size=BATCH_SIZE, shuffle=True)
eval_loader = DataLoader(tokenized_datasets['test'], batch_size=BATCH_SIZE)

# Sequence-classification model built on the CodeBERT checkpoint
model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base", num_labels=NUM_LABELS)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss function and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Fine-tuning: a fixed number of passes over the training batches
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # training mode for this epoch
    total_loss = 0
    for batch in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Move the tensors this step needs onto the model's device
        on_device = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass, then loss between the logits and the labels
        outputs = model(
            input_ids=on_device['input_ids'],
            attention_mask=on_device['attention_mask'],
        )
        loss = criterion(outputs.logits, on_device['labels'])

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean loss per batch for this epoch
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss}")

# Evaluation: count how many test examples get the right predicted class
model.eval()  # evaluation mode
correct = 0
total = 0
with torch.no_grad():  # no gradients are needed for scoring
    for batch in eval_loader:
        labels = batch['labels'].to(device)

        # Forward pass
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # The predicted class is the index of the largest logit
        predicted = logits.argmax(dim=-1)
        correct += predicted.eq(labels).sum().item()
        total += labels.shape[0]

# Fraction of correct predictions, printed as a percentage
accuracy = correct / total
print(f"Accuracy: {accuracy * 100:.2f}%")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10, Loss: 1.3306392431259155
Epoch 2/10, Loss: 1.2337381442387898
Epoch 3/10, Loss: 1.0924411863088608
Epoch 4/10, Loss: 0.6948065931598345
Epoch 5/10, Loss: 0.439535287519296
Epoch 6/10, Loss: 0.259065297121803
Epoch 7/10, Loss: 0.16744354863961539
Epoch 8/10, Loss: 0.15336805302649736
Epoch 9/10, Loss: 0.11567062015334766
Epoch 10/10, Loss: 0.11325222564240296
Accuracy: 95.56%
