In [1]:
pip install torch transformers scikit-learn pandas numpy tabulate

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tabulate import tabulate
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup

In [3]:
# Directory holding the pre-scaled CSV splits. Set the BUG_SEVERITY_DATA_DIR
# environment variable to point somewhere else without editing the notebook;
# the fallback is the original local folder, so existing runs are unchanged.
DATA_DIR = Path(os.environ.get(
    'BUG_SEVERITY_DATA_DIR',
    '/Users/aswath/Downloads/Fall 2024/DM/Final Project/Code',
))

# Comma is read_csv's default separator, so it is not passed explicitly.
train_df = pd.read_csv(DATA_DIR / 'train_scaled.csv')
val_df = pd.read_csv(DATA_DIR / 'valid_scaled.csv')

# Print a few rows to confirm the data is loaded
print("Training DataFrame head:")
print(train_df.head())
print("Validation DataFrame head:")
print(val_df.head())

Training DataFrame head:
     project_name                                   project_version  label  \
0         Closure                                               144      2   
1          wicket  remotes/origin/bugs-dot-jar_WICKET-3764_48454f4d      1   
2            Lang                                                22      3   
3          wicket  remotes/origin/bugs-dot-jar_WICKET-4841_ce172da8      1   
4  jackrabbit-oak     remotes/origin/bugs-dot-jar_OAK-1977_4bfbfcdd      1   

                                                code  \
0  /**\n * Builds the function type, and puts it ...   
1  /**\n *  Returns whether or not this behavior ...   
2  /**\n * <p>Gets the greatest common divisor of...   
3  /**\n *  Returns base url without context or f...   
4  @Override\npublic Iterator<String> iterator() ...   

                                        code_comment  \
0  /**\n * Builds the function type, and puts it ...   
1  /**\n *  Returns whether or not this behavior ...   
2

In [4]:
# Report the dtype and the distinct values of the 'label' column, first for the
# training split and then for the validation split.
for split_name, split_df in (("train_df", train_df), ("val_df", val_df)):
    label_column = split_df['label']
    print(f"Data type of labels in {split_name}:", label_column.dtype)
    print(f"Unique values in {split_name}['label']:", label_column.unique())


Data type of labels in train_df: int64
Unique values in train_df['label']: [2 1 3 0]
Data type of labels in val_df: int64
Unique values in val_df['label']: [3 2 1 0]


In [5]:
# Cast the 'label' column of both splits to int, writing the result back into
# the same frame.
for split_df in (train_df, val_df):
    split_df['label'] = split_df['label'].astype(int)


In [6]:
# Keep only the rows whose label lies in the closed interval [0, 3];
# Series.between is inclusive on both ends by default.
train_label_ok = train_df['label'].between(0, 3)
val_label_ok = val_df['label'].between(0, 3)
train_df = train_df.loc[train_label_ok]
val_df = val_df.loc[val_label_ok]


In [7]:
class BugSeverityDataset(Dataset):
    """Map-style dataset pairing a tokenized 'code' string with its 'label'.

    Args:
        dataframe: Frame providing a 'code' column and a 'label' column.
        tokenizer: Callable tokenizer; it is invoked with the keyword
            arguments shown in ``__getitem__`` and must return a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_len: Length every sequence is padded or truncated to.

    Note:
        The two columns are copied into plain lists at construction time, so
        later edits to ``dataframe`` are not reflected by the dataset.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Extract the columns once: indexing a row with iloc[index][...] builds
        # a whole row Series on every access, which is slow inside a DataLoader.
        # str() mirrors the original per-item conversion (e.g. NaN -> 'nan').
        self._codes = [str(code) for code in dataframe['code']]
        self._labels = dataframe['label'].tolist()

    def __len__(self):
        """Return the number of samples."""
        return len(self._labels)

    def __getitem__(self, index):
        """Return the tokenized sample at positional ``index``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors (the
            tokenizer output flattened) and a scalar long 'labels' tensor.
        """
        # Calling the tokenizer directly replaces the deprecated encode_plus;
        # for a single string both take the same arguments.
        encoding = self.tokenizer(
            self._codes[index],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self._labels[index], dtype=torch.long),
        }

# Load the pretrained tokenizer stored under the 'roberta-base' checkpoint name.
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path='roberta-base')

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [8]:
# Samples per batch for both loaders, and the fixed token length handed to the
# datasets.
BATCH_SIZE, MAX_LEN = 16, 512

# Wrap each split in a BugSeverityDataset that shares the tokenizer and length.
train_dataset, val_dataset = (
    BugSeverityDataset(split_df, tokenizer, MAX_LEN)
    for split_df in (train_df, val_df)
)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [9]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# RoBERTa-base with a 4-label sequence-classification head, moved to `device`.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=4).to(device)


Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Number of passes over the training data. It sizes the learning-rate schedule
# below, so it must equal the epoch count the training loop actually runs;
# otherwise the linear decay ends too early or too late.
EPOCHS = 4

# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# in favour of the PyTorch implementation. eps and weight_decay are set to the
# transformers defaults (1e-6, 0.0) so the optimisation settings are unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step is taken per batch, hence batches-per-epoch * epochs.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# NOTE(review): the train/eval helpers in this notebook take the loss from the
# model output (outputs.loss), so this criterion looks unused. It is kept in
# case another cell refers to it.
criterion = torch.nn.CrossEntropyLoss()




In [11]:
def train_epoch(model, data_loader, optimizer, scheduler, device):
    """Train ``model`` for one pass over ``data_loader``.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'
    tensors. The loss is read from the model output's ``loss`` attribute,
    gradients are clipped to a max norm of 1.0, and both the optimizer and the
    scheduler are stepped once per batch.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0

    for batch in data_loader:
        # Move exactly the three tensors the model consumes onto the device.
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        batch_loss = model(**model_inputs).loss
        running_loss += batch_loss.item()

        batch_loss.backward()
        # Clip before the optimizer step so the update uses the clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = running_loss / len(data_loader)
    return mean_loss

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` with gradients disabled.

    Each batch must provide 'input_ids', 'attention_mask' and 'labels'
    tensors. Predictions are the argmax of the logits along dimension 1.

    Returns:
        A tuple ``(mean_loss, predictions, references)``: the summed batch
        loss divided by ``len(data_loader)``, then the predicted and the true
        labels as flat lists in loader order.
    """
    model.eval()
    running_loss = 0
    predictions, references = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch_labels = batch['labels'].to(device)
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch_labels,
            )
            running_loss += outputs.loss.item()

            # Bring results back to the CPU before converting to NumPy.
            predictions.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            references.extend(batch_labels.cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    return mean_loss, predictions, references


In [12]:
# Print the label dtype and the distinct label values of each split again,
# training split first.
named_splits = {"train_df": train_df, "val_df": val_df}
for frame_name, frame in named_splits.items():
    print(f"Data type of labels in {frame_name}:", frame['label'].dtype)
    print(f"Unique values in {frame_name}['label']:", frame['label'].unique())

Data type of labels in train_df: int64
Unique values in train_df['label']: [2 1 3 0]
Data type of labels in val_df: int64
Unique values in val_df['label']: [3 2 1 0]


In [13]:
EPOCHS = 4

# Per epoch: one training pass, one validation pass, then print both mean
# losses and the per-class validation report.
for epoch in range(EPOCHS):
    epoch_label = epoch + 1  # 1-based counter for display only
    print(f'Epoch {epoch_label}/{EPOCHS}')

    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_preds, val_labels = eval_model(model, val_loader, device)

    print(f'Training loss: {train_loss:.3f}')
    print(f'Validation loss: {val_loss:.3f}')
    report = classification_report(val_labels, val_preds)
    print(report)


Epoch 1/4


KeyboardInterrupt: 