In [1]:
!pip install imbalanced-learn





[notice] A new release of pip is available: 23.0.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from imblearn.over_sampling import RandomOverSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import get_linear_schedule_with_warmup
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, recall_score

In [2]:
df = pd.read_csv("bugs.csv")

In [3]:
# Preprocessing steps
# 1) Delete new line tags
df['Summary'] = df['Summary'].str.replace(r'\n', ' ')

# 2) Delete URLs
df['Summary'] = df['Summary'].str.replace(r'http\S+', '')

# 3) Delete stack traces
df['Summary'] = df['Summary'].str.replace(r'stack trace\S+', '')

# 4) Delete hex codes
df['Summary'] = df['Summary'].str.replace(r'\b[0-9a-fA-F]+\b', '')

# Remove rows with empty 'Summary' after preprocessing
df = df[df['Summary'].str.strip().astype(bool)]


In [4]:
# Delete components with fewer than 15 issues
component_counts = df['Component'].value_counts()
valid_components = component_counts[component_counts >= 15].index
df = df[df['Component'].isin(valid_components)]

In [5]:
# Balance the data using oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df['Summary'].values.reshape(-1, 1), df['Component'])

df_resampled = pd.DataFrame({'Summary': X_resampled.flatten(), 'Component': y_resampled})

In [6]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(df_resampled, test_size=0.2, random_state=42)


In [7]:
# Tokenize the summaries using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
# Tokenize the training data
train_tokens = tokenizer(list(train_data['Summary']), padding=True, truncation=True, return_tensors='pt')


In [9]:
# Tokenize the testing data
test_tokens = tokenizer(list(test_data['Summary']), padding=True, truncation=True, return_tensors='pt')

In [10]:
# Convert labels to PyTorch tensors with long type
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_data['Component'].astype('category').cat.codes.values)
test_labels = label_encoder.transform(test_data['Component'].astype('category').cat.codes.values)


In [11]:
# Create DataLoader for training and testing data
train_dataset = TensorDataset(train_tokens['input_ids'], train_tokens['attention_mask'], torch.tensor(train_labels, dtype=torch.long))
test_dataset = TensorDataset(test_tokens['input_ids'], test_tokens['attention_mask'], torch.tensor(test_labels, dtype=torch.long))

In [12]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [13]:
# Initialize BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df_resampled['Component'].unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Set weight decay and learning rate
optimizer = AdamW(model.parameters(), lr=5e-6, weight_decay=5e-6)

# Set dropout value
model.config.hidden_dropout_prob = 0.1



In [15]:
# Train the model
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_num, batch in enumerate(train_loader):
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # Print statements for debugging
        print(f"Epoch {epoch + 1}, Batch {batch_num + 1}/{len(train_loader)}")
        print("Input IDs:", input_ids)
        print("Attention Mask:", attention_mask)
        print("Labels:", labels)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}')


Epoch 1, Batch 1/125
Input IDs: tensor([[  101,  1000, 16542,  5119,  1000,  7020,  2140,  5432,  3431,  2000,
          9587,  5831,  4571,  1035,  1052,  3211,  2595,  1035,  7561,  1035,
          2025,  1035,  2664,  1035,  9398,  1035,  8196,  2043, 22042,  1000,
          2062,  2592,  1000,  2044, 15887,  5119,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 16770,  5530,  4995,  1005,  1056,  7919,  5854,  2044,  2183,
          2013,  4720,  2000,  3438,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  5587, 12117, 12326,  2015,  2000,  2543, 14876,  2595,  2097,
          3828,  1999,  2797, 16602,   102,     0,     0,     0,     0,     0,
             0,   

In [19]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluation function
def evaluate(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_preds, all_labels

# Evaluation on the test set
test_preds, test_labels = evaluate(model, test_loader)

# Convert predictions to original labels
test_preds_original = label_encoder.inverse_transform(test_preds)
test_labels_original = label_encoder.inverse_transform(test_labels)

# Calculate accuracy, F1 score, and recall
accuracy = accuracy_score(test_labels_original, test_preds_original)
classification_report_result = classification_report(test_labels_original, test_preds_original)

# Print the results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report_result)





Accuracy: 0.6184738955823293
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.71      0.80        14
           1       0.53      0.45      0.49        20
           2       0.67      0.11      0.19        18
           3       0.81      1.00      0.89        17
           4       0.67      0.50      0.57        16
           5       0.72      1.00      0.84        13
           6       0.30      0.86      0.44         7
           7       0.37      1.00      0.54        15
           8       0.56      0.71      0.63        21
           9       0.75      0.45      0.56        20
          10       0.80      0.80      0.80        15
          11       1.00      0.37      0.54        19
          12       1.00      0.11      0.19        19
          13       0.62      0.89      0.73        18
          14       0.76      0.76      0.76        17

    accuracy                           0.62       249
   macro avg       0.70     