In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import re
import numpy as np



In [2]:
# Read the bug-report CSV from its relative path into a DataFrame.
dataset = pd.read_csv("../datasets/gcc_data.csv")

# Sanity check: report the frame's dimensions, then preview the leading rows.
print("Shape of the dataset:", dataset.shape)
print("\nFirst 5 rows of the dataset:", dataset.head(), sep="\n")

Shape of the dataset: (2103, 5)

First 5 rows of the dataset:
   Bug_ID              Assignee  \
0   20654  Aaron W. LaFramboise   
1   18246  Aaron W. LaFramboise   
2   18107  Aaron W. LaFramboise   
3   19074  Aaron W. LaFramboise   
4   18103  Aaron W. LaFramboise   

                                             Summary  \
0  exception.o is not included in libgcj.a due to...   
1  struct-layout-1 generator broken on i686-pc-mi...   
2  [4.0 Regression] [meta-bug]  Bootstrap fails o...   
3             libgfortran bootstrap fails on Windows   
4  libgfortran system header conflict breaks boot...   

                                         Description  \
0  binutils ar was recently changed to exclude pa...   
1  The gcc.dg-struct-layout-1 executable crashes ...   
2  This is a meta-bug for all of the things causi...   
3  This patch <http://gcc.gnu.org/ml/gcc-patches/...   
4  mingw-runtime has a system header called <io.h...   

                          Status  
0  RESOLVED       

In [3]:
# Rows without an assignee carry no label, so they are dropped; missing text
# fields become empty strings so the concatenation below never embeds "nan".
# A chained dropna/assign builds a fresh frame instead of mutating in place,
# which keeps this cell safe to re-run.
dataset = (
    dataset
    .dropna(subset=['Assignee'])
    .assign(
        Summary=lambda frame: frame['Summary'].fillna(''),
        Description=lambda frame: frame['Description'].fillna(''),
    )
)

# Combine both text fields into a single model input, column-wise rather than
# row-by-row. astype(str) mirrors the str() conversion an f-string would apply
# to any non-string cell.
dataset['text_input'] = (
    "Summary = " + dataset['Summary'].astype(str)
    + " | Description = " + dataset['Description'].astype(str)
)

print("\n--- Target Variable (Assignee) Analysis ---")
num_unique_assignees = dataset['Assignee'].nunique()
print(f"Number of unique assignees (classes): {num_unique_assignees}")

print("\nTop 10 Assignees by bug count:")
print(dataset['Assignee'].value_counts().nlargest(10))


--- Target Variable (Assignee) Analysis ---
Number of unique assignees (classes): 82

Top 10 Assignees by bug count:
Assignee
Tobias Burnus             264
Benjamin Kosnik           257
Alexandre Petit-Bianco    159
Paolo Bonzini             110
David Edelsohn             99
Alexandre Oliva            95
Alan Modra                 89
David Malcolm              70
Andrew Haley               67
Bryce McKinlay             57
Name: count, dtype: int64


In [4]:
# Features are the combined text column; the target is the assignee name.
X = dataset['text_input']
y = dataset['Assignee']

# Encode assignee names as integer class ids (reversible via label_encoder).
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Settings shared by both split attempts: 30% held out, fixed seed, shuffled.
split_options = dict(test_size=0.3, random_state=42, shuffle=True)

try:
    # First attempt: stratify on the encoded labels.
    split_parts = train_test_split(X, y_encoded, stratify=y_encoded, **split_options)
except ValueError as e:
    # Any ValueError from the stratified attempt is reported and the split is
    # retried without stratification.
    print(f"\nWarning: Could not stratify due to small class sizes. Retrying without stratification. Error: {e}")
    split_parts = train_test_split(X, y_encoded, **split_options)

train_data, test_data, train_labels, test_labels = split_parts

print("\nTrain data size (number of bug reports):", train_data.shape[0])
print("Test data size (number of bug reports):", test_data.shape[0])



Train data size (number of bug reports): 1472
Test data size (number of bug reports): 631


In [5]:
# Matches any single character that is neither a word character nor whitespace.
_NON_WORD_NON_SPACE = re.compile(r'[^\w\s]')


def preprocess_text(text):
    """Normalize a piece of text for vectorization.

    The input is coerced to ``str`` (so non-string values such as ``None`` or
    numbers are accepted), lower-cased, and stripped of every character that
    is not a word character or whitespace. Removed characters are deleted,
    not replaced, so tokens joined only by punctuation are fused together.

    Args:
        text: Value to normalize; converted with ``str()``.

    Returns:
        The lower-cased string with punctuation removed.
    """
    lowered = str(text).lower()
    return _NON_WORD_NON_SPACE.sub('', lowered)

print("\nPreprocessing training data...")
train_data_processed = [preprocess_text(text) for text in train_data]

print("Preprocessing testing data...")
test_data_processed = [preprocess_text(text) for text in test_data]

# Stop with an ordinary exception if there is nothing to train on. exit() is
# an interactive-shell helper and, inside a notebook, asks the kernel to shut
# down (discarding all state); raising halts "Run All" while keeping the
# kernel and its variables available for inspection.
if not train_data_processed:
    raise ValueError("No data to process after splits.")

print("\nSample of processed training data (first item):")
first_sample = train_data_processed[0]
# Cap the preview at 500 characters, marking truncation with an ellipsis.
preview = (first_sample[:500] + "...") if len(first_sample) > 500 else first_sample
print(preview)


Preprocessing training data...
Preprocessing testing data...

Sample of processed training data (first item):
summary  gc shouldnt have to scan data section  description  right now the gc has to scan all the data sectionswe should change this so that roots unrelated toclasses must be registered  this should greatlyimprove gc performance seehttpgccgnuorgmljava200311msg00207htmlthis is an incompatible change we should make it alongwith our other abibreaking changes


In [6]:
print("\nVectorizing text data using TF-IDF...")

# Vectorizer settings: vocabulary capped at 5000 features, English stop words
# removed, and document-frequency bounds of min_df=2 and max_df=0.95.
tfidf_settings = {
    "max_features": 5000,
    "stop_words": 'english',
    "min_df": 2,
    "max_df": 0.95,
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
train_data_tfidf = tfidf_vectorizer.fit_transform(train_data_processed)
test_data_tfidf = tfidf_vectorizer.transform(test_data_processed)

print("Shape of TF-IDF matrix for training data:", train_data_tfidf.shape)
print("Shape of TF-IDF matrix for testing data:", test_data_tfidf.shape)




Vectorizing text data using TF-IDF...
Shape of TF-IDF matrix for training data: (1472, 5000)
Shape of TF-IDF matrix for testing data: (631, 5000)


In [7]:
print("\nTraining the Multinomial Naive Bayes classifier...")

# fit() returns the estimator itself, so construction and training can be
# chained; alpha=1.0 is the smoothing parameter passed to the classifier.
naive_bayes_classifier = MultinomialNB(alpha=1.0).fit(train_data_tfidf, train_labels)

# Leave the fitted estimator as the cell's final expression so it is displayed.
naive_bayes_classifier


Training the Multinomial Naive Bayes classifier...


In [8]:
print("\nPredicting on the test set...")
y_pred_encoded = naive_bayes_classifier.predict(test_data_tfidf)

# Decode integer class ids back to assignee names for predictions and truth.
y_pred_original_labels = label_encoder.inverse_transform(y_pred_encoded)
test_original_labels = label_encoder.inverse_transform(test_labels)

# Limit the report to class ids that appear in either the true or predicted
# labels, and look up their names so `labels` and `target_names` line up.
unique_labels_in_evaluation = np.union1d(test_labels, y_pred_encoded)
target_names_for_report = label_encoder.inverse_transform(unique_labels_in_evaluation)

accuracy = accuracy_score(test_labels, y_pred_encoded)

print("\n--- Evaluation Results ---")
print(f"Overall Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
# zero_division=0 is passed so undefined precision/recall values are reported
# as 0.
report_options = dict(
    labels=unique_labels_in_evaluation,
    target_names=target_names_for_report,
    zero_division=0,
)
report = classification_report(test_labels, y_pred_encoded, **report_options)
print(report)


Predicting on the test set...

--- Evaluation Results ---
Overall Accuracy: 32.33%

Classification Report:
                        precision    recall  f1-score   support

  Aaron W. LaFramboise       0.00      0.00      0.00         1
          Adam Butcher       0.00      0.00      0.00         2
            Alan Modra       1.00      0.04      0.07        28
        Aldy Hernandez       0.00      0.00      0.00        14
     Alexander Monakov       0.00      0.00      0.00        11
       Alexandre Oliva       0.40      0.11      0.18        35
Alexandre Petit-Bianco       0.81      0.89      0.85        56
        Andreas Tobler       0.00      0.00      0.00        13
          Andrew Haley       0.00      0.00      0.00        26
        Andrew Macleod       0.00      0.00      0.00         3
    Andrey Belevantsev       0.00      0.00      0.00         8
        Arnaud Charlet       0.00      0.00      0.00         9
          Ben Elliston       0.00      0.00      0.00      