In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import re
import math
from collections import defaultdict

In [2]:
# Probe the tokenizer once; a LookupError here is treated as "tokenizer data
# not installed", in which case the required NLTK resources are downloaded.
try:
    word_tokenize("test")
except LookupError:
    print("NLTK 'punkt' resource not found. Downloading...")
    nltk.download('punkt')
    nltk.download('punkt_tab')


# Load the raw bug reports. If the CSV is missing, print the hint and re-raise
# the original exception: calling exit() inside a notebook shuts the kernel
# down instead of reporting the failure, and hides the traceback.
try:
    dataset = pd.read_csv('../datasets/gcc_data.csv')
except FileNotFoundError:
    print("Error: gcc_data.csv not found. Please ensure the file is in the correct path.")
    raise

print("Original dataset shape:", dataset.shape)

Original dataset shape: (2103, 5)


In [3]:
# Reports without an assignee cannot serve as labelled examples, so they are
# dropped. Missing Summary/Description values become empty strings so that the
# concatenation below never produces "nan".
# The whole step is one chain that rebinds `dataset`, so re-running the cell
# gives the same result and no frame is mutated in place.
dataset = (
    dataset
    .dropna(subset=['Assignee'])
    .assign(
        Summary=lambda df: df['Summary'].fillna(''),
        Description=lambda df: df['Description'].fillna(''),
    )
    # Column-wise string concatenation replaces the former row-by-row .iloc loop.
    .assign(
        text_input=lambda df: (
            "Summary = " + df['Summary'].astype(str)
            + " | Description = " + df['Description'].astype(str)
        )
    )
)

print("\n--- Target Variable (Assignee) Analysis ---")
num_unique_assignees = dataset['Assignee'].nunique()
print(f"Number of unique assignees (classes): {num_unique_assignees}")

print("\nTop 10 Assignees by bug count:")
print(dataset['Assignee'].value_counts().nlargest(10))


--- Target Variable (Assignee) Analysis ---
Number of unique assignees (classes): 82

Top 10 Assignees by bug count:
Assignee
Tobias Burnus             264
Benjamin Kosnik           257
Alexandre Petit-Bianco    159
Paolo Bonzini             110
David Edelsohn             99
Alexandre Oliva            95
Alan Modra                 89
David Malcolm              70
Andrew Haley               67
Bryce McKinlay             57
Name: count, dtype: int64


In [4]:
# --- Class inventory for the prediction target (Assignee) ---
print("\n--- Assignee Analysis ---")
assignee_column = dataset['Assignee']

# `unique_assignees` is the list of candidate classes used by the later cells.
unique_assignees = assignee_column.unique()
num_classes = len(unique_assignees)
print(f"Number of unique assignees (classes): {num_classes}")

print("\nTop 10 Assignees by bug count:")
top_assignees = assignee_column.value_counts().nlargest(10)
print(top_assignees)




--- Assignee Analysis ---
Number of unique assignees (classes): 82

Top 10 Assignees by bug count:
Assignee
Tobias Burnus             264
Benjamin Kosnik           257
Alexandre Petit-Bianco    159
Paolo Bonzini             110
David Edelsohn             99
Alexandre Oliva            95
Alan Modra                 89
David Malcolm              70
Andrew Haley               67
Bryce McKinlay             57
Name: count, dtype: int64


In [5]:
# 70/30 split with a fixed seed. Stratifying on the assignee is attempted
# first; if scikit-learn rejects it with a ValueError, the same split is done
# without stratification.
split_options = dict(test_size=0.3, random_state=42, shuffle=True)
try:
    train_data_df, test_data_df = train_test_split(
        dataset, stratify=dataset['Assignee'], **split_options
    )
except ValueError:
    print("Warning: Stratification failed, possibly due to too few samples in some classes. Splitting without stratification.")
    train_data_df, test_data_df = train_test_split(dataset, **split_options)

# Plain Python lists of inputs and labels for each partition.
train_texts, train_labels = (
    train_data_df[column].tolist() for column in ('text_input', 'Assignee')
)
test_texts, test_labels = (
    test_data_df[column].tolist() for column in ('text_input', 'Assignee')
)

print(f"\nTrain data size: {len(train_texts)} bug reports")
print(f"Test data size: {len(test_texts)} bug reports")


Train data size: 1472 bug reports
Test data size: 631 bug reports


In [6]:
# --- Text normalisation helper ---
def preprocess_text(text):
    """Lower-case ``text`` and delete every punctuation character.

    The argument is coerced with ``str()`` first, so non-string values are
    accepted. Any character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed without a replacement, so
    "abi-breaking" becomes "abibreaking". Digits and whitespace are kept
    unchanged; no stemming or lemmatisation is applied.
    """
    lowered = str(text).lower()
    punctuation = re.compile(r'[^\w\s]')
    return punctuation.sub('', lowered)

# Normalise both partitions with the same function so training and test
# tokens are comparable.
print("\nPreprocessing training texts...")
preprocessed_train_texts = [preprocess_text(text) for text in train_texts]
print("Preprocessing testing texts...")
preprocessed_test_texts = [preprocess_text(text) for text in test_texts]

# Stop with an ordinary exception when there is nothing to train on. exit()
# would shut the notebook kernel down instead of reporting the problem.
if not preprocessed_train_texts:
    raise ValueError("No training data to process.")

# Preview the first training document, truncated to 500 characters.
print("\nSample of processed training text (first item):")
first_sample = preprocessed_train_texts[0]
preview = (first_sample[:500] + "...") if len(first_sample) > 500 else first_sample
print(preview)




Preprocessing training texts...
Preprocessing testing texts...

Sample of processed training text (first item):
summary  gc shouldnt have to scan data section  description  right now the gc has to scan all the data sectionswe should change this so that roots unrelated toclasses must be registered  this should greatlyimprove gc performance seehttpgccgnuorgmljava200311msg00207htmlthis is an incompatible change we should make it alongwith our other abibreaking changes


In [7]:
print("\nBuilding vocabulary...")
# The vocabulary is the set of distinct tokens found in the training
# documents only; test documents do not contribute to it.
vocabulary = {
    token
    for document in preprocessed_train_texts
    for token in word_tokenize(document)
}
V = len(vocabulary)
print(f"Vocabulary size: {V} unique words")



Building vocabulary...
Vocabulary size: 35799 unique words


In [8]:
# --- Class priors: P(assignee) = share of training reports with that label ---
print("\nCalculating prior probabilities...")
total_train_samples = len(train_labels)
class_counts = FreqDist(train_labels)
prior_probs = dict(
    (assignee, n_reports / total_train_samples)
    for assignee, n_reports in class_counts.items()
)

# print("Prior Probabilities (sample):")
# for i, (assignee, prob) in enumerate(prior_probs.items()):
# if i < 5: print(f"  P({assignee}) = {prob:.4f}")




Calculating prior probabilities...


In [9]:
# --- Per-assignee token statistics gathered from the training documents ---
print("\nCalculating word frequencies per assignee...")
# Maps assignee -> FreqDist holding how often each token occurs in that
# assignee's training reports.
word_counts_per_class = defaultdict(FreqDist)
# Maps assignee -> total number of tokens across that assignee's reports.
total_words_per_class = defaultdict(int)

for document, assignee_label in zip(preprocessed_train_texts, train_labels):
    tokens = word_tokenize(document)
    n_tokens = len(tokens)
    word_counts_per_class[assignee_label].update(tokens)
    total_words_per_class[assignee_label] += n_tokens




Calculating word frequencies per assignee...


In [10]:
# --- Smoothed likelihoods: P(word | assignee) with add-alpha (Laplace) smoothing ---
print("\nCalculating conditional word probabilities with Laplace smoothing...")
# Nested mapping: conditional_word_probs[assignee][word] -> P(word | assignee)
conditional_word_probs = defaultdict(lambda: defaultdict(float))
alpha = 1 # Laplace smoothing factor

for assignee in unique_assignees: # every assignee present in the dataset
    # Same denominator for every word of this class: the class token total
    # plus alpha for each vocabulary entry.
    smoothed_total = total_words_per_class[assignee] + alpha * V
    for word in vocabulary:
        smoothed_count = word_counts_per_class[assignee][word] + alpha
        conditional_word_probs[assignee][word] = smoothed_count / smoothed_total
    # No entry is stored for out-of-vocabulary words. Under this smoothing
    # their probability would be alpha / smoothed_total, but the prediction
    # step skips such words instead.




Calculating conditional word probabilities with Laplace smoothing...


In [11]:
# --- Implementing the Naive Bayes Classifier ---
print("\nSetting up Naive Bayes classifier predict function...")
def predict_assignee(text_to_classify):
    """Return the assignee with the highest Naive Bayes log-score for a text.

    The text is passed through ``preprocess_text`` and ``word_tokenize`` here,
    so callers may pass raw text. For every assignee in ``unique_assignees``
    the score is ``log P(assignee)`` plus the sum of ``log P(word | assignee)``
    over the tokens that belong to ``vocabulary``. Tokens outside the
    vocabulary are ignored.

    Args:
        text_to_classify: The bug-report text to classify.

    Returns:
        The best-scoring assignee, or ``None`` when ``unique_assignees`` is
        empty. Assignees with no (or zero) prior probability score ``-inf``.
        On equal scores the assignee that comes first in ``unique_assignees``
        is returned.
    """
    processed_text = preprocess_text(text_to_classify)

    # Vocabulary membership does not depend on the assignee, so filter the
    # tokens once here instead of once per assignee. Token order is kept, so
    # the sums below add their terms in the same order as before.
    known_words = [
        word for word in word_tokenize(processed_text) if word in vocabulary
    ]

    log_probs_per_assignee = {}

    for assignee in unique_assignees:
        # An assignee that never occurs in the training labels has no prior;
        # log(0) is undefined, so mark that class as impossible.
        prior = prior_probs.get(assignee, 0)
        if prior == 0:
            log_probs_per_assignee[assignee] = -float('inf')
            continue

        # Look the per-assignee table up once rather than once per word.
        word_probs = conditional_word_probs[assignee]

        # Work in log space so that long documents do not underflow.
        log_prob_assignee = math.log(prior)
        for word in known_words:
            log_prob_assignee += math.log(word_probs[word])

        log_probs_per_assignee[assignee] = log_prob_assignee

    if not log_probs_per_assignee: # no candidate classes at all
        return None

    return max(log_probs_per_assignee, key=log_probs_per_assignee.get)

# --- Classify every held-out report ---
print("\nPredicting assignees for test set...")
predicted_assignees = list(map(predict_assignee, preprocessed_test_texts))

# --- Accuracy: fraction of reports whose predicted assignee equals the label ---
print("\nCalculating accuracy...")
correct_predictions = sum(
    1 for guess, truth in zip(predicted_assignees, test_labels) if guess == truth
)

# An empty test set yields an accuracy of 0 rather than a division by zero.
if len(test_labels) > 0:
    accuracy = correct_predictions / len(test_labels)
else:
    accuracy = 0

print(f"\nAccuracy on the test set: {accuracy * 100:.2f}%")


Setting up Naive Bayes classifier predict function...

Predicting assignees for test set...

Calculating accuracy...

Accuracy on the test set: 33.44%


In [12]:
# --- Classify every held-out report ---
print("\nPredicting assignees for test set...")
# One prediction per preprocessed test document, in test-set order.
predicted_assignees = list(map(predict_assignee, preprocessed_test_texts))


Predicting assignees for test set...


In [13]:
# --- Accuracy: fraction of test reports whose prediction matches the label ---
print("\nCalculating accuracy...")
correct_predictions = sum(
    1 for guess, truth in zip(predicted_assignees, test_labels) if guess == truth
)

# An empty test set yields an accuracy of 0 rather than a division by zero.
n_test_reports = len(test_labels)
if n_test_reports > 0:
    accuracy = correct_predictions / n_test_reports
else:
    accuracy = 0

print(f"\nAccuracy on the test set: {accuracy * 100:.2f}%")


Calculating accuracy...

Accuracy on the test set: 33.44%
