<a href="https://colab.research.google.com/github/Ak4nksha/duplicate-bug-detector/blob/main/notebooks/01_profile_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pandas numpy scikit-learn rank-bm25 sentence-transformers

import os, re, zipfile, glob, json, math, random
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##Load Datasets

In [3]:
DATA_ROOT = '/content/drive/MyDrive/DuplicateBugsDetector'

projects = {
    'firefox': f'{DATA_ROOT}/firefox/Firefox_bugs.csv',
    'hadoop': f'{DATA_ROOT}/hadoop/hadoop_bugs.csv',
    'hbase': f'{DATA_ROOT}/hbase/hbase_bugs.csv'
}

#loading and tagging each project
dfs = []
for name, path in projects.items():
    df = pd.read_csv(path, low_memory=False)
    df['project'] = name
    dfs.append(df)

# merging them into one dataframe
bugs = pd.concat(dfs, ignore_index=True)
print("Combined shape:", bugs.shape)
bugs.head()

Combined shape: (32730, 10)


Unnamed: 0,Summary,Issue id,Status,Priority,Resolution,Created,Resolved,Description,project,Affects Version/s
0,Address bar doesn't elide origins correctly,1606532,RESOLVED,--,DUPLICATE,2020-01-01 05:10:54+00:00,2023-06-06 00:44:25+00:00,User Agent: Mozilla/5.0 (Windows NT 10.0; Win6...,firefox,
1,"""TypeError: info.PDFFormatVersion is undefined...",1606566,VERIFIED,--,FIXED,2020-01-01 18:26:12+00:00,2020-02-06 10:01:56+00:00,When the PDF version cannot be extracted from ...,firefox,
2,no suggestions while typing in urlbar,1606572,RESOLVED,--,WORKSFORME,2020-01-01 22:21:53+00:00,2020-01-14 13:08:25+00:00,User Agent: Mozilla/5.0 (X11; Linux x86_64; rv...,firefox,
3,File association Remote Code Execution via com...,1606596,RESOLVED,--,FIXED,2020-01-02 07:54:43+00:00,2024-05-30 17:06:02+00:00,Tested on Microsoft Windows 10 Enterprise vers...,firefox,
4,login button on faq page is not working,1606602,RESOLVED,--,INCOMPLETE,2020-01-02 10:11:03+00:00,2023-07-30 20:18:55+00:00,User Agent: Mozilla/5.0 (Windows NT 6.1; Win64...,firefox,


In [4]:
#Cleaning and Profiling

#lowercasing and replacing spaces with underscores
bugs = bugs.rename(columns=lambda c: re.sub(r'[^\w]+', '_', c.strip().lower()))

#Clumn filtering
keep_cols = ['project','issue_id','summary','description','status','priority','resolution','created','resolved']
present = [c for c in keep_cols if c in bugs.columns]
bugs = bugs[present].copy()

bugs['created'] = pd.to_datetime(bugs['created'], errors='coerce')
bugs['resolved'] = pd.to_datetime(bugs['resolved'], errors='coerce')


#a combined text field for later models
bugs['summary'] = bugs.get('summary','').fillna('').astype(str)
bugs['description'] = bugs.get('description','').fillna('').astype(str)
bugs['text'] = (bugs['summary'].str.strip() + "\n" + bugs['description'].str.strip()).str.strip()
bugs['len_tokens'] = bugs['text'].str.split().str.len()

n_projects = bugs['project'].nunique()
n_issues   = bugs['issue_id'].nunique()

print(f"Number of projects: {n_projects} and number of issues: {n_issues}")

Number of projects: 3 and number of issues: 32730


In [5]:
# per-project counts
by_proj = bugs.groupby('project').agg(
    bugs=('issue_id','nunique'),
    avg_len_tokens=('len_tokens','mean')
).reset_index().sort_values('bugs', ascending=False)

print("\nPer-project counts and avg length (tokens):")
display(by_proj)

for col in ['resolution','status','priority']:
    print(f"\n {col.upper()} distribution")
    display(bugs[col].fillna('Unknown').value_counts().head(20))

print("\nMissing values per column:")
display(bugs.isna().sum().to_frame('n_missing'))



Per-project counts and avg length (tokens):


Unnamed: 0,project,bugs,avg_len_tokens
0,firefox,24824,232.299871
2,hbase,5403,88.170646
1,hadoop,2503,91.359169



 RESOLUTION distribution


Unnamed: 0_level_0,count
resolution,Unnamed: 1_level_1
FIXED,7285
Unknown,6227
Fixed,4986
DUPLICATE,4495
INCOMPLETE,4214
INVALID,1837
WORKSFORME,1816
WONTFIX,855
Duplicate,291
Won't Fix,175



 STATUS distribution


Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
RESOLVED,16680
Resolved,5806
VERIFIED,3935
NEW,3404
Open,1782
UNCONFIRMED,415
REOPENED,288
In Progress,114
ASSIGNED,102
Patch Available,88



 PRIORITY distribution


Unnamed: 0_level_0,count
priority,Unnamed: 1_level_1
--,12174
Major,5670
P5,4654
P3,3791
P1,2325
P2,1823
Minor,1623
Critical,269
Trivial,180
Blocker,164



Missing values per column:


Unnamed: 0,n_missing
project,0
issue_id,0
summary,0
description,0
status,0
priority,0
resolution,6227
created,7906
resolved,7906
text,0


In [6]:

# sample rows
print("\nSample rows:")
display(bugs.sample(5, random_state=0)[['project','issue_id','summary','resolution']])




Sample rows:


Unnamed: 0,project,issue_id,summary,resolution
30233,hbase,13341394,Support AES-192 and AES-256 in DefaultCipherPr...,
23421,firefox,1931979,[Section Expt] Fix inconsistent medium card st...,FIXED
25900,hadoop,13554547,S3A: IAMInstanceCredentialsProvider failing: F...,Fixed
22122,firefox,1912386,New Tab preview stays visible and hides the un...,
888,firefox,1621177,Two errors are recorded in the EventLog if -lo...,FIXED


In [7]:
def clean_text(s):
    if pd.isna(s): return np.nan
    s = str(s).strip().lower()
    s = s.replace("’", "'")
    s = re.sub(r"[^a-z0-9]+", "", s)   # drop spaces, apostrophes, punctuation
    return s if s else np.nan


for col in ["resolution", "status", "priority"]:
    bugs[col] = bugs[col].map(clean_text)

print("RESOLUTION VALUES")
res_counts = bugs["resolution"].value_counts(dropna=False).to_frame("count")
display(res_counts)

print("\nSTATUS VALUES")
status_counts = bugs["status"].value_counts(dropna=False).to_frame("count")
display(status_counts)

print("\nPRIORITY VALUES")
priority_counts = bugs["priority"].value_counts(dropna=False).to_frame("count")
display(priority_counts)

RESOLUTION VALUES


Unnamed: 0_level_0,count
resolution,Unnamed: 1_level_1
fixed,12271
,6227
duplicate,4786
incomplete,4231
invalid,1918
worksforme,1823
wontfix,1030
done,92
notaproblem,86
moved,83



STATUS VALUES


Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
resolved,22486
verified,3935
new,3404
open,1782
unconfirmed,415
reopened,322
inprogress,114
assigned,102
patchavailable,88
closed,82



PRIORITY VALUES


Unnamed: 0_level_0,count
priority,Unnamed: 1_level_1
,12174
major,5670
p5,4654
p3,3791
p1,2325
p2,1823
minor,1623
critical,269
trivial,180
blocker,164


In [8]:
#Since there are too many different labels for resolution, I am keeping only a small set of these values ( with higher number of samples) and grouping remaining as "others"

RES_KEEP = {
    "fixed": "fixed",
    "duplicate": "duplicate",
    "incomplete": "incomplete",
    "wontfix": "wontfix",
    "worksforme": "worksforme",
    "invalid": "invalid",
}

def res_labels(k):
  if pd.isna(k): return "unknown"
  return RES_KEEP.get(k, "other")

bugs["resolution"] = bugs["resolution"].map(res_labels)
res_counts = bugs["resolution"].value_counts(dropna=False).to_frame("count")
display(res_counts)



Unnamed: 0_level_0,count
resolution,Unnamed: 1_level_1
fixed,12271
unknown,6227
duplicate,4786
incomplete,4231
invalid,1918
worksforme,1823
wontfix,1030
other,444


In [9]:
#Priority column cleanup: map NaN to 'unspecified'

def priority_labels(k):
  if pd.isna(k): return "unspecified"
  return k

bugs["priority"] = bugs["priority"].map(priority_labels)
priority_counts = bugs["priority"].value_counts(dropna=False).to_frame("count")
display(priority_counts)

Unnamed: 0_level_0,count
priority,Unnamed: 1_level_1
unspecified,12174
major,5670
p5,4654
p3,3791
p1,2325
p2,1823
minor,1623
critical,269
trivial,180
blocker,164


In [10]:
display(bugs.sample(5, random_state=7)[["project","issue_id","resolution","priority"]])

Unnamed: 0,project,issue_id,resolution,priority
32388,hbase,13375468,fixed,major
11252,firefox,1748519,duplicate,unspecified
23391,firefox,1931516,fixed,p2
905,firefox,1621536,incomplete,p5
29786,hbase,13399503,unknown,major


##Load Duplicate bugs files

In [11]:
DATA_ROOT = '/content/drive/MyDrive/DuplicateBugsDetector'

projects = {
    'firefox': f'{DATA_ROOT}/firefox/Firefox_bugs-combined.csv',
    'hadoop': f'{DATA_ROOT}/hadoop/hadoop_bugs-combined.csv',
    'hbase': f'{DATA_ROOT}/hbase/hbase_bugs-combined.csv'
}

#loading and tagging each project
dfs_dups = []
for name, path in projects.items():
    df = pd.read_csv(path, low_memory=False)
    df['project'] = name

    df.columns = [re.sub(r'[^\w]+', '_', str(c)).strip('_').lower() for c in df.columns]

    #since we have different columns in files representing same thing - duplicate Ids
    other_col = "duplicates" if "duplicates" in df.columns else "duplicate_id"
    df = df.rename(columns={other_col: 'duplicate_id'})
    df = df[['project', 'issue_id', 'duplicate_id']].copy()
    df = df.dropna(subset=['duplicate_id'])
    dfs_dups.append(df)

# merging them into one dataframe
duplicates = pd.concat(dfs_dups, ignore_index=True)
print("Combined shape:", duplicates.shape)
duplicates.head()


Combined shape: (6487, 3)


Unnamed: 0,project,issue_id,duplicate_id
0,firefox,1606532,1598175.0
1,firefox,1606814,1605940.0
2,firefox,1607086,1607780.0
3,firefox,1607160,1608072.0
4,firefox,1607185,1610812.0


In [12]:
def handle_id(x):
    if pd.isna(x): return np.nan
    #convert to strings and handle float Ids
    try:
        return str(int(float(x)))
    except Exception:
        return str(x).strip()

dup_pairs = duplicates.copy()
dup_pairs["issue_a"] = dup_pairs["issue_id"].map(handle_id)
dup_pairs["issue_b"] = dup_pairs["duplicate_id"].map(handle_id)
dup_pairs = dup_pairs.dropna(subset=["issue_a","issue_b"])[["project","issue_a","issue_b"]]

bugs["issue_id"] = bugs["issue_id"].map(handle_id)


##Creating groups of duplicates

In [13]:
#union–find to create connected duplicate groups

parent = {}
def find(x):
    parent.setdefault(x, x)
    if parent[x] != x:
        parent[x] = find(parent[x])
    return parent[x]

def union(a, b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[rb] = ra

for _, r in dup_pairs.iterrows():
    union(r["issue_a"], r["issue_b"])


group_id = {node: find(node) for node in parent.keys()}

#adding the duplicate ids to bugs df
bugs["dup_group"] = bugs["issue_id"].map(group_id)
bugs["is_duplicate"] = bugs["dup_group"].notna()

#stats

overall = {
    "projects": bugs["project"].nunique(),
    "issues": bugs["issue_id"].nunique(),
    "issues_in_duplicate_groups": int(bugs["is_duplicate"].sum()),
    "duplicate_rate": round(float(bugs["is_duplicate"].mean()), 4),
}
print("Overall duplicate stats:", overall)


by_proj = bugs.groupby("project").agg(
    issues=("issue_id","nunique"),
    dup_issues=("is_duplicate","sum")
).reset_index()
by_proj["dup_rate"] = (by_proj["dup_issues"] / by_proj["issues"]).round(4)
display(by_proj.sort_values("issues", ascending=False))



Overall duplicate stats: {'projects': 3, 'issues': 32730, 'issues_in_duplicate_groups': 6493, 'duplicate_rate': 0.1984}


Unnamed: 0,project,issues,dup_issues,dup_rate
0,firefox,24824,6254,0.2519
2,hbase,5403,111,0.0205
1,hadoop,2503,128,0.0511


In [14]:
bugs.head()

Unnamed: 0,project,issue_id,summary,description,status,priority,resolution,created,resolved,text,len_tokens,dup_group,is_duplicate
0,firefox,1606532,Address bar doesn't elide origins correctly,User Agent: Mozilla/5.0 (Windows NT 10.0; Win6...,resolved,unspecified,duplicate,2020-01-01 05:10:54+00:00,2023-06-06 00:44:25+00:00,Address bar doesn't elide origins correctly\nU...,88,1942560.0,True
1,firefox,1606566,"""TypeError: info.PDFFormatVersion is undefined...",When the PDF version cannot be extracted from ...,verified,unspecified,fixed,2020-01-01 18:26:12+00:00,2020-02-06 10:01:56+00:00,"""TypeError: info.PDFFormatVersion is undefined...",82,,False
2,firefox,1606572,no suggestions while typing in urlbar,User Agent: Mozilla/5.0 (X11; Linux x86_64; rv...,resolved,unspecified,worksforme,2020-01-01 22:21:53+00:00,2020-01-14 13:08:25+00:00,no suggestions while typing in urlbar\nUser Ag...,46,,False
3,firefox,1606596,File association Remote Code Execution via com...,Tested on Microsoft Windows 10 Enterprise vers...,resolved,unspecified,fixed,2020-01-02 07:54:43+00:00,2024-05-30 17:06:02+00:00,File association Remote Code Execution via com...,413,,False
4,firefox,1606602,login button on faq page is not working,User Agent: Mozilla/5.0 (Windows NT 6.1; Win64...,resolved,unspecified,incomplete,2020-01-02 10:11:03+00:00,2023-07-30 20:18:55+00:00,login button on faq page is not working\nUser ...,58,,False


### Train/Test split per project

In [16]:
def time_split_per_project(df, test_ratio=0.2, random_state=42):
    #Return train_df, test_df, 80-20 split
    rng = np.random.RandomState(random_state)
    parts_train, parts_test = [], []
    for proj, g in df.groupby("project"):
        g = g.copy()
        if "created" in g.columns and not g["created"].isna().all():  # if we have creation dates for bugs
            g = g.sort_values("created") #sort by date/time of creation
            cut = int(len(g) * (1 - test_ratio)) #split size 80%
            train, test = g.iloc[:cut], g.iloc[cut:]
        else:
            idx = np.arange(len(g))    #random split - fallback option
            rng.shuffle(idx)
            cut = int(len(g) * (1 - test_ratio))
            train, test = g.iloc[idx[:cut]], g.iloc[idx[cut:]]
        parts_train.append(train)
        parts_test.append(test)


    return pd.concat(parts_train), pd.concat(parts_test)

train_df, test_df  = time_split_per_project(bugs)
print("Train/Test sizes:", train_df.shape, test_df.shape)

#check if training and test set have different ids
print("Check Disjoint IDs:", set(train_df.issue_id).isdisjoint(set(test_df.issue_id)))





Train/Test sizes: (26183, 13) (6547, 13)
Check Disjoint IDs: True


In [17]:
#Saving the cleaned files to new directories in drive for furhter use

out_dir = "/content/drive/MyDrive/DuplicateBugsDetector/cleaned_files"
os.makedirs(out_dir, exist_ok=True)
train_df.to_csv(f"{out_dir}/train.csv", index=False)
test_df.to_csv(f"{out_dir}/test.csv", index=False)
bugs.to_csv(f"{out_dir}/all_bugs_clean.csv", index=False)

print("Saved files:")
print(f"- {out_dir}/train.csv")
print(f"- {out_dir}/test.csv")
print(f"- {out_dir}/all_bugs_clean.csv")

# quick peek
display(train_df.sample(3, random_state=0)[["project","issue_id","resolution","is_duplicate"]])
display(test_df.sample(3, random_state=1)[["project","issue_id","resolution","is_duplicate"]])

Saved files:
- /content/drive/MyDrive/DuplicateBugsDetector/cleaned_files/train.csv
- /content/drive/MyDrive/DuplicateBugsDetector/cleaned_files/test.csv
- /content/drive/MyDrive/DuplicateBugsDetector/cleaned_files/all_bugs_clean.csv


Unnamed: 0,project,issue_id,resolution,is_duplicate
25722,hadoop,13305616,fixed,False
18699,firefox,1857118,fixed,True
28853,hbase,13534978,unknown,False


Unnamed: 0,project,issue_id,resolution,is_duplicate
24333,firefox,1945644,worksforme,False
28461,hbase,13419137,fixed,False
20554,firefox,1889077,incomplete,False
