In [33]:
!pip install rank_bm25



In [34]:
import numpy as np
import pandas as pd

import requests
import json
import urllib.request
import time
import re
import nltk
nltk.download('punkt')

from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\comka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# For Complete Data with Duplicates

In [35]:
dataset = pd.read_csv('../../PrimaryDataset/firefox_preproccessed_whole_dataset.csv')

print('Dimensions of Dataset: ', dataset.shape)

#To Create a Copy
data = dataset.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data['Description1'] = data['Summary']+ data['Description']

#To add only required Columns
data = data[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data = data.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data.head()

Dimensions of Dataset:  (38290, 11)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1399936,Shield unenrolls users in active preference ex...,,FIXED
1,1387208,browser_as_load_location.js is permafailing on...,,FIXED
2,1368432,web.skype.com is inaccessible: SEC_ERROR_OCSP_...,1368433.0,DUPLICATE
3,1383741,Perma failure when 56 merge to beta in browser...,,FIXED
4,1388753,Bookmarks and history pages in Library sub men...,,FIXED


In [36]:
#To Tokenize Data
sentences = data['Description']
tokens = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [37]:
#Replacing Blank Values with 'NaN' Values
data.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data = data.reset_index(drop=True)

data['Description']=data['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data = data[data['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data.shape)

In [63]:
def getDuplicates(whole_data, data, tokens, bm25):
    count=0
    dup_indices_with_bug_id = {}
    similarity_scores_before = []
    similarity_scores = {}
    bugs = whole_data['Bug_Id']

    for i, row in data.iterrows():
        bug_id = row['Bug_Id']
        dup_bug_id = row['Duplicate_Bug_Ids']
    
        if dup_bug_id not in bugs.values:
            count+=1
            continue
        
        similarity_row = bm25.get_scores(tokens[i])
        argsort_similarity = np.argsort(similarity_row)[::-1][1:]
        #similarity_scores_before = (np.sort(similarity_row)[::-1][1:])
        dup_bug_index = list(bugs[bugs == dup_bug_id].index)[0]
        dup_ranking = np.argmax(argsort_similarity == np.int64(dup_bug_index)) 
        dup_indices_with_bug_id[bug_id] = dup_ranking
        
    return dup_indices_with_bug_id

In [39]:
def calculate_recall(dup_indices_with_bug_id):
    counts = {1: 0, 5: 0, 10: 0, 20: 0, 25: 0, 30: 0, 50: 0, 75: 0, 100: 0}

    for value in dup_indices_with_bug_id.values():
        for key in counts.keys():
            if(value < key):
                counts[key] += 1

    N = len(dup_indices_with_bug_id)
    
    for key in counts.keys():
        recall_rate = counts[key]/N
        print('Recall Rate at '+str(key)+': ', recall_rate)

In [None]:
#To get Duplicate Indices
dup_indices_with_bug_id = getDuplicates(data, data, tokens, bm25)

In [41]:
#To Calculate Recall
print('For Complete Dataset: ')
calculate_recall(dup_indices_with_bug_id)

For Complete Dataset: 
Recall Rate at 1:  0.16105082809822957
Recall Rate at 5:  0.27984009137635635
Recall Rate at 10:  0.33637921187892633
Recall Rate at 20:  0.39149057681324956
Recall Rate at 25:  0.4106225014277556
Recall Rate at 30:  0.4251856082238721
Recall Rate at 50:  0.46601941747572817
Recall Rate at 75:  0.5014277555682467
Recall Rate at 100:  0.5268418046830383


# For Textually Similar Data

In [64]:
dataset_sim = pd.read_csv('../../PreProcessedData/Firefox_final_sim.csv')

print('Dimensions of Dataset: ', dataset_sim.shape)

#To Create a Copy
data_sim = dataset_sim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_sim['Description1'] = data_sim['Summary']+ data_sim['Description']

#To add only required Columns
data_sim = data_sim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_sim = data_sim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_sim.head()

Dimensions of Dataset:  (1239, 14)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1369688,One-off search takes the initial value rather ...,1331736.0,DUPLICATE
1,1332106,[Deleted][Windows 10] Possible Cortana Search ...,1332118.0,DUPLICATE
2,1328104,Add Search Engine Button Does Not Workuser age...,1323525.0,DUPLICATE
3,1333599,URL Spoofing by using onbeforeunload and openi...,1481994.0,DUPLICATE
4,1351282,Cannot delete or edit invalid bookmarkcreated ...,1401401.0,DUPLICATE


In [65]:
#To Tokenize Data
sentences = data_sim['Description']
tokens_sim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_sim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [67]:
#Replacing Blank Values with 'NaN' Values
data_sim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_sim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_sim = data_sim.reset_index(drop=True)

data_sim['Description']=data_sim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_sim = data_sim[data_sim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_sim.shape)

In [None]:
#To get Duplicate Indices
dup_indices_with_bug_id_similar = getDuplicates(data, data_sim, tokens_sim, bm25)

In [69]:
#To Calculate Recall
print('For Textually Similar Dataset: ')
calculate_recall(dup_indices_with_bug_id_similar)

For Textually Similar Dataset: 
Recall Rate at 1:  0.11567164179104478
Recall Rate at 5:  0.208955223880597
Recall Rate at 10:  0.2667910447761194
Recall Rate at 20:  0.31716417910447764
Recall Rate at 25:  0.3358208955223881
Recall Rate at 30:  0.35634328358208955
Recall Rate at 50:  0.39552238805970147
Recall Rate at 75:  0.44029850746268656
Recall Rate at 100:  0.47201492537313433


# For Textually Dissimilar Data

In [70]:
dataset_dissim = pd.read_csv('../../PreProcessedData/Firefox_final_dis.csv')

print('Dimensions of Dataset: ', dataset_dissim.shape)

#To Create a Copy
data_dissim = dataset_dissim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_dissim['Description1'] = data_dissim['Summary']+ data_dissim['Description']

#To add only required Columns
data_dissim = data_dissim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_dissim = data_dissim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_dissim.head()

Dimensions of Dataset:  (1589, 14)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1357039,Should add the Customize Firefox tour in the o...,1357029.0,DUPLICATE
1,1359062,Increase contrast of the Downloads Indicator o...,1347543.0,DUPLICATE
2,1357056,Should not display the notification bar if use...,1357641.0,DUPLICATE
3,1357041,Should highlight the customize button in the h...,1357029.0,DUPLICATE
4,1378164,"Noisy debug in a debug build (of Thunderbird, ...",1377923.0,DUPLICATE


In [71]:
#To Tokenize Data
sentences = data_dissim['Description']
tokens_dissim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_dissim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [73]:
#Replacing Blank Values with 'NaN' Values
data_dissim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_dissim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_dissim = data_dissim.reset_index(drop=True)

data_dissim['Description']=data_dissim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_dissim = data_dissim[data_dissim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_dissim.shape)

In [None]:
#To get Duplicate Indices
dup_indices_with_bug_id_dissimilar = getDuplicates(data, data_dissim, tokens_dissim, bm25)

In [75]:
#To Calculate Recall
print('For Textually Dissimilar Dataset: ')
calculate_recall(dup_indices_with_bug_id_dissimilar)

For Textually Dissimilar Dataset: 
Recall Rate at 1:  0.20710784313725492
Recall Rate at 5:  0.34681372549019607
Recall Rate at 10:  0.4007352941176471
Recall Rate at 20:  0.4583333333333333
Recall Rate at 25:  0.47549019607843135
Recall Rate at 30:  0.4889705882352941
Recall Rate at 50:  0.5306372549019608
Recall Rate at 75:  0.5588235294117647
Recall Rate at 100:  0.5808823529411765
