In [144]:
import numpy as np
import pandas as pd

import requests
import json
import urllib.request
import time
import re
import nltk
nltk.download('punkt')

from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\comka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [145]:
dataset = pd.read_csv('../../PrimaryDataset/mobile_preproccessed_whole_dataset.csv')

print('Dimensions of Dataset: ', dataset.shape)

#To Create a Copy
data = dataset.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data['Description1'] = data['Summary']+ data['Description']

#To add only required Columns
data = data[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data = data.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data.head()

Dimensions of Dataset:  (5320, 12)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1668376,Test failure in dom/base/test/test_window_clos...,,FIXED
1,1631754,PanZoomController erroneously returns INPUT_RE...,,FIXED
2,1677838,NumberFormatExceptionjava.lang.Integer in pars...,,FIXED
3,1686100,Route Service.onTrimMemory notifications throu...,,FIXED
4,1630229,Crash in [@ java.lang.AssertionError: at org.m...,,FIXED


In [146]:
#To Tokenize Data
sentences = data['Description']
tokens = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [147]:
#Replacing Blank Values with 'NaN' Values
data.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data = data.reset_index(drop=True)

data['Description']=data['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data = data[data['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data.shape)

In [148]:
def getDuplicates(whole_data, data, tokens, bm25):
    count=0
    dup_indices_with_bug_id = {}
    bugs = whole_data['Bug_Id']

    for i, row in data.iterrows():
        bug_id = row['Bug_Id']
        dup_bug_id = row['Duplicate_Bug_Ids']
    
        if dup_bug_id not in bugs.values:
            count+=1
            continue
        
        similarity_row = bm25.get_scores(tokens[i])
        argsort_similarity = np.argsort(similarity_row)[::-1][1:]
        dup_bug_index = list(bugs[bugs == dup_bug_id].index)[0]
        dup_ranking = np.argmax(argsort_similarity == np.int64(dup_bug_index)) 
        dup_indices_with_bug_id[bug_id] = dup_ranking
        
    return dup_indices_with_bug_id

In [149]:
def calculate_recall(dup_indices_with_bug_id):
    counts = {1: 0, 5: 0, 10: 0, 20: 0, 25: 0, 30: 0, 50: 0, 75: 0, 100: 0}

    for value in dup_indices_with_bug_id.values():
        for key in counts.keys():
            if(value < key):
                counts[key] += 1

    N = len(dup_indices_with_bug_id)
    
    for key in counts.keys():
        recall_rate = counts[key]/N
        print('Recall Rate at '+str(key)+': ', recall_rate)

In [150]:
#To get Duplicate Indices
dup_indices_with_bug_id = getDuplicates(data, data, tokens, bm25)

In [151]:
#To Calculate Recall
print('For Complete Dataset: ')
calculate_recall(dup_indices_with_bug_id)

For Complete Dataset: 
Recall Rate at 1:  0.17251461988304093
Recall Rate at 5:  0.2894736842105263
Recall Rate at 10:  0.3538011695906433
Recall Rate at 20:  0.4239766081871345
Recall Rate at 25:  0.4502923976608187
Recall Rate at 30:  0.4678362573099415
Recall Rate at 50:  0.5087719298245614
Recall Rate at 75:  0.5555555555555556
Recall Rate at 100:  0.5760233918128655


# For Textually Similar Data

In [152]:
dataset_sim = pd.read_csv('../../PreProcessedData/Mobile_final_sim.csv')

print('Dimensions of Dataset: ', dataset_sim.shape)

#To Create a Copy
data_sim = dataset_sim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_sim['Description1'] = data_sim['Summary']+ data_sim['Description']

#To add only required Columns
data_sim = data_sim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_sim = data_sim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_sim.head()

Dimensions of Dataset:  (122, 15)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1536820,Intermittent org.mozilla.geckoview.test.WebExt...,1535913.0,DUPLICATE
1,1466482,Intermittent org.mozilla.geckoview.test.GeckoS...,1464351.0,DUPLICATE
2,1677708,Intermittent org.mozilla.geckoview.test.PanZoo...,1678895.0,DUPLICATE
3,1703350,Intermittent org.mozilla.geckoview.test.TextIn...,1681261.0,DUPLICATE
4,1574141,Intermittent org.mozilla.geckoview.test.Conten...,1564920.0,DUPLICATE


In [153]:
#To Tokenize Data
sentences = data_sim['Description']
tokens_sim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_sim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [154]:
#Replacing Blank Values with 'NaN' Values
data_sim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_sim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_sim = data_sim.reset_index(drop=True)

data_sim['Description']=data_sim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_sim = data_sim[data_sim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_sim.shape)

In [155]:
#To get Duplicate Indices
dup_indices_with_bug_id_similar = getDuplicates(data, data_sim, tokens_sim, bm25)

In [156]:
#To Calculate Recall
print('For Textually Similar Dataset: ')
calculate_recall(dup_indices_with_bug_id_similar)

For Textually Similar Dataset: 
Recall Rate at 1:  0.22
Recall Rate at 5:  0.44
Recall Rate at 10:  0.48
Recall Rate at 20:  0.58
Recall Rate at 25:  0.6
Recall Rate at 30:  0.64
Recall Rate at 50:  0.7
Recall Rate at 75:  0.78
Recall Rate at 100:  0.78


# For Textually Dissimilar Data

In [157]:
dataset_dissim = pd.read_csv('../../PreProcessedData/Mobile_final_dis.csv')

print('Dimensions of Dataset: ', dataset_dissim.shape)

#To Create a Copy
data_dissim = dataset_dissim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_dissim['Description1'] = data_dissim['Summary']+ data_dissim['Description']

#To add only required Columns
data_dissim = data_dissim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_dissim = data_dissim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_dissim.head()

Dimensions of Dataset:  (131, 15)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,1655196,High ratio of ERROR_NO_MINIDUMP_HEADERtalking ...,1644486.0,DUPLICATE
1,1482876,"Storage crash: lastInsertedRowID, 32-bit only,...",1482487.0,DUPLICATE
2,1331985,Crash: [UIApplication _cachedSystemAnimationFe...,1355440.0,DUPLICATE
3,1494388,Remaining mochitest and reftest failures in Te...,1460411.0,DUPLICATE
4,1568667,Extend ContentBlocking API to support Social T...,1568295.0,DUPLICATE


In [158]:
#To Tokenize Data
sentences = data_dissim['Description']
tokens_dissim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_dissim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [159]:
len(tokens_dissim)

131

In [160]:
#Replacing Blank Values with 'NaN' Values
data_dissim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_dissim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_dissim = data_dissim.reset_index(drop=True)

data_dissim['Description']=data_dissim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_dissim = data_dissim[data_dissim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_dissim.shape)

In [161]:
#To get Duplicate Indices
dup_indices_with_bug_id_dissimilar = getDuplicates(data, data_dissim, tokens_dissim, bm25)

In [162]:
#To Calculate Recall
print('For Textually Dissimilar Dataset: ')
calculate_recall(dup_indices_with_bug_id_dissimilar)

For Textually Dissimilar Dataset: 
Recall Rate at 1:  0.15730337078651685
Recall Rate at 5:  0.2808988764044944
Recall Rate at 10:  0.3595505617977528
Recall Rate at 20:  0.43820224719101125
Recall Rate at 25:  0.449438202247191
Recall Rate at 30:  0.47191011235955055
Recall Rate at 50:  0.5168539325842697
Recall Rate at 75:  0.5617977528089888
Recall Rate at 100:  0.5955056179775281
