In [1]:
!pip install rank_bm25



In [2]:
import numpy as np
import pandas as pd

import requests
import json
import urllib.request
import time
import re
import nltk
nltk.download('punkt')

from bs4 import BeautifulSoup
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\comka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# For Complete Data with Duplicates

In [3]:
dataset = pd.read_csv('../../PrimaryDataset/eclipse_preproccessed_whole_dataset.csv')

print('Dimensions of Dataset: ', dataset.shape)

#To Create a Copy
data = dataset.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data['Description1'] = data['Summary']+ data['Description']

#To add only required Columns
data = data[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data = data.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data.head()

Dimensions of Dataset:  (46316, 10)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,518088.0,search widget is not workingsearch widget is n...,,INVALID
1,546444.0,Bugcreated attachment eclipse hello world,,NOT_ECLIPSE
2,533893.0,AntCompareCVSDebugDocIDEIncubatorPMCRelengReso...,,INVALID
3,519449.0,Problem with KEY_NAMEin product pluginsection...,,FIXED
4,519450.0,Problem with KEY_NAMEin renametyperefactoring...,,FIXED


In [5]:
#To Tokenize Data
sentences = data['Description']
tokens = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [6]:
#Replacing Blank Values with 'NaN' Values
data.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data = data.reset_index(drop=True)

data['Description']=data['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data = data[data['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data.shape)

In [7]:
def getDuplicates(whole_data, data, tokens, bm25):
    count=0
    dup_indices_with_bug_id = {}
    similarity_scores_before = []
    similarity_scores = {}
    bugs = whole_data['Bug_Id']

    for i, row in data.iterrows():
        bug_id = row['Bug_Id']
        dup_bug_id = row['Duplicate_Bug_Ids']
    
        if dup_bug_id not in bugs.values:
            count+=1
            continue
        
        similarity_row = bm25.get_scores(tokens[i])
        argsort_similarity = np.argsort(similarity_row)[::-1][1:]
        #similarity_scores_before = (np.sort(similarity_row)[::-1][1:])
        dup_bug_index = list(bugs[bugs == dup_bug_id].index)[0]
        dup_ranking = np.argmax(argsort_similarity == np.int64(dup_bug_index)) 
        dup_indices_with_bug_id[bug_id] = dup_ranking
        
    return dup_indices_with_bug_id

In [8]:
def calculate_recall(dup_indices_with_bug_id):
    counts = {1: 0, 5: 0, 10: 0, 20: 0, 25: 0, 30: 0, 50: 0, 75: 0, 100: 0}

    for value in dup_indices_with_bug_id.values():
        for key in counts.keys():
            if(value < key):
                counts[key] += 1

    N = len(dup_indices_with_bug_id)
    
    for key in counts.keys():
        recall_rate = counts[key]/N
        print('Recall Rate at '+str(key)+': ', recall_rate)

In [9]:
#To get Duplicate Indices
dup_indices_with_bug_id = getDuplicates(data, data, tokens, bm25)

In [10]:
#To Calculate Recall
print('For Complete Dataset: ')
calculate_recall(dup_indices_with_bug_id)

For Complete Dataset: 
Recall Rate at 1:  0.007026789635485288
Recall Rate at 5:  0.00878348704435661
Recall Rate at 10:  0.00966183574879227
Recall Rate at 20:  0.010540184453227932
Recall Rate at 25:  0.011857707509881422
Recall Rate at 30:  0.011857707509881422
Recall Rate at 50:  0.013614404918752744
Recall Rate at 75:  0.015371102327624066
Recall Rate at 100:  0.015371102327624066


# For Textually Similar Data

In [11]:
dataset_sim = pd.read_csv('../../PreProcessedData/Eclipse/Eclipse_final_sim.csv')

print('Dimensions of Dataset: ', dataset_sim.shape)

#To Create a Copy
data_sim = dataset_sim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_sim['Description1'] = data_sim['Summary']+ data_sim['Description']

#To add only required Columns
data_sim = data_sim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_sim = data_sim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_sim.head()

Dimensions of Dataset:  (593, 13)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,526539.0,"Executing an ""ant"" build in a non java project...",522581.0,DUPLICATE
1,521057.0,Internal compiler error: java.lang.NullPointer...,517951.0,DUPLICATE
2,516277.0,Exception when launching servercreated attachm...,517672.0,DUPLICATE
3,515243.0,Oxygen installation via Eclipse Installer curr...,515213.0,DUPLICATE
4,510223.0,Error when trying to save using Java editorthi...,457065.0,DUPLICATE


In [13]:
#To Tokenize Data
sentences = data_sim['Description']
tokens_sim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_sim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [14]:
#Replacing Blank Values with 'NaN' Values
data_sim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_sim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_sim = data_sim.reset_index(drop=True)

data_sim['Description']=data_sim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_sim = data_sim[data_sim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_sim.shape)

In [15]:
#To get Duplicate Indices
dup_indices_with_bug_id_similar = getDuplicates(data, data_sim, tokens_sim, bm25)

In [16]:
#To Calculate Recall
print('For Textually Similar Dataset: ')
calculate_recall(dup_indices_with_bug_id_similar)

For Textually Similar Dataset: 
Recall Rate at 1:  0.011210762331838564
Recall Rate at 5:  0.013452914798206279
Recall Rate at 10:  0.01569506726457399
Recall Rate at 20:  0.020179372197309416
Recall Rate at 25:  0.020179372197309416
Recall Rate at 30:  0.020179372197309416
Recall Rate at 50:  0.02242152466367713
Recall Rate at 75:  0.02242152466367713
Recall Rate at 100:  0.02242152466367713


# For Textually Dissimilar Data

In [17]:
dataset_dissim = pd.read_csv('../../PreProcessedData/Eclipse/Eclipse_final_dis.csv')

print('Dimensions of Dataset: ', dataset_dissim.shape)

#To Create a Copy
data_dissim = dataset_dissim.copy()

#To add 'Summary' and 'Description' in a new Column 'Description1'
data_dissim['Description1'] = data_dissim['Summary']+ data_dissim['Description']

#To add only required Columns
data_dissim = data_dissim[['Bug ID', 'Description1', 'Duplicate_Bug_Ids', 'Resolution']]

#To Rename the Columns
data_dissim = data_dissim.rename(columns = {'Bug ID':'Bug_Id', 'Description1':'Description'})

#To show Data
data_dissim.head()

Dimensions of Dataset:  (723, 13)


Unnamed: 0,Bug_Id,Description,Duplicate_Bug_Ids,Resolution
0,576714.0,Upgrade org.apache.sshd:sshd-core to version 2...,574220.0,DUPLICATE
1,513150.0,[release] technology.collections 8.1.0we ll us...,513149.0,DUPLICATE
2,540216.0,[release] technology.app4mc 0.9.2we ll use thi...,539997.0,DUPLICATE
3,542667.0,[release] modeling.mmt.qvtd 2018.0.0we ll use ...,542532.0,DUPLICATE
4,542639.0,[release] iot.wakaama 1.0.0we ll use this bug ...,542638.0,DUPLICATE


In [19]:
#To Tokenize Data
sentences = data_dissim['Description']
tokens_dissim = []

for sentence in sentences:
    cleanedTex=re.sub(r'[^\w\s]','',str(sentence)).lower()
    words = (word_tokenize(cleanedTex))
    tokens_dissim.append(words)

#BM25
bm25 = BM25Okapi(tokens)

In [20]:
#Replacing Blank Values with 'NaN' Values
data_dissim.replace("", np.nan, inplace=True)

#Droping Data which has 'Nan' value for 'Description' Column
data_dissim.dropna(subset = ["Description"], inplace=True)

#To Reset the index
data_dissim = data_dissim.reset_index(drop=True)

data_dissim['Description']=data_dissim['Description'].values.astype('object')

#Filtering data based on 'Duplicate_Bug_Ids' Column to get Duplicate Bugs
# data_dissim = data_dissim[data_dissim['Duplicate_Bug_Ids'] > 0]
# print('Dimensions of Data with Duplicate Bug Ids: ', data_dissim.shape)

In [21]:
#To get Duplicate Indices
dup_indices_with_bug_id_dissimilar = getDuplicates(data, data_dissim, tokens_dissim, bm25)

In [22]:
#To Calculate Recall
print('For Textually Dissimilar Dataset: ')
calculate_recall(dup_indices_with_bug_id_dissimilar)

For Textually Dissimilar Dataset: 
Recall Rate at 1:  0.025735294117647058
Recall Rate at 5:  0.029411764705882353
Recall Rate at 10:  0.04044117647058824
Recall Rate at 20:  0.04411764705882353
Recall Rate at 25:  0.04779411764705882
Recall Rate at 30:  0.04779411764705882
Recall Rate at 50:  0.058823529411764705
Recall Rate at 75:  0.0625
Recall Rate at 100:  0.0625
