In [50]:
# For Installation (Required)
!pip install pyLDAvis
!pip install glove-python-binary

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
import string
import nltk
import gensim
import time
import pickle
import warnings
warnings.filterwarnings('ignore')

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

from gensim import corpora,models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models import CoherenceModel
from gensim.models import Word2Vec,FastText
from gensim.test.utils import get_tmpfile

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from glove import Glove
from glove import Corpus

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

from scipy.spatial import distance

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
#Import Whole Dataset
eclipse_data=pd.read_csv('/content/drive/Shareddrives/DSCI-644-Team-5/PrimaryDataset/eclipse_preproccessed_whole_dataset.csv')

#Cleaning the Data
eclipse_data = eclipse_data.reset_index()
eclipse_data = eclipse_data.drop(columns = ["index","Unnamed: 0"])
eclipse_data = eclipse_data.drop_duplicates()
eclipse_data = eclipse_data.dropna(axis=0, subset=['Bug ID'])

#To show Data
eclipse_data.head()

Unnamed: 0,Bug ID,Product,Component,Status,Resolution,Summary,Changed,Description,Duplicate_Bug_Ids
0,518088.0,Web Tools,Web Standard Tools,RESOLVED,INVALID,search widget is not working,10-06-2017 14:42,search widget is not working,
1,546444.0,z_Archived,PDT,CLOSED,NOT_ECLIPSE,Bug,14-05-2020 11:25,created attachment eclipse hello world,
2,533893.0,Platform,Website,RESOLVED,INVALID,AntCompareCVSDebugDocIDEIncubatorPMCRelengReso...,22-04-2018 12:23,created attachment error in website antcomp...,
3,519449.0,PDE,UI,VERIFIED,FIXED,Problem with KEY_NAME,03-08-2017 03:21,in product pluginsection recommendedbundles m...,
4,519450.0,JDT,UI,RESOLVED,FIXED,Problem with KEY_NAME,13-07-2017 09:43,in renametyperefactoring another type the tem...,


In [5]:
#To show Info
eclipse_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46273 entries, 0 to 46315
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Bug ID             46273 non-null  float64
 1   Product            46263 non-null  object 
 2   Component          46263 non-null  object 
 3   Status             46263 non-null  object 
 4   Resolution         46263 non-null  object 
 5   Summary            46263 non-null  object 
 6   Changed            46263 non-null  object 
 7   Description        46263 non-null  object 
 8   Duplicate_Bug_Ids  3238 non-null   float64
dtypes: float64(2), object(7)
memory usage: 3.5+ MB


# **Cleaning and Preprocessing**

In [6]:
#Text Cleaning to Remove Punctuations
def clean_text_round_1(text):
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'\w*\f\w*', '', text)
    text = re.sub(r'\(.*?\)', '', text)
    text = re.sub(r'\[.*]\)', '', text)
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    return text

round1 = lambda x: clean_text_round_1(x)

#Text Cleaning to Remove Additional Punctuations
def clean_text_round_2(text):
    text = re.sub(r'[‘’“”…]', '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\t', '', text)
    return text

round2 = lambda x: clean_text_round_2(x)

In [7]:
#To Clean Data in 'Description' Column
eclipse_data["Description"]= eclipse_data["Description"].str.replace("fixed in HEAD", "", case = False)
eclipse_data["Description"]= eclipse_data["Description"].str.replace("has been marked as readonly", " ", case = False)

eclipse_data = eclipse_data.dropna(axis=0, subset=['Description'])

eclipse_data['Description'] = eclipse_data['Description'].apply(clean_text_round_1)
eclipse_data['Description'] = eclipse_data['Description'].apply(clean_text_round_2)

#To show Info
eclipse_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46263 entries, 0 to 46315
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Bug ID             46263 non-null  float64
 1   Product            46263 non-null  object 
 2   Component          46263 non-null  object 
 3   Status             46263 non-null  object 
 4   Resolution         46263 non-null  object 
 5   Summary            46263 non-null  object 
 6   Changed            46263 non-null  object 
 7   Description        46263 non-null  object 
 8   Duplicate_Bug_Ids  3238 non-null   float64
dtypes: float64(2), object(7)
memory usage: 3.5+ MB


In [8]:
#Helper Functions for Preprocessing
def lemmatize(text):
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 5:
            result.append(lemmatize(token))
    return result

In [9]:
#To Preprocess Data in 'Description' Column
eclipse_data['Description'] = eclipse_data['Description'].map(preprocess)

print('Null Duplicate Bug Ids: ',eclipse_data['Duplicate_Bug_Ids'].isnull().sum())

#To save Duplicate Reports in a CSV File
duplicate_reports = eclipse_data.dropna(axis=0, subset=['Duplicate_Bug_Ids'])
duplicate_reports.reset_index(drop=True)
duplicate_reports.to_csv('eclipse_duplicate_reports.csv')

#Seperating all the master reports into a dataframe
master_reports = eclipse_data[eclipse_data.isnull().any(axis=1)]
master_reports.reset_index(drop=True)

print('NA Values in Master Report: ', master_reports.Description.isna().sum())

#To save Master Reports in a CSV File
master_reports.to_csv('eclipse_master_reports.csv')

Null Duplicate Bug Ids:  43025
NA Values in Master Report:  0


In [10]:
#Import Master Reports
master_reports = pd.read_csv('eclipse_master_reports.csv')
master_reports = master_reports.drop(columns=['Unnamed: 0'])

#To Preprocess Data in 'Description' Column
master_reports['Description'] = master_reports['Description'].map(preprocess)

In [11]:
#To Create a Dictionary
dictionary = gensim.corpora.Dictionary(master_reports['Description'])
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#To Create BoW a Dictionary
bow_corpus = [dictionary.doc2bow(doc) for doc in master_reports['Description']]

In [12]:
#To Open Pickle File
file_bow = open('eclipse_bow_corpus.pickle', 'wb')

#To dump BoW data in Pickle File
pickle.dump(bow_corpus, file_bow)

#To Open Pickle File
file_dict = open('eclipse_dictionary.pickle', 'wb')

#To dump Dictionary data in Pickle File
pickle.dump(dictionary, file_dict)

In [15]:
#Parameters for LDA Model
corpus = bow_corpus
no_of_topics = 10
dictionary = dictionary
p = 20
k = 2
epochs = 100

#Training the LDA model on the BoW corpus
lda_model = gensim.models.LdaMulticore(corpus, num_topics=no_of_topics, id2word=dictionary, passes=p, workers=k, iterations=epochs)

In [16]:
#Saving the Model
lda_model.save('eclipse_lda_model.model')

In [17]:
#Evaluation of Model

#Perplexity
print('Perplexity: ', lda_model.log_perplexity(bow_corpus))  

#Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=eclipse_data['Description'], dictionary=dictionary, coherence='c_v')
print('\nCoherence Score: ', coherence_model_lda.get_coherence())

Perplexity:  -5.685573833701201

Coherence Score:  0.5700483061364353


In [18]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)

In [19]:
#Visualization
vis

In [20]:
#Creating 10 empty clusters and pushing master reports in each of them based on topic modeling and saving them in individual csv file
for c in range(10):
    exec('topic_{} = pd.DataFrame()'.format(c))
    for i in range(len(master_reports)):
        topic=lda_model[dictionary.doc2bow(master_reports.Description[i])]
        topic= np.asarray(topic)
        if int(topic[np.argmax(topic[:,1]),0])== c:
            exec('topic_{} = topic_{}.append(master_reports.loc[[i]])'.format(c,c))
            exec('topic_{} = topic_{}.reset_index(drop=True)'.format(c,c))
            exec('topic_{}.to_csv("topic_{}.csv")'.format(c,c))

In [22]:
#To Open Pickle File
file_bow = open('eclipse_bow_corpus.pickle', 'rb')
bow_corpus = pickle.load(file_bow)

file_dict = open('eclipse_dictionary.pickle', 'rb')
dictionary=pickle.load(file_dict)

#To load Trained Model
lda_model =  models.LdaModel.load('eclipse_lda_model.model')

In [23]:
#To Create corpus for Word2Vec and FastText models
for i in range(10):
    exec('sent_{} = []'.format(i))
    exec('x= topic_{}'.format(i))
    for j in range(len(x)):
        exec('sent_{}.append(topic_{}.Description[{}])'.format(i,i,j))

for sent in range(10):
    exec('print(len(sent_{}))'.format(sent))

2328
5275
7437
8040
4685
2858
5738
2095
3227
1330


In [50]:
#Training GloVe model for each cluster
for cluster in range(10):
    vector_size = 100
    exec('glove_corpus{}=Corpus()'.format(cluster, cluster)) 
    exec('glove_corpus{}.fit(sent_{})'.format(cluster, cluster))
    exec('glove{}= Glove(no_components=vector_size, learning_rate=0.18, alpha=0.75, max_count=100, max_loss=10.0, random_state=None)'.format(cluster, cluster))
    exec('glove{}.fit(glove_corpus{}.matrix, epochs=200, no_threads=3, verbose=True)'.format(cluster, cluster))
    exec('transformer = lambda dictionary2:glove{}.transform_paragraph(words, epochs=1000,ignore_missing=False)'.format(cluster, cluster))
    exec('glove{}.add_dictionary(glove_corpus{}.dictionary)'.format(cluster, cluster))

    #Save the all the models in individual file
    exec('path = get_tmpfile("glove{}.model")'.format(cluster))
    exec('glove{}.save("glove{}.model")'.format(cluster, cluster))

# **Classification**

In [25]:
#To Import all the Clusters created using LDA based Topic Modeling
for c in range(10):
    exec('topic_{} = pd.read_csv("topic_{}.csv")'.format(c,c))
    exec("topic_{}= topic_{}.drop(columns=['Unnamed: 0'])".format(c,c))
    exec("topic_{}['Description'] = topic_{}['Description'].map(preprocess)".format(c,c))

In [26]:
#To Import all the trained GloVe models
for mod in range(10):  
    exec('glove{} = Glove.load("glove{}.model")'.format(mod, mod))

In [27]:
#This will return the index of cluster in which the master report of duplicate report may reside
def sim_with_clusters_lda_topn(DR, n):
    vec_bow = dictionary.doc2bow(DR)
    x= lda_model[vec_bow]
    topic = np.asarray(x)
    sim=[]
    x= topic[np.argsort(topic[:,1])[-n:][::-1],0]

    for i in range(len(x)):
        sim.append(int(x[i]))

    return sim

#To get Similarity between two feature vectors using the average of Cosine Similarity & Euclidean Similarity
def sim(vec1, vec2): 
    sim1 = 1/(1+np.linalg.norm(np.array(vec1) - np.array(vec2)))
    sim2 = cosine_similarity(vec1, vec2)
    sim=(sim1+sim2)/2 
    return sim

# **LDA and GLOVE**

In [28]:
#Returns Top-N Master Reports
def compare_topn(model, cluster, sent, DR, topn, modal):
    similarity=[]
    
    if (modal == 'single'):
        vec_duplicate, master= feature_vectors_single_modality(DR, sent, model)
    else:
        raise ValueError('Invalid Modality entered')

    for doc in range(len(master)):
        vec_master = master[doc]
        vec_master= [vec_master]
        unified_sim = sim(vec_duplicate, vec_master)
        similarity.append(unified_sim)
    
    similarity = np.asarray(similarity)
    similarity= np.concatenate(similarity, axis=0 )
    similarity= np.concatenate(similarity, axis=0 )
    max_similar_reports=similarity.argsort()[-topn:][::-1]

    return(max_similar_reports)

In [29]:
# creation of feature vectors by singlemodality feature extraction
def feature_vectors_single_modality(DR, corpus, model):
    master = averaged_word_vectorizer_glove(corpus=sent, model=model, num_features=100)

    vec_duplicate = averaged_word_vectorizer_glove(corpus=DR, model=model, num_features=100)

    vec_duplicate = [vec_duplicate]

    return vec_duplicate, master

def averaged_word_vectorizer_glove(corpus, model, num_features):
    vocabulary = set(model.dictionary)
    if(any(isinstance(i, list) for i in corpus)):
        features = [average_word_vectors_glove(tokenized_sentence, model, vocabulary, num_features)
                      for tokenized_sentence in corpus]
        return np.array(features)
    else:
          features = average_word_vectors_glove(corpus, model, vocabulary, num_features)
    return np.array(features)

def average_word_vectors_glove(words, model, vocabulary, num_features):  
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.  

    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.word_vectors[model.dictionary[word]])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector

# Complete Data

Determining the top-n values for the Recall Rate @ k

In [30]:
#Import Duplicate Reports
test = pd.read_csv('eclipse_duplicate_reports.csv')
test = test.drop(columns=['Unnamed: 0'])
test['Description']= test['Description'].fillna('').astype(str).map(preprocess)
test = test.rename(columns={'Bug ID':'Bug_ID'})

#To Decide the Iterations
number_of_samples = test.shape[0]

if(number_of_samples > 200):
  number_of_samples = 200

In [31]:
#Evaluating the performance by Recall Rate
def evaluate(vec_acc, t2, t1):
  sum = 0
  for i,num in enumerate(vec_acc):
      sum = sum + int(num)
  recall_rate = (sum/len(vec_acc))*100
  print("Recall Rate : {} %".format(recall_rate))
  print("Time : ", (t2-t1)/60, "min")

In [32]:
#For k=1
vec_acc_top1=[]
t1_top1 = time.time()

for i in range(number_of_samples):
    print(f'\rRunning sample {i}', end='')
    sample = test.Description[i] 
    n = 1
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max)) 
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1  
        modal = 'single'                                 
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top1 = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top1.append("0")
    else:
        vec_acc_top1.append("1")

Running sample 199

In [33]:
#For k=5
vec_acc_top5=[]
t1_top5 = time.time()

for i in range(number_of_samples):
    print(f'\rRunning sample {i}', end='')
    sample = test.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max)) 
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1 
        modal = 'single'                                 
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top5 = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top5.append("0")
    else:
        vec_acc_top5.append("1")

Running sample 199

In [34]:
#For k=10
vec_acc_top10=[]
t1_top10 = time.time()

for i in range(number_of_samples):
    print(f'\rRunning sample {i}', end='')
    sample = test.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max)) 
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 3  
        modal = 'single'                                 
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top10 = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top10.append("0")
    else:
        vec_acc_top10.append("1")

Running sample 199

In [35]:
#For k=100
vec_acc_top100=[]
t1_top100 = time.time()

for i in range(number_of_samples):
    print(f'\rRunning sample {i}', end='')
    sample = test.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max)) 
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 33   
        modal = 'single'                                 
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top100 = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top100.append("0")
    else:
        vec_acc_top100.append("1")

Running sample 199

In [36]:
print('For k=1')
evaluate(vec_acc_top1, t2_top1, t1_top1)

print('\nFor k=5')
evaluate(vec_acc_top5, t2_top5, t1_top5)

print('\nFor k=10')
evaluate(vec_acc_top10, t2_top10, t1_top10)

print('\nFor k=100')
evaluate(vec_acc_top100, t2_top100, t1_top100)

For k=1
Recall Rate : 0.0 %
Time :  7.7113856196403505 min

For k=5
Recall Rate : 7.5 %
Time :  21.24055993159612 min

For k=10
Recall Rate : 9.5 %
Time :  21.13537961244583 min

For k=100
Recall Rate : 15.5 %
Time :  21.02414633433024 min


# Textually Similar Data

Determining the top-n values for the Recall Rate @ k

In [38]:
#Import Textually Similar Data
test_sim = pd.read_csv('/content/drive/Shareddrives/DSCI-644-Team-5/PreProcessedData/eclipse_final_sim.csv')

#Cleaning the Data
test_sim = test_sim.drop(columns=['Unnamed: 0'])
test_sim['Description']= test_sim['Description'].fillna('').astype(str).map(preprocess)
test_sim.replace("", np.nan, inplace=True)
test_sim.dropna(subset = ["Duplicate_Bug_Ids"], inplace=True)
test_sim = test_sim.reset_index(drop=True)

#To Decide the Iterations
number_of_samples_sim = test_sim.shape[0]

if(number_of_samples_sim > 200):
  number_of_samples_sim = 200

In [39]:
#For k=1
vec_acc_top1_sim=[]
t1_top1_sim = time.time()

for i in range(number_of_samples_sim):
    print(f'\rRunning sample {i}', end='')
    sample = test_sim.Description[i] 
    n = 1
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max)) 
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))
      
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1   
        modal = 'single'                                 
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top1_sim = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_sim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top1_sim.append("0")
    else:
        vec_acc_top1_sim.append("1")

Running sample 199

In [40]:
#For k=5
vec_acc_top5_sim=[]
t1_top5_sim = time.time()

for i in range(number_of_samples_sim):
    print(f'\rRunning sample {i}', end='')
    sample = test_sim.Description[i]
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))             
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))     
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1           
        modal = 'single'        
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top5_sim = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_sim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top5_sim.append("0")
    else:
        vec_acc_top5_sim.append("1")

Running sample 199

In [41]:
#For k=10
vec_acc_top10_sim=[]
t1_top10_sim = time.time()

for i in range(number_of_samples_sim):
    print(f'\rRunning sample {i}', end='')
    sample = test_sim.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))              
        exec('model = glove{}'.format(max))               
        exec('sent = topic_{}.Description'.format(max))    
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 3          
        modal = 'single'                                    
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top10_sim = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_sim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top10_sim.append("0")
    else:
        vec_acc_top10_sim.append("1")

Running sample 199

In [42]:
#For k=100
vec_acc_top100_sim=[]
t1_top100_sim = time.time()

for i in range(number_of_samples_sim):
    print(f'\rRunning sample {i}', end='')
    sample = test_sim.Description[i]
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))              
        exec('model = glove{}'.format(max))               
        exec('sent = topic_{}.Description'.format(max))     
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 33          
        modal = 'single'    

        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top100_sim = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_sim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top100_sim.append("0")
    else:
        vec_acc_top100_sim.append("1")

Running sample 199

In [43]:
print('For k=1')
evaluate(vec_acc_top1_sim, t2_top1_sim, t1_top1_sim)

print('\nFor k=5')
evaluate(vec_acc_top5_sim, t2_top5_sim, t1_top5_sim)

print('\nFor k=10')
evaluate(vec_acc_top10_sim, t2_top10_sim, t1_top10_sim)

print('\nFor k=100')
evaluate(vec_acc_top100_sim, t2_top100_sim, t1_top100_sim)

For k=1
Recall Rate : 0.0 %
Time :  6.163674660523733 min

For k=5
Recall Rate : 9.0 %
Time :  19.13568847179413 min

For k=10
Recall Rate : 13.5 %
Time :  19.014440707365672 min

For k=100
Recall Rate : 20.0 %
Time :  19.117087312539418 min


# Textually Dissimilar Data

Determining the top-n values for the Recall Rate @ k

In [44]:
#Import Textually Similar Data
test_dissim = pd.read_csv('/content/drive/Shareddrives/DSCI-644-Team-5/PreProcessedData/eclipse_final_dis.csv')

#Cleaning the Data
test_dissim = test_dissim.drop(columns=['Unnamed: 0'])
test_dissim['Description']= test_dissim['Description'].fillna('').astype(str).map(preprocess)
test_dissim.replace("", np.nan, inplace=True)
test_dissim.dropna(subset = ["Duplicate_Bug_Ids"], inplace=True)
test_dissim = test_dissim.reset_index(drop=True)

#To Decide the Iterations
number_of_samples_dis = test_dissim.shape[0]

if(number_of_samples_dis > 200):
  number_of_samples_dis = 200

In [45]:
#For k=1
vec_acc_top1_dis=[]
t1_top1_dis = time.time()

for i in range(number_of_samples_dis):
    print(f'\rRunning sample {i}', end='')
    sample = test_dissim.Description[i] 
    n = 1
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]
    
    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))              
        exec('model = glove{}'.format(max))               
        exec('sent = topic_{}.Description'.format(max))     
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1              
        modal = 'single'                                    
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top1_dis = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_dissim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top1_dis.append("0")
    else:
        vec_acc_top1_dis.append("1")

Running sample 199

In [46]:
#For k=5
vec_acc_top5_dis=[]
t1_top5_dis = time.time()

for i in range(number_of_samples_dis):
    print(f'\rRunning sample {i}', end='')
    sample = test_dissim.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))                       
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))     
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 1          
        modal = 'single'                                    
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top5_dis = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_dissim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top5_dis.append("0")
    else:
        vec_acc_top5_dis.append("1")

Running sample 199

In [47]:
#For k=10
vec_acc_top10_dis=[]
t1_top10_dis = time.time()

for i in range(number_of_samples_dis):
    print(f'\rRunning sample {i}', end='')
    sample = test_dissim.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))                         
        exec('model = glove{}'.format(max))                
        exec('sent = topic_{}.Description'.format(max))     
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 3      
        modal = 'single'                                  
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top10_dis = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_dissim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top10_dis.append("0")
    else:
        vec_acc_top10_dis.append("1")

Running sample 199

In [48]:
#For k=100
vec_acc_top100_dis=[]
t1_top100_dis = time.time()

for i in range(number_of_samples_dis):
    print(f'\rRunning sample {i}', end='')
    sample = test_dissim.Description[i] 
    n = 3
    max_cluster =sim_with_clusters_lda_topn(sample, n)
    v=[]

    for max in max_cluster:
        exec('cluster = topic_{}'.format(max))                
        exec('model = glove{}'.format(max))               
        exec('sent = topic_{}.Description'.format(max))    
        
        cluster = cluster.rename(columns={'Bug ID':'Bug_ID'})
        topn = 33         
        modal = 'single'                                   
        
        #This will return the Top-N predicted master reports
        max_sim = compare_topn(model, cluster, sent, sample, topn, modal)
        t2_top100_dis = time.time()

        #Comparing the predicted value to the ground truth
        for num in max_sim:
            if (cluster.Bug_ID[num] == test_dissim.Duplicate_Bug_Ids[i]):
                v.append("1")
            else:
                v.append("0")

    if(all(x==v[0] for x in v)):
        vec_acc_top100_dis.append("0")
    else:
        vec_acc_top100_dis.append("1")

Running sample 199

In [49]:
print('For k=1')
evaluate(vec_acc_top1_dis, t2_top1_dis, t1_top1_dis)

print('\nFor k=5')
evaluate(vec_acc_top5_dis, t2_top5_dis, t1_top5_dis)

print('\nFor k=10')
evaluate(vec_acc_top10_dis, t2_top10_dis, t1_top10_dis)

print('\nFor k=100')
evaluate(vec_acc_top100_dis, t2_top100_dis, t1_top100_dis)

For k=1
Recall Rate : 0.0 %
Time :  8.604792694250742 min

For k=5
Recall Rate : 5.5 %
Time :  23.08441285689672 min

For k=10
Recall Rate : 8.5 %
Time :  23.032829546928404 min

For k=100
Recall Rate : 15.5 %
Time :  22.9544872601827 min
