In [1]:
#Needed Modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
import re

import os
import sys

from tqdm import tqdm

import json
from pathlib import Path

import matplotlib.pyplot as plt

# Marker in the cell output showing that the import cell ran to this point.
print("Starting Notebook.")

# Global seaborn appearance for the whole notebook: fonts scaled by 1.25,
# then the "whitegrid" style applied on top of the defaults set by sns.set().
sns.set(font_scale = 1.25)
sns.set_style("whitegrid")

Starting Notebook.


## Loading Data

In [2]:
# Issue-tracker sources to analyse. The lower-cased name is used to build the
# crawl and result file names further down in the notebook.
SOURCES = [
    'Apache',
    'Hyperledger',
    'IntelDAOS',
    'JFrog',
    'Jira',
    'JiraEcosystem',
    'MariaDB',
    'MongoDB',
    'Qt',
    'RedHat',
    'Sakai',
    'SecondLife',
    'Sonatype',
    'Spring',
    # 'Mindville'  (currently left out)
]

# Configuration tokens that appear in the model result file names.
CONFIG = [
    'R_LTvNL',
    'R_LTvNLOL',
    'R_LTOLvNL',
]

# Link-type token that appears in the model result file names.
LT = 'Duplication'

In [3]:
def print_linktypes(SOURCE):
    """Compute summary counts for one source's crawled issues and links.

    Despite the name nothing is printed; the values are returned.

    Reads two ';'-separated CSV files whose names are derived from the
    lower-cased ``SOURCE``:
    ``../../data/crawl/issues_<source>.csv`` (needs an ``issue_id`` column) and
    ``../../data/crawl/clean_links_<source>.csv`` (first column is the index;
    needs ``issue_id_1``, ``issue_id_2`` and ``linktype`` columns).

    Returns a 5-tuple:
        number of issue rows,
        number of link rows,
        number of distinct ``linktype`` values,
        distinct ids appearing on either side of a link divided by the number
        of distinct issue ids, rounded to 3 decimals,
        number of link rows whose ``linktype`` equals 'Duplicate'.
    """
    # Issues of this source.
    issues_path = '../../data/crawl/issues_'+SOURCE.lower()+'.csv'
    issues = pd.read_csv(issues_path, sep=';', encoding="UTF-8", low_memory=False)
    known_issue_ids = set(issues['issue_id'])

    # Cleaned links of this source.
    links_path = '../../data/crawl/clean_links_'+SOURCE.lower()+'.csv'
    links = pd.read_csv(links_path, sep=';', index_col=0, encoding="UTF-8", low_memory=False)

    # Every id that occurs as either endpoint of a link.
    linked_issue_ids = set(links['issue_id_1']) | set(links['issue_id_2'])
    linked_share = round(len(linked_issue_ids)/len(known_issue_ids), 3)

    duplicate_links = links[links['linktype']=='Duplicate']
    distinct_linktypes = links.linktype.unique()

    return (
        len(issues),
        len(links),
        len(distinct_linktypes),
        linked_share,
        len(duplicate_links),
    )

In [4]:
# Build the per-project overview table.
# All rows are collected first and the frame is constructed once, so pandas
# infers a dtype per column and the frame is not grown row by row.
OVERVIEW_COLUMNS = ['Project', '#Issues', '#Links', '#Linktypes', '%IssuesWithLinks', '#NumDups']

# Shortened labels used in the table instead of the full source name.
PROJECT_LABELS = {'JiraEcosystem': 'JiraEco.'}

overview_rows = []
for source in SOURCES:
    # print_linktypes returns (#issues, #links, #linktypes, share linked, #duplicates).
    stats = print_linktypes(source)
    overview_rows.append([PROJECT_LABELS.get(source, source), *stats])

overview = pd.DataFrame(overview_rows, columns=OVERVIEW_COLUMNS)

In [5]:
# Display the per-project overview table built in the previous cell.
overview

Unnamed: 0,Project,#Issues,#Links,#Linktypes,%IssuesWithLinks,#NumDups
0,Apache,1014926,255767,22,0.285,25925
1,Hyperledger,28146,16304,8,0.549,638
2,IntelDAOS,9474,2599,13,0.308,252
3,JFrog,15535,3229,11,0.286,643
4,Jira,274545,99819,19,0.467,21685
5,JiraEco.,41866,11398,20,0.33,1741
6,MariaDB,31229,14618,8,0.445,1374
7,MongoDB,137172,63821,15,0.452,8587
8,Qt,148579,40105,12,0.302,4243
9,RedHat,353000,119669,21,0.392,5913


## Loading Model Results

In [6]:
# Collect the projects whose metrics files contain no NaN in any configuration.
valid_projects = []

for s in SOURCES:
    # Only MariaDB is examined here; every other source is skipped.
    if s != "MariaDB":
        continue

    valid = True
    for c in CONFIG:
        filename = 'results/sccnn_'+s.lower()+'_'+LT+'_'+c+'_metrics.csv'
        metrics_df = pd.read_csv(filename, low_memory=False, encoding="UTF-8")

        # Rows 0, 1 and 2 of the metrics file, each without its first column.
        LT_mets, NL_mets, OL_mets = (
            metrics_df.iloc[row].values.tolist()[1:] for row in range(3)
        )

        # A single NaN in any of the three rows disqualifies the project;
        # once disqualified, later files are still read but no longer checked.
        if valid and np.isnan(LT_mets + OL_mets + NL_mets).any():
            valid = False

    if valid:
        valid_projects.append(s)

In [7]:
# Display the projects that passed the NaN check in the previous cell.
valid_projects

['MariaDB']

In [8]:
def get_tpfptnfn(conf_mat):
    """Extract the four binary confusion counts from a labelled confusion matrix.

    ``conf_mat`` must have rows labelled "DUPLICATION" and "NON-LINKS" and
    entries reachable under the labels 0 and 1 within each row. Any other rows
    are ignored. Label 1 is counted as the positive (duplicate) outcome and
    label 0 as the negative one.

    Returns the tuple ``(tp, fp, tn, fn)``.
    """
    duplicate_row = conf_mat.loc["DUPLICATION"]
    non_link_row = conf_mat.loc["NON-LINKS"]

    fn, tp = duplicate_row[0], duplicate_row[1]
    tn, fp = non_link_row[0], non_link_row[1]

    return tp, fp, tn, fn

In [9]:
def get_results(c, trad):
    """Score the stored confusion matrices of one configuration.

    For every project in the module-level ``valid_projects`` list the file
    ``results/sccnn_latenight_<project>_<LT>_<c>_confmat.csv`` is read (``LT``
    is the module-level link-type token). Its rows are relabelled by position
    as DUPLICATION, OTHER-LINKS, NON-LINKS and its columns as 0 and 1, with
    column 1 counted as the positive (duplicate) outcome by ``get_tpfptnfn``.
    The means over all projects are printed; the per-project values are
    returned.

    NOTE(review): the validity check elsewhere in this notebook reads
    ``results/sccnn_<project>_..._metrics.csv`` (no ``latenight`` in the
    name) -- confirm both file sets belong to the same run.

    Parameters
    ----------
    c : str
        Configuration token used in the file name. ``'R_LTOLvNL'`` is the only
        value that changes the computation (see ``trad``).
    trad : bool
        If truthy, only the DUPLICATION and NON-LINKS rows are scored and the
        OTHER-LINKS row is left out. Otherwise the OTHER-LINKS counts are
        added to the DUPLICATION row when ``c == 'R_LTOLvNL'`` and to the
        NON-LINKS row for every other configuration before scoring.

    Returns
    -------
    pandas.DataFrame
        One row per project (flat index of project names) with the columns
        ACC, Pre, Rec, F1, D_Pre, D_Rec, D_F1, NL_Pre, NL_Rec, NL_F1, OL_Corr.
        ``Pre``/``Rec`` are the unweighted means of the two per-class values
        and ``F1`` is the harmonic mean of those two means. ``OL_Corr`` is the
        share of the OTHER-LINKS row that falls into column 0.
    """
    # Per-project values, keyed by the column name of the returned frame.
    res_dict = {
        'ACC': [],
        'Pre': [],
        'Rec': [],
        'F1': [],
        'D_Pre': [],
        'D_Rec': [],
        'D_F1': [],
        'NL_Pre': [],
        'NL_Rec': [],
        'NL_F1': [],
        'OL_Corr': [],
    }
    # Share of the OTHER-LINKS row in column 1; printed only, not returned.
    avg_ol_1 = []

    for s in valid_projects:
        filename = 'results/sccnn_latenight_'+s.lower()+'_'+LT+'_'+c+'_confmat.csv'
        confmat_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col='Class')
        # Drop the file's own labels: rows are named by position, columns become 0..n-1.
        confmat_df = pd.DataFrame(confmat_df.values, index=['DUPLICATION', 'OTHER-LINKS', 'NON-LINKS'])

        # Each row divided by its row total (computed once and reused for both shares).
        row_shares = confmat_df.transpose()/confmat_df.sum(axis=1)
        res_dict['OL_Corr'].append(row_shares.loc[0]["OTHER-LINKS"])
        avg_ol_1.append(row_shares.loc[1]["OTHER-LINKS"])

        if not trad:
            # Fold the OTHER-LINKS counts into one of the two remaining classes.
            absorbing = 'DUPLICATION' if c == 'R_LTOLvNL' else 'NON-LINKS'
            two_class = confmat_df.loc[['DUPLICATION', 'NON-LINKS'], [0, 1]].copy()
            two_class.loc[absorbing] = two_class.loc[absorbing] + confmat_df.loc['OTHER-LINKS', [0, 1]]
            confmat_df = two_class

        tp, fp, tn, fn = get_tpfptnfn(confmat_df)

        # Duplicate class (positive).
        d_pre = tp/(tp+fp)
        d_rec = tp/(tp+fn)
        d_f1 = 2*(d_pre*d_rec)/(d_pre+d_rec)

        # Non-link class (negative).
        nl_pre = tn/(tn+fn)
        nl_rec = tn/(tn+fp)
        nl_f1 = 2*(nl_pre*nl_rec)/(nl_pre+nl_rec)

        # Unweighted class means; F1 is derived from those means.
        pre = (d_pre+nl_pre)/2
        rec = (d_rec+nl_rec)/2
        f1 = 2*(pre*rec)/(pre+rec)

        acc = (tp+tn)/(tp+fp+fn+tn)

        res_dict['ACC'].append(acc)
        res_dict['Pre'].append(pre)
        res_dict['Rec'].append(rec)
        res_dict['F1'].append(f1)
        res_dict['D_Pre'].append(d_pre)
        res_dict['D_Rec'].append(d_rec)
        res_dict['D_F1'].append(d_f1)
        res_dict['NL_Pre'].append(nl_pre)
        res_dict['NL_Rec'].append(nl_rec)
        res_dict['NL_F1'].append(nl_f1)

    def report(label, values, stat=np.mean):
        # One summary line: "<label>: <statistic rounded to 2 decimals>".
        print(label+": "+str(round(stat(values),2)))

    separator = "+++++++++++++++++"
    sections = [
        [("ACC", 'ACC')],
        [("PRE", 'Pre'), ("REC", 'Rec'), ("F1", 'F1')],
        [("D PRE", 'D_Pre'), ("D REC", 'D_Rec'), ("D F1", 'D_F1')],
        [("NL PRE", 'NL_Pre'), ("NL REC", 'NL_Rec'), ("NL F1", 'NL_F1')],
    ]
    for section in sections:
        print(separator)
        for label, key in section:
            report(label, res_dict[key])

    print(separator)
    report("OL 0", res_dict['OL_Corr'])
    report("OL 1", avg_ol_1)

    # Spread of the column-0 share across projects.
    report("OL STD", res_dict['OL_Corr'], stat=np.std)

    # Flat project index: wrapping the list in another list would make pandas
    # build a one-level MultiIndex instead.
    res_data = pd.DataFrame(res_dict, index=list(valid_projects))

    return res_data

In [10]:
# R_LTvNL with trad=True: only the DUPLICATION and NON-LINKS rows are scored.
get_results('R_LTvNL', True)

+++++++++++++++++
ACC: 0.73
+++++++++++++++++
PRE: 0.74
REC: 0.73
F1: 0.73
+++++++++++++++++
D PRE: 0.68
D REC: 0.84
D F1: 0.75
+++++++++++++++++
NL PRE: 0.8
NL REC: 0.61
NL F1: 0.69
+++++++++++++++++
OL 0: 0.35
OL 1: 0.65
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.725455,0.738796,0.725455,0.732064,0.682353,0.843636,0.754472,0.795238,0.607273,0.68866,0.345455


In [11]:
# R_LTvNL with trad=False: OTHER-LINKS counts are added to the NON-LINKS row before scoring.
get_results('R_LTvNL', False)

+++++++++++++++++
ACC: 0.6
+++++++++++++++++
PRE: 0.65
REC: 0.66
F1: 0.66
+++++++++++++++++
D PRE: 0.45
D REC: 0.84
D F1: 0.58
+++++++++++++++++
NL PRE: 0.86
NL REC: 0.48
NL F1: 0.61
+++++++++++++++++
OL 0: 0.35
OL 1: 0.65
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.598788,0.652585,0.66,0.656272,0.446154,0.843636,0.583648,0.859016,0.476364,0.612865,0.345455


In [12]:
# R_LTvNLOL with trad=True: only the DUPLICATION and NON-LINKS rows are scored.
get_results('R_LTvNLOL', True)

+++++++++++++++++
ACC: 0.69
+++++++++++++++++
PRE: 0.69
REC: 0.69
F1: 0.69
+++++++++++++++++
D PRE: 0.69
D REC: 0.7
D F1: 0.69
+++++++++++++++++
NL PRE: 0.69
NL REC: 0.69
NL F1: 0.69
+++++++++++++++++
OL 0: 0.71
OL 1: 0.29
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.692727,0.69275,0.692727,0.692739,0.690647,0.698182,0.694394,0.694853,0.687273,0.691042,0.709091


In [13]:
# R_LTvNLOL with trad=False: OTHER-LINKS counts are added to the NON-LINKS row before scoring.
get_results('R_LTvNLOL', False)

+++++++++++++++++
ACC: 0.7
+++++++++++++++++
PRE: 0.68
REC: 0.7
F1: 0.69
+++++++++++++++++
D PRE: 0.54
D REC: 0.7
D F1: 0.61
+++++++++++++++++
NL PRE: 0.82
NL REC: 0.7
NL F1: 0.76
+++++++++++++++++
OL 0: 0.71
OL 1: 0.29
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.698182,0.679291,0.698182,0.688607,0.536313,0.698182,0.606635,0.82227,0.698182,0.755162,0.709091


In [14]:
# R_LTOLvNL with trad=True: only the DUPLICATION and NON-LINKS rows are scored.
get_results('R_LTOLvNL', True)

+++++++++++++++++
ACC: 0.7
+++++++++++++++++
PRE: 0.74
REC: 0.7
F1: 0.72
+++++++++++++++++
D PRE: 0.84
D REC: 0.5
D F1: 0.62
+++++++++++++++++
NL PRE: 0.64
NL REC: 0.9
NL F1: 0.75
+++++++++++++++++
OL 0: 0.49
OL 1: 0.51
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.7,0.738926,0.7,0.718937,0.835366,0.498182,0.624146,0.642487,0.901818,0.750378,0.490909


In [15]:
# R_LTOLvNL with trad=False: OTHER-LINKS counts are added to the DUPLICATION row before scoring.
get_results('R_LTOLvNL', False)

+++++++++++++++++
ACC: 0.64
+++++++++++++++++
PRE: 0.69
REC: 0.7
F1: 0.7
+++++++++++++++++
D PRE: 0.91
D REC: 0.5
D F1: 0.65
+++++++++++++++++
NL PRE: 0.48
NL REC: 0.9
NL F1: 0.62
+++++++++++++++++
OL 0: 0.49
OL 1: 0.51
OL STD: 0.0


Unnamed: 0,ACC,Pre,Rec,F1,D_Pre,D_Rec,D_F1,NL_Pre,NL_Rec,NL_F1,OL_Corr
MariaDB,0.636364,0.693596,0.702727,0.698132,0.911184,0.503636,0.648712,0.476008,0.901818,0.623116,0.490909
