In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 'JiraEcosystem', 'MariaDB', 'Mindville', 'MongoDB', 'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [4]:
def add_linked_issues_to_df(df):
    df['issues']=''
    for i in tqdm(range(len(df))):
        df["issues"].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [5]:
def load_data(source):
    #Loading Issues
    filename = '../../data/crawl/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';', index_col=['issue_id'])

    #Loading Links
    filename = '../../data/crawl/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    return issue_df, link_df

In [6]:
def clean_issues(issue_df):
    #Remove issues with empty titles
    issue_df = issue_df[~issue_df['title'].fillna(' ').str.isspace()]
    print(f'After filtering out issues with empty titles, {len(issue_df)} issues remain')
    
    return issue_df

In [7]:
def clean_links(link_df):
    add_linked_issues_to_df(link_df)
    
    # remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    x = len(link_df)
    
    # cleanup links
    # only allow one linktype per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
#     print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues']==i]['linktype']))>1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')
    
    print(round((x-len(link_df))/x)*100,2)

    del_mult_lt = (x-len(link_df))/x
    
    #Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df, del_mult_lt

In [8]:
sum_dml = 0
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
    
    issue_df = clean_issues(issue_df)
    
    link_df, del_mult_lt = clean_links(link_df)
    
    sum_dml+=del_mult_lt
    
    print(f'Cleaned {len(issue_df)} issues and {len(link_df)} links')
    
    link_df.to_csv('../../data/crawl/clean_links_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    print("----------------------------")
print(sum_dml/15)

APACHE
Loaded 1014926 issues and 264107 links
After filtering out issues with empty titles, 1014925 issues remain


100%|████████████████████████████████████████████████████████████████████████| 264107/264107 [00:55<00:00, 4758.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 263647 links after removing half-private links


100%|██████████████████████████████████████████████████████████████████████████████| 2218/2218 [01:14<00:00, 29.83it/s]


Left with 256253 links after removing issue-pairs with multiple link types between them
0 2
Left with 255767 links after removing issue-pairs with multiple entries
Cleaned 1014925 issues and 255767 links
----------------------------
HYPERLEDGER
Loaded 28146 issues and 16846 links
After filtering out issues with empty titles, 28146 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 16846/16846 [00:03<00:00, 4721.32it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 16733 links after removing half-private links


100%|███████████████████████████████████████████████████████████████████████████████| 141/141 [00:00<00:00, 520.30it/s]


Left with 16325 links after removing issue-pairs with multiple link types between them
0 2
Left with 16304 links after removing issue-pairs with multiple entries
Cleaned 28146 issues and 16304 links
----------------------------
INTELDAOS
Loaded 9474 issues and 2667 links
After filtering out issues with empty titles, 9474 issues remain


100%|████████████████████████████████████████████████████████████████████████████| 2667/2667 [00:00<00:00, 4535.71it/s]


Left with 2667 links after removing half-private links


100%|████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 1846.05it/s]


Left with 2605 links after removing issue-pairs with multiple link types between them
0 2
Left with 2599 links after removing issue-pairs with multiple entries
Cleaned 9474 issues and 2599 links
----------------------------
JFROG
Loaded 15535 issues and 3303 links
After filtering out issues with empty titles, 15535 issues remain


100%|████████████████████████████████████████████████████████████████████████████| 3303/3303 [00:00<00:00, 4773.12it/s]


Left with 3303 links after removing half-private links


100%|████████████████████████████████████████████████████████████████████████████████| 24/24 [00:00<00:00, 1599.89it/s]


Left with 3233 links after removing issue-pairs with multiple link types between them
0 2
Left with 3229 links after removing issue-pairs with multiple entries
Cleaned 15535 issues and 3229 links
----------------------------
JIRA
Loaded 274545 issues and 110507 links
After filtering out issues with empty titles, 274543 issues remain


100%|████████████████████████████████████████████████████████████████████████| 110507/110507 [00:23<00:00, 4768.66it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 102573 links after removing half-private links


100%|████████████████████████████████████████████████████████████████████████████████| 819/819 [00:10<00:00, 78.39it/s]


Left with 100096 links after removing issue-pairs with multiple link types between them
0 2
Left with 99819 links after removing issue-pairs with multiple entries
Cleaned 274543 issues and 99819 links
----------------------------
JIRAECOSYSTEM
Loaded 41866 issues and 12439 links
After filtering out issues with empty titles, 41865 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 12439/12439 [00:02<00:00, 4779.45it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 11598 links after removing half-private links


100%|█████████████████████████████████████████████████████████████████████████████████| 66/66 [00:00<00:00, 768.03it/s]


Left with 11414 links after removing issue-pairs with multiple link types between them
0 2
Left with 11398 links after removing issue-pairs with multiple entries
Cleaned 41865 issues and 11398 links
----------------------------
MARIADB
Loaded 31229 issues and 14950 links
After filtering out issues with empty titles, 31229 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 14950/14950 [00:03<00:00, 4683.59it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 14929 links after removing half-private links


100%|█████████████████████████████████████████████████████████████████████████████████| 98/98 [00:00<00:00, 700.01it/s]


Left with 14659 links after removing issue-pairs with multiple link types between them
0 2
Left with 14618 links after removing issue-pairs with multiple entries
Cleaned 31229 issues and 14618 links
----------------------------
MINDVILLE
Loaded 2134 issues and 46 links
After filtering out issues with empty titles, 2134 issues remain


100%|████████████████████████████████████████████████████████████████████████████████| 46/46 [00:00<00:00, 4600.00it/s]


Left with 46 links after removing half-private links


0it [00:00, ?it/s]


Left with 44 links after removing issue-pairs with multiple link types between them
0 2
Left with 44 links after removing issue-pairs with multiple entries
Cleaned 2134 issues and 44 links
----------------------------
MONGODB
Loaded 137172 issues and 92362 links
After filtering out issues with empty titles, 137171 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 92362/92362 [00:19<00:00, 4756.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 65240 links after removing half-private links


100%|███████████████████████████████████████████████████████████████████████████████| 389/389 [00:03<00:00, 101.91it/s]


Left with 63883 links after removing issue-pairs with multiple link types between them
0 2
Left with 63821 links after removing issue-pairs with multiple entries
Cleaned 137171 issues and 63821 links
----------------------------
QT
Loaded 148579 issues and 41426 links
After filtering out issues with empty titles, 148579 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 41426/41426 [00:08<00:00, 4788.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 40646 links after removing half-private links


100%|███████████████████████████████████████████████████████████████████████████████| 171/171 [00:01<00:00, 169.81it/s]


Left with 40128 links after removing issue-pairs with multiple link types between them
0 2
Left with 40105 links after removing issue-pairs with multiple entries
Cleaned 148579 issues and 40105 links
----------------------------
REDHAT
Loaded 353000 issues and 127369 links
After filtering out issues with empty titles, 352999 issues remain


100%|████████████████████████████████████████████████████████████████████████| 127369/127369 [00:26<00:00, 4788.94it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 123000 links after removing half-private links


100%|██████████████████████████████████████████████████████████████████████████████| 1205/1205 [00:17<00:00, 69.32it/s]


Left with 120136 links after removing issue-pairs with multiple link types between them
0 2
Left with 119669 links after removing issue-pairs with multiple entries
Cleaned 352999 issues and 119669 links
----------------------------
SAKAI
Loaded 50550 issues and 20292 links
After filtering out issues with empty titles, 50550 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 20292/20292 [00:04<00:00, 4740.01it/s]


Left with 20292 links after removing half-private links


100%|███████████████████████████████████████████████████████████████████████████████| 143/143 [00:00<00:00, 476.67it/s]


Left with 19852 links after removing issue-pairs with multiple link types between them
0 2
Left with 19803 links after removing issue-pairs with multiple entries
Cleaned 50550 issues and 19803 links
----------------------------
SECONDLIFE
Loaded 1867 issues and 674 links
After filtering out issues with empty titles, 1867 issues remain


100%|██████████████████████████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 4616.46it/s]


Left with 674 links after removing half-private links


100%|████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 2428.91it/s]


Left with 634 links after removing issue-pairs with multiple link types between them
0 2
Left with 631 links after removing issue-pairs with multiple entries
Cleaned 1867 issues and 631 links
----------------------------
SONATYPE
Loaded 87284 issues and 4975 links
After filtering out issues with empty titles, 87282 issues remain


100%|████████████████████████████████████████████████████████████████████████████| 4975/4975 [00:01<00:00, 4530.95it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 4534 links after removing half-private links


100%|████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 1214.37it/s]


Left with 4466 links after removing issue-pairs with multiple link types between them
0 2
Left with 4465 links after removing issue-pairs with multiple entries
Cleaned 87282 issues and 4465 links
----------------------------
SPRING
Loaded 69156 issues and 14716 links
After filtering out issues with empty titles, 69156 issues remain


100%|██████████████████████████████████████████████████████████████████████████| 14716/14716 [00:03<00:00, 4704.60it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


Left with 14616 links after removing half-private links


100%|█████████████████████████████████████████████████████████████████████████████████| 52/52 [00:00<00:00, 559.14it/s]


Left with 14478 links after removing issue-pairs with multiple link types between them
0 2
Left with 14462 links after removing issue-pairs with multiple entries
Cleaned 69156 issues and 14462 links
----------------------------
0.024049617863072324


## Example of what exactly is cleaned

In [9]:
source = 'sakai'
issue_df, link_df = load_data(source)

print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
add_linked_issues_to_df(link_df)

# remove links with uncrawled and filtered issues
cl_link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
print(f'Left with {len(cl_link_df)} links after removing half-private links')
x = len(cl_link_df)

Loaded 50550 issues and 20292 links


100%|██████████████████████████████████████████████████████████████████████████| 20292/20292 [00:04<00:00, 4689.59it/s]

Left with 20292 links after removing half-private links





In [10]:
cl_link_df.name.value_counts()

SAK-28699_SAK-17240    2
SAK-34955_SAK-35012    2
SAK-42179_SAK-43226    2
SAK-32651_SAK-32652    2
SAK-32651_SAK-32377    2
                      ..
SAK-1547_SAK-35827     1
SAK-35827_SAK-1872     1
SAK-35577_SAK-35832    1
SAK-35835_SAK-29249    1
BBB-21_BBB-24          1
Name: name, Length: 20166, dtype: int64

In [11]:
cl_link_df[cl_link_df['name']=='SAK-34955_SAK-35012']

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
11060,SAK-34955_SAK-35012,1 - Relate,SAK-34955,SAK-35012,"['SAK-34955', 'SAK-35012']"
11126,SAK-34955_SAK-35012,Subtask,SAK-34955,SAK-35012,"['SAK-34955', 'SAK-35012']"


In [12]:
cl_link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple links between them if they were in the correct order')

Left with 20040 links after removing issue-pairs with multiple links between them if they were in the correct order


In [13]:
# in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
doublelinks = (cl_link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])
print(len(valid_doubles))

143


In [14]:
valid_doubles

{"['EVALSYS-1233', 'EVALSYS-1242']",
 "['EVALSYS-741', 'EVALSYS-769']",
 "['GRBK-11', 'GRBK-225']",
 "['GRBK-1236', 'GRBK-859']",
 "['GRBK-1276', 'GRBK-1277']",
 "['GRBK-594', 'GRBK-668']",
 "['GRBK-752', 'GRBK-804']",
 "['GRBK-800', 'GRBK-803']",
 "['GRBK-868', 'GRBK-874']",
 "['PROD-232', 'PROD-271']",
 "['QNA-67', 'QNA-90']",
 "['QUALTRICS-40', 'QUALTRICS-91']",
 "['SAK-10568', 'SAK-10579']",
 "['SAK-10826', 'SAK-8706']",
 "['SAK-11008', 'SAK-9924']",
 "['SAK-1121', 'SAK-1159']",
 "['SAK-11798', 'SAK-13312']",
 "['SAK-12357', 'SAK-13245']",
 "['SAK-13522', 'SAK-28054']",
 "['SAK-1357', 'SAK-2015']",
 "['SAK-13679', 'SAK-15701']",
 "['SAK-14046', 'SAK-14175']",
 "['SAK-14120', 'SAK-38548']",
 "['SAK-1426', 'SAK-754']",
 "['SAK-1426', 'SAK-807']",
 "['SAK-14386', 'SAK-16166']",
 "['SAK-14474', 'SAK-35537']",
 "['SAK-15280', 'SAK-18008']",
 "['SAK-15813', 'SAK-16433']",
 "['SAK-16091', 'SAK-22700']",
 "['SAK-16279', 'SAK-17059']",
 "['SAK-16421', 'SAK-16907']",
 "['SAK-16422', 'SAK-166

In [15]:
cl_link_df[(cl_link_df['issue_id_1'] == 'EVALSYS-741') | (cl_link_df['issue_id_2'] == 'EVALSYS-741')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
36429,EVALSYS-741_EVALSYS-769,3 - Duplicate,EVALSYS-741,EVALSYS-769,"['EVALSYS-741', 'EVALSYS-769']"
36430,EVALSYS-769_EVALSYS-741,4 - Incorporate,EVALSYS-769,EVALSYS-741,"['EVALSYS-741', 'EVALSYS-769']"


In [16]:
for i in valid_doubles:
    if len(set(cl_link_df[cl_link_df['issues']==i]['linktype']))>1:
        cl_link_df = cl_link_df[cl_link_df.issues != i]
    else:
        print(i)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

['SAK-8034', 'SAK-8162']
['SAK-36885', 'SAK-37167']
['SAK-16279', 'SAK-17059']
['GRBK-752', 'GRBK-804']
['SAK-5225', 'SAK-5226']
['SAK-28659', 'SAK-39744']
['SAK-1894', 'SAK-4528']
['SAK-29400', 'SAK-34002']
['GRBK-800', 'GRBK-803']
['SAK-1426', 'SAK-754']
['SAK-44081', 'SAK-44082']
['SAK-16091', 'SAK-22700']
['SAK-14474', 'SAK-35537']
['SAK-26306', 'SAK-26352']
['SAK-8392', 'SAK-8920']
['SAK-19340', 'SAK-19465']
['SAK-40655', 'SAK-40702']
['SAK-7272', 'SAK-7590']
['SAK-1357', 'SAK-2015']
['SAK-40979', 'SAK-43362']
['GRBK-868', 'GRBK-874']
['SAK-33995', 'SAK-42356']
['SAK-5904', 'SAK-8997']
['SAK-1426', 'SAK-807']
['SAK-46021', 'SAK-46185']
['SAK-15813', 'SAK-16433']
['SAK-35541', 'SAK-910']
['SAK-7271', 'SAK-7616']
['GRBK-1276', 'GRBK-1277']
['SAK-5296', 'SAK-7311']
['SAK-11008', 'SAK-9924']
['SAK-27942', 'SAK-29007']
['SAK-16421', 'SAK-16907']
['SAK-33431', 'SAK-33554']
['SAK-34169', 'SAK-36547']
['SAK-36638', 'SAK-36844']
['EVALSYS-1233', 'EVALSYS-1242']
['GRBK-1236', 'GRBK-859']
['

In [17]:
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

Left with 19852 links after removing issue-pairs with multiple link types between them


In [18]:
x-len(cl_link_df)

440

In [19]:
(x-len(cl_link_df))/x

0.02168342203824167

In [20]:
cl_link_df.issues.value_counts()

['GRBK-752', 'GRBK-804']      2
['SAK-19340', 'SAK-19465']    2
['SAK-7271', 'SAK-7616']      2
['SAK-33995', 'SAK-42356']    2
['GRBK-11', 'GRBK-225']       2
                             ..
['SAK-35575', 'SAK-35882']    1
['SAK-29249', 'SAK-35882']    1
['SAK-35205', 'SAK-35884']    1
['SAK-20939', 'SAK-35884']    1
['BBB-21', 'BBB-24']          1
Name: issues, Length: 19803, dtype: int64

In [21]:
cl_link_df[(cl_link_df['issue_id_1'] == 'SAK-35060') | (cl_link_df['issue_id_2'] == 'SAK-35060')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
9359,SAK-35060_SAK-37046,1 - Relate,SAK-35060,SAK-37046,"['SAK-35060', 'SAK-37046']"
10600,SAK-35665_SAK-35060,1 - Relate,SAK-35665,SAK-35060,"['SAK-35060', 'SAK-35665']"
10855,SAK-35205_SAK-35060,1 - Relate,SAK-35205,SAK-35060,"['SAK-35060', 'SAK-35205']"
10856,SAK-35060_SAK-35205,1 - Relate,SAK-35060,SAK-35205,"['SAK-35060', 'SAK-35205']"
10922,SAK-35136_SAK-35060,1 - Relate,SAK-35136,SAK-35060,"['SAK-35060', 'SAK-35136']"
11006,SAK-35060_SAK-34877,1 - Relate,SAK-35060,SAK-34877,"['SAK-34877', 'SAK-35060']"
11012,SAK-35052_SAK-35060,2 - Cloned,SAK-35052,SAK-35060,"['SAK-35052', 'SAK-35060']"
11013,SAK-35060_SAK-34968,5 - Depend,SAK-35060,SAK-34968,"['SAK-34968', 'SAK-35060']"


In [22]:
#Multiple links complete remove
cl_link_df.drop_duplicates(subset=['issues'], inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with the same link type with multiple entries')

cl_link_df.reset_index(inplace=True, drop=True)

Left with 19803 links after removing issue-pairs with the same link type with multiple entries
