In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 'JiraEcosystem', 'MariaDB', 'Mindville', 'MongoDB', 'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [4]:
def add_linked_issues_to_df(df):
    df['issues']=''
    for i in tqdm(range(len(df))):
        df["issues"].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [5]:
def load_data(source):
    #Loading Issues
    filename = '../data/crawl/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';', index_col=['issue_id'])

    #Loading Links
    filename = '../data/crawl/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    return issue_df, link_df

In [6]:
def clean_issues(issue_df):
    #Remove issues with empty titles
    issue_df = issue_df[~issue_df['title'].fillna(' ').str.isspace()]
    print(f'After filtering out issues with empty titles, {len(issue_df)} issues remain')
    
    return issue_df

In [7]:
def clean_links(link_df):
    add_linked_issues_to_df(link_df)
    
    # remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    x = len(link_df)
    
    # cleanup links
    # only allow one linktype per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
#     print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues']==i]['linktype']))>1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')
    
    print(round((x-len(link_df))/x)*100,2)

    del_mult_lt = (x-len(link_df))/x
    
    #Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df, del_mult_lt

In [8]:
sum_dml = 0
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
    
    issue_df = clean_issues(issue_df)
    
    link_df, del_mult_lt = clean_links(link_df)
    
    sum_dml+=del_mult_lt
    
    print(f'Cleaned {len(issue_df)} issues and {len(link_df)} links')
    
    link_df.to_csv('../data/crawl/clean_links_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    print("----------------------------")
print(sum_dml/15)

APACHE
Loaded 1014926 issues and 264107 links
After filtering out issues with empty titles, 1014925 issues remain


  0%|                                                                           | 294/264107 [00:00<01:08, 3833.93it/s]


KeyboardInterrupt: 

## Example of what exactly is cleaned

In [None]:
source = 'sakai'
issue_df, link_df = load_data(source)

print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
add_linked_issues_to_df(link_df)

# remove links with uncrawled and filtered issues
cl_link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
print(f'Left with {len(cl_link_df)} links after removing half-private links')
x = len(cl_link_df)

In [None]:
cl_link_df.name.value_counts()

In [None]:
cl_link_df[cl_link_df['name']=='SAK-34955_SAK-35012']

In [None]:
cl_link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple links between them if they were in the correct order')

In [None]:
# in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
doublelinks = (cl_link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])
print(len(valid_doubles))

In [None]:
valid_doubles

In [None]:
cl_link_df[(cl_link_df['issue_id_1'] == 'EVALSYS-741') | (cl_link_df['issue_id_2'] == 'EVALSYS-741')]

In [None]:
for i in valid_doubles:
    if len(set(cl_link_df[cl_link_df['issues']==i]['linktype']))>1:
        cl_link_df = cl_link_df[cl_link_df.issues != i]
    else:
        print(i)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

In [None]:
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

In [None]:
x-len(cl_link_df)

In [None]:
(x-len(cl_link_df))/x

In [None]:
cl_link_df.issues.value_counts()

In [None]:
cl_link_df[(cl_link_df['issue_id_1'] == 'SAK-35060') | (cl_link_df['issue_id_2'] == 'SAK-35060')]

In [None]:
#Multiple links complete remove
cl_link_df.drop_duplicates(subset=['issues'], inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with the same link type with multiple entries')

cl_link_df.reset_index(inplace=True, drop=True)