In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import sys
sys.path.append('../../')
from utils.dataloader import get_issues, get_commits, get_pull_requests

In [None]:
# Load the three datasets used by every later cell through the project's dataloader helpers.
# Later cells treat all three as pandas DataFrames (iterrows, boolean indexing, column access).
# NOTE(review): the effect of filter="metadata" is defined in utils.dataloader, not here — confirm what it selects.
issues = get_issues(filter="metadata")
commits = get_commits()
pull_requests = get_pull_requests()

In [21]:
# Q1. The distribution of the status of the issues

# Fold Bitbucket-specific states into the open/closed vocabulary.
# Any state without an entry here is copied through unchanged.
# NOTE(review): Bitbucket also has states such as 'on hold', 'invalid' and 'wontfix';
# if they occur in the data they stay outside both 'open' and 'closed' — confirm.
BITBUCKET_STATE_ALIASES = {
    'new': 'open',
    'duplicate': 'closed',
    'resolved': 'closed',
}

issues['prop:state'] = issues['state'].replace(BITBUCKET_STATE_ALIASES)

In [22]:
# Q2. The distribution of the issues by type (bug, enhancement, etc.)
# The analysis for this question lives in analysis/issues/catiss_classification.

In [23]:
# Q3. The distribution of the time taken to resolve issues

# Cut-off used as the "end" timestamp for issues that are still open, so that their
# age can be reported next to the resolution time of closed issues.
SNAPSHOT_DT = pd.to_datetime('2025-06-02', utc=True)
SECONDS_PER_DAY = 86400


def _is_missing(value):
    """Return True when a timestamp cell holds no usable value.

    Recognises None, the strings '' and 'None', and pandas/NumPy missing
    markers (NaN, NaT, pd.NA). A plain `value in [None, '', 'None']` test
    lets NaN/NaT through, which then parse to NaT instead of being treated
    as "no timestamp".
    """
    if value is None:
        return True
    if isinstance(value, str):
        return value in ('', 'None')
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _resolution_days(issue):
    """Return the number of days from an issue's creation to its end timestamp.

    The end timestamp is:
      * GitHub: 'closed_at' when present, otherwise SNAPSHOT_DT.
      * other hosts (Bitbucket): 'updated_on' unless the issue's 'prop:state'
        is 'open', in which case SNAPSHOT_DT.

    Returns NaN when 'created_at' is missing.
    """
    created_at = issue['created_at']
    if _is_missing(created_at):
        return float('nan')
    created_dt = pd.to_datetime(created_at, utc=True)

    if issue['host'] == "Github":
        closed_at = issue['closed_at']
        if _is_missing(closed_at):
            end_dt = SNAPSHOT_DT
        else:
            end_dt = pd.to_datetime(closed_at, utc=True)
    else:
        # Bitbucket: the last update is used as a stand-in for the close time.
        # The open/closed decision is taken from 'prop:state' rather than 'closed_at'
        # so that it does not depend on how a missing 'closed_at' happens to be encoded.
        # NOTE(review): assumes Bitbucket rows carry no reliable 'closed_at' — confirm
        # against utils.dataloader.
        if issue['prop:state'] == 'open':
            end_dt = SNAPSHOT_DT
        else:
            end_dt = pd.to_datetime(issue['updated_on'], utc=True)

    # time in days
    return (end_dt - created_dt).total_seconds() / SECONDS_PER_DAY


resolution_times = [_resolution_days(issue) for _, issue in issues.iterrows()]

issues['prop:resolution'] = resolution_times

# Open issues report their age at the snapshot; closed issues report time to resolution.
open_issues = issues[issues['prop:state'] == 'open']
closed_issues = issues[issues['prop:state'] == 'closed']

print(f"20th Percentile (Open Issues): {open_issues['prop:resolution'].quantile(0.2)} days")
print(f"80th Percentile (Open Issues): {open_issues['prop:resolution'].quantile(0.8)} days")
print(f"20th Percentile (Closed Issues): {closed_issues['prop:resolution'].quantile(0.2)} days")
print(f"80th Percentile (Closed Issues): {closed_issues['prop:resolution'].quantile(0.8)} days")

20th Percentile (Open Issues): 361.6475810185185 days
80th Percentile (Open Issues): 2081.7269675925927 days
20th Percentile (Closed Issues): 0.07823148148148158 days
80th Percentile (Closed Issues): 161.9828217592594 days


In [24]:
# Q4. The distribution of the number of comments for each issue
def _count_comments(row):
    """Return the number of comments on one issue row.

    GitHub rows count the 'commented' events in their timeline; every other
    host (Bitbucket) uses the length of the row's 'comments' list.
    """
    if row['host'] != "Github":
        return len(row['comments'])
    return sum(1 for entry in row['timeline'] if entry['event'] == 'commented')


comment_counts = [_count_comments(row) for _, row in issues.iterrows()]

issues['prop:comments'] = comment_counts

print(f"20% Percentile: {issues['prop:comments'].quantile(0.2)}")
print(f"80% Percentile: {issues['prop:comments'].quantile(0.8)}")

20% Percentile: 1.0
80% Percentile: 5.0


In [25]:
# Q5. The distribution of the number of unique users participating in each issue
def _participants(row):
    """Return the set of distinct participant identifiers for one issue row.

    The row's own 'user' value is always included. GitHub rows add the login of
    every 'commented' timeline event whose actor is not a '[bot]' account;
    other hosts (Bitbucket) add the uuid of every comment's user.
    """
    people = {row['user']}

    if row['host'] == "Github":
        for entry in row['timeline']:
            if entry['event'] != 'commented':
                continue
            if 'actor' not in entry or 'login' not in entry['actor']:
                continue
            login = entry['actor']['login']
            # bot accounts are not counted as participants
            if not login.endswith('[bot]'):
                people.add(login)
    else:
        # NOTE(review): commenters are keyed by uuid while the author comes from 'user';
        # if 'user' is not a uuid, an author who comments is counted twice — confirm.
        people.update(comment['user']['uuid'] for comment in row['comments'])

    return people


user_participation_counts = [len(_participants(row)) for _, row in issues.iterrows()]

issues['prop:users'] = user_participation_counts

print(f"20th Percentile: {issues['prop:users'].quantile(0.2)}")
print(f"80th Percentile: {issues['prop:users'].quantile(0.8)}")

20th Percentile: 1.0
80th Percentile: 3.0


In [27]:
# Q9. The distribution of the number of files changed per issue based on linked pull requests and commits
# Q10. The distribution of the number of lines changed per issue based on linked pull requests and commits


def _first_value_by_key(frame, key_column, value_column):
    """Build a dict mapping each key to the value of the FIRST row that carries it.

    Replaces repeated `frame[frame[key_column] == key][value_column].iloc[0]`
    scans with a single pass over the frame. Rows whose key is missing are
    skipped, since an equality filter never matches them either.
    """
    lookup = {}
    for key, value in zip(frame[key_column], frame[value_column]):
        if pd.isna(key):
            continue
        lookup.setdefault(key, value)
    return lookup


# Built once; looking these up per timeline event is what keeps the loop below fast.
pr_commits_by_url = _first_value_by_key(pull_requests, 'html_url', 'commits')
commit_files_by_sha = _first_value_by_key(commits, 'sha', 'files')


def _linked_commit_shas(timeline):
    """Yield the SHA of every commit linked from a GitHub issue timeline.

    Sources are the commits of cross-referenced pull requests found in
    `pull_requests`, and the commit of each 'referenced' event. Duplicates
    are yielded as they occur; the caller deduplicates.
    """
    for event in timeline:
        kind = event['event']
        if kind == 'cross-referenced' and 'pull' in event['source']['issue']['html_url']:
            pr_url = event['source']['issue']['html_url']
            for commit in pr_commits_by_url.get(pr_url, ()):
                yield commit['sha']
        elif kind == 'referenced':
            yield event['commit_id']


def _issue_change_stats(timeline):
    """Return (lines_changed, files_changed) for one GitHub issue timeline.

    Each linked commit is counted once. Lines are the sum of every file's
    'changes' across those commits; files are the number of distinct
    'filename' values. Commits absent from `commits` contribute nothing.
    """
    lines_changed = 0
    touched_files = set()
    for sha in set(_linked_commit_shas(timeline)):
        for changed_file in commit_files_by_sha.get(sha, ()):
            touched_files.add(changed_file['filename'])
            lines_changed += changed_file['changes']
    return lines_changed, len(touched_files)


num_lines_changed = []
num_files_changed = []

for index, issue in tqdm(issues.iterrows(), total=issues.shape[0]):
    if issue['host'] == "Github":
        lines_changed, files_changed = _issue_change_stats(issue['timeline'])
    else:
        # Only GitHub timelines are inspected; every other host is recorded as zero.
        lines_changed, files_changed = 0, 0

    num_lines_changed.append(lines_changed)
    num_files_changed.append(files_changed)

  0%|          | 0/19225 [00:00<?, ?it/s]

100%|██████████| 19225/19225 [23:40<00:00, 13.53it/s]  


In [28]:
issues['prop:files'] = num_files_changed

# Percentiles are taken over issues with at least one changed file;
# issues with a zero count are left out.
has_file_changes = issues['prop:files'] > 0
nonzero_files_changed = issues.loc[has_file_changes, 'prop:files']

# 20th and 80th percentile of the non-zero file counts
percentile_20 = nonzero_files_changed.quantile(0.2)
percentile_80 = nonzero_files_changed.quantile(0.8)

print(f"20% Files Changed: {percentile_20}")
print(f"80% Files Changed: {percentile_80}")

20% Files Changed: 2.0
80% Files Changed: 36.0


In [29]:
issues['prop:loc'] = num_lines_changed

# Percentiles are taken over issues with at least one changed line;
# issues with a zero count are left out.
has_line_changes = issues['prop:loc'] > 0
nonzero_lines_changed = issues.loc[has_line_changes, 'prop:loc']

# 20th and 80th percentile of the non-zero line counts
percentile_20 = nonzero_lines_changed.quantile(0.2)
percentile_80 = nonzero_lines_changed.quantile(0.8)

print(f"20% Lines Changed (nonzero): {percentile_20}")
print(f"80% Lines Changed (nonzero): {percentile_80}")

20% Lines Changed (nonzero): 23.0
80% Lines Changed (nonzero): 1329.2000000000025


In [None]:
# Keep 'id', 'repo', 'host' and the derived 'prop:*' columns, then write them to CSV without the index.
# Note that `issues` is narrowed to these columns from here on.
EXPORT_COLUMNS = [
    'id',
    'repo',
    'host',
    'prop:state',
    'prop:resolution',
    'prop:comments',
    'prop:users',
    'prop:files',
    'prop:loc',
]
issues = issues[EXPORT_COLUMNS]
issues.to_csv('../../../data/issues_properties.csv', index=False)