In [None]:
# 1. Star
# 2. Fork
# 3. Watching
# 4. Commits
# 5. Issue
# 6. PL 
# 7. Pull Request
# 8. Actions 
# 9. Docs 
# 10. Contributor
# 11. Code

In [8]:
import datetime
import pandas as pd
import os
import json
from collections import defaultdict

In [None]:
# the metrics calculated will be saved in '../metrics.csv'

In [9]:
# Root directory that holds one "<owner>_<repo>" folder of raw JSON data per
# repository. Set the REPOS_DATA_PATH environment variable to point somewhere
# else; the original absolute path is kept as the default.
repos_data_path = os.environ.get("REPOS_DATA_PATH", "/Users/zhangyujin/Desktop/repos-data")
# CSV that is loaded here as the metrics table and later written back to.
metrics_path = "../metrics.csv"
metrics = pd.read_csv(metrics_path)

In [None]:
# 01 repo_size

In [55]:
# Copy the star, fork and open-issue counts from each repository's repo.json
# into the metrics table. Rows whose data folder or repo.json is missing keep None.
for count_column in ('stars', 'forks', 'open_issues'):
    metrics[count_column] = None

for index, row in metrics.iterrows():
    owner, repo = row['owner'], row['repo']
    repo_path = f"{repos_data_path}/{owner}_{repo}"
    repo_info_path = f"{repos_data_path}/{owner}_{repo}/repo.json"

    if not (os.path.exists(repo_path) and os.path.exists(repo_info_path)):
        continue

    with open(repo_info_path, 'r') as f:
        repo_info = json.load(f)

    # (metrics column, key in repo.json); a missing key counts as 0.
    for count_column, json_key in (
        ('stars', 'stargazers_count'),
        ('forks', 'forks_count'),
        ('open_issues', 'open_issues'),
    ):
        count = repo_info.get(json_key, 0)
        metrics.at[index, count_column] = f"{count:d}"

In [56]:
# Write the metrics table to metrics_path; index=False leaves the DataFrame index out of the CSV.
metrics.to_csv(metrics_path, index=False)

In [None]:
# 02 longitudinal (Recent 500 issues/pulls/commits)

In [51]:
# Create (or reset) every issue metric column with None as the placeholder value.
for issue_column in (
    'issue_closed_rate',
    'issue_time_to_close',
    'issue_time_to_response',
    'issue_monthly',
    'issue_participants',
    'issue_per_participants',
):
    metrics[issue_column] = None

In [52]:
# Issue activity metrics per repository, computed from at most the first 500
# entries of issues.json plus the per-issue files under comments/.
#
# All accumulators are reset at the top of every iteration, so a repository
# without issues.json is reported like one with zero issues ("N/A" for the
# ratios, "0" participants) instead of reusing the previous repository's
# numbers or failing with a NameError on the first row.
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

for index, row in metrics.iterrows():

    owner = row['owner']
    repo = row['repo']

    issues_path = f"{repos_data_path}/{owner}_{repo}/issues.json"

    closed_issues_count = 0
    commented_issues_count = 0
    total_time_to_comment = 0
    total_time_to_close = 0
    monthly_issues_count = defaultdict(int)
    participants = set()

    issues = []
    if os.path.exists(issues_path):
        with open(issues_path, 'r') as f:
            # A file holding JSON null is treated as "no issues".
            issues = json.load(f) or []

    sampled_issues = issues[:500]
    issues_count = len(sampled_issues)

    for issue in sampled_issues:
        created_at = datetime.datetime.strptime(issue['created_at'], timestamp_format)
        monthly_issues_count[(created_at.year, created_at.month)] += 1

        if issue['state'] == 'closed':
            closed_issues_count += 1
            closed_at = datetime.datetime.strptime(issue['closed_at'], timestamp_format)
            total_time_to_close += (closed_at - created_at).total_seconds()

        comments_path = f"{repos_data_path}/{owner}_{repo}/comments/{issue['number']}.json"
        if not os.path.exists(comments_path):
            continue

        with open(comments_path, 'r') as f:
            comments = json.load(f)

        # An empty comment file has no first comment to time; skip it rather
        # than raising IndexError on comments[0].
        if not comments:
            continue

        # Response time is measured to the first entry of the comment file.
        first_comment_time = datetime.datetime.strptime(comments[0]['created_at'], timestamp_format)
        total_time_to_comment += (first_comment_time - created_at).total_seconds()
        commented_issues_count += 1

        participants.update(comment['user']['login'] for comment in comments)

    # Durations are accumulated in seconds and reported in days.
    seconds_per_day = 24 * 3600
    metrics.at[index, 'issue_closed_rate'] = f"{(closed_issues_count / issues_count):.2f}" if issues_count > 0 else "N/A"
    metrics.at[index, 'issue_time_to_close'] = f"{(total_time_to_close / closed_issues_count / seconds_per_day):.1f}" if closed_issues_count > 0 else "N/A"
    metrics.at[index, 'issue_time_to_response'] = f"{(total_time_to_comment / commented_issues_count / seconds_per_day):.1f}" if commented_issues_count > 0 else "N/A"
    # Mean number of issues per calendar month that has at least one issue.
    metrics.at[index, 'issue_monthly'] = f"{(sum(monthly_issues_count.values()) / len(monthly_issues_count)):.1f}" if len(monthly_issues_count) > 0 else "N/A"
    metrics.at[index, 'issue_participants'] = f"{len(participants):d}"
    # Distinct commenters divided by the number of sampled issues.
    metrics.at[index, 'issue_per_participants'] = f"{(len(participants) / issues_count):.1f}" if issues_count > 0 else "N/A"

In [10]:
# Create (or reset) the labeled_issues column with None as the placeholder value.
metrics['labeled_issues'] = None

In [12]:
# Count, among the first 500 entries of issues.json, the issues that carry at
# least one label.
#
# The issue list is reset for every row, so a repository without issues.json
# is reported as "0" instead of reusing the previous repository's count (or
# raising NameError when the very first repository has no file).
for index, row in metrics.iterrows():

    owner = row['owner']
    repo = row['repo']

    issues_path = f"{repos_data_path}/{owner}_{repo}/issues.json"

    issues = []
    if os.path.exists(issues_path):
        with open(issues_path, 'r') as f:
            # A file holding JSON null is treated as "no issues".
            issues = json.load(f) or []

    labeled_issues_count = sum(1 for issue in issues[:500] if issue['labels'])

    metrics.at[index, 'labeled_issues'] = f"{labeled_issues_count:d}"

In [53]:
# Create (or reset) every pull-request metric column with None as the placeholder value.
for pulls_column in (
    'pulls_closed_rate',
    'pulls_merged_rate',
    'pulls_time_to_close',
    'pulls_time_to_response',
    'pulls_monthly',
    'pulls_participants',
    'pulls_per_participants',
):
    metrics[pulls_column] = None

In [54]:
# Pull-request activity metrics per repository, computed from at most the
# first 500 entries of pulls.json plus the per-number files under comments/.
#
# All accumulators are reset at the top of every iteration, so a repository
# whose pulls.json is missing (or contains JSON null) is reported like one
# with zero pull requests ("N/A" for the ratios, "0" participants) instead of
# reusing the previous repository's numbers, failing with a NameError, or
# leaving the row unset.
timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

for index, row in metrics.iterrows():

    owner = row['owner']
    repo = row['repo']

    pulls_path = f"{repos_data_path}/{owner}_{repo}/pulls.json"

    closed_pulls_count = 0
    merged_pulls_count = 0
    commented_pulls_count = 0
    total_time_to_comment = 0
    total_time_to_close = 0
    monthly_pulls_count = defaultdict(int)
    participants = set()

    pulls = []
    if os.path.exists(pulls_path):
        with open(pulls_path, 'r') as f:
            # A file holding JSON null is treated as "no pull requests".
            pulls = json.load(f) or []

    sampled_pulls = pulls[:500]
    pulls_count = len(sampled_pulls)

    for pull in sampled_pulls:

        if pull['merged_at']:
            merged_pulls_count += 1

        created_at = datetime.datetime.strptime(pull['created_at'], timestamp_format)
        monthly_pulls_count[(created_at.year, created_at.month)] += 1

        if pull['state'] == 'closed':
            closed_pulls_count += 1
            closed_at = datetime.datetime.strptime(pull['closed_at'], timestamp_format)
            total_time_to_close += (closed_at - created_at).total_seconds()

        comments_path = f"{repos_data_path}/{owner}_{repo}/comments/{pull['number']}.json"
        if not os.path.exists(comments_path):
            continue

        with open(comments_path, 'r') as f:
            comments = json.load(f)

        # An empty comment file has no first comment to time; skip it rather
        # than raising IndexError on comments[0].
        if not comments:
            continue

        # Response time is measured to the first entry of the comment file.
        first_comment_time = datetime.datetime.strptime(comments[0]['created_at'], timestamp_format)
        total_time_to_comment += (first_comment_time - created_at).total_seconds()
        commented_pulls_count += 1

        participants.update(comment['user']['login'] for comment in comments)

    # Durations are accumulated in seconds and reported in days.
    seconds_per_day = 24 * 3600
    # NOTE(review): the two rates below are rounded to one decimal while
    # issue_closed_rate uses two; confirm which precision is intended.
    metrics.at[index, 'pulls_closed_rate'] = f"{(closed_pulls_count / pulls_count):.1f}" if pulls_count > 0 else "N/A"
    # Merged pull requests as a share of closed ones.
    metrics.at[index, 'pulls_merged_rate'] = f"{(merged_pulls_count / closed_pulls_count):.1f}" if closed_pulls_count > 0 else "N/A"
    metrics.at[index, 'pulls_time_to_close'] = f"{(total_time_to_close / closed_pulls_count / seconds_per_day):.1f}" if closed_pulls_count > 0 else "N/A"
    metrics.at[index, 'pulls_time_to_response'] = f"{(total_time_to_comment / commented_pulls_count / seconds_per_day):.1f}" if commented_pulls_count > 0 else "N/A"
    # Mean number of pull requests per calendar month that has at least one.
    metrics.at[index, 'pulls_monthly'] = f"{(sum(monthly_pulls_count.values()) / len(monthly_pulls_count)):.1f}" if len(monthly_pulls_count) > 0 else "N/A"
    metrics.at[index, 'pulls_participants'] = f"{len(participants):d}"
    # Distinct commenters divided by the number of sampled pull requests.
    metrics.at[index, 'pulls_per_participants'] = f"{(len(participants) / pulls_count):.1f}" if pulls_count > 0 else "N/A"

In [18]:
# Create (or reset) the average-comment columns with None as the placeholder value.
for average_column in ('issues_average_comments', 'pulls_average_comments'):
    metrics[average_column] = None

In [19]:
# Average value of the 'comments' field over the first 500 entries of issues.json.
#
# Two fixes over the earlier version:
#   * the issue list is reset for every row, so a repository without
#     issues.json gets "N/A" instead of the previous repository's average;
#   * the sum is divided by the number of issues actually summed (at most
#     500), not by the full length of the file.
for index, row in metrics.iterrows():

    owner = row['owner']
    repo = row['repo']

    issues_path = f"{repos_data_path}/{owner}_{repo}/issues.json"

    issues = []
    if os.path.exists(issues_path):
        with open(issues_path, 'r') as f:
            # A file holding JSON null is treated as "no issues".
            issues = json.load(f) or []

    sampled_issues = issues[:500]
    total_comments = sum(issue['comments'] for issue in sampled_issues)

    metrics.at[index, 'issues_average_comments'] = f"{(total_comments / len(sampled_issues)):.1f}" if sampled_issues else "N/A"

In [21]:
# Average value of the 'comments' field over the first 500 entries of pulls2.json.
#
# Two fixes over the earlier version:
#   * the pull list is reset for every row, so a repository without
#     pulls2.json gets "N/A" instead of the previous repository's average;
#   * the sum is divided by the number of pulls actually summed (at most
#     500), not by the full length of the file.
for index, row in metrics.iterrows():

    owner = row['owner']
    repo = row['repo']

    # NOTE(review): this reads pulls2.json, not the pulls.json used for the
    # other pull metrics; presumably only this file carries 'comments' - confirm.
    pulls_path = f"{repos_data_path}/{owner}_{repo}/pulls2.json"

    pulls = []
    if os.path.exists(pulls_path):
        with open(pulls_path, 'r') as f:
            # A file holding JSON null is treated as "no pull requests".
            pulls = json.load(f) or []

    sampled_pulls = pulls[:500]
    total_comments = sum(pull['comments'] for pull in sampled_pulls)

    metrics.at[index, 'pulls_average_comments'] = f"{(total_comments / len(sampled_pulls)):.1f}" if sampled_pulls else "N/A"

In [42]:
# Create (or reset) the scorecard-derived columns with None as the placeholder value.
for score_column in ('code_review_score', 'maintained_score'):
    metrics[score_column] = None

In [43]:
# Copy the "Code-Review" and "Maintained" check scores from each repository's
# scorecard.json into the metrics table.
#
# scorecard is reset to None for every row, so a repository without
# scorecard.json keeps None in both columns instead of receiving the previous
# repository's scores (or raising NameError on the first row).
# (metrics column, name of the scorecard check it is read from)
score_columns = (
    ('code_review_score', "Code-Review"),
    ('maintained_score', "Maintained"),
)

for index, row in metrics.iterrows():
    owner = row['owner']
    repo = row['repo']
    scorecard_path = f"{repos_data_path}/{owner}_{repo}/scorecard.json"

    scorecard = None
    if os.path.exists(scorecard_path):
        with open(scorecard_path, 'r') as f:
            scorecard = json.load(f)

    # Nothing to record when the file is absent, empty, or has no checks.
    if not scorecard or not scorecard.get('checks'):
        continue

    for check in scorecard['checks']:
        for score_column, check_name in score_columns:
            if check['name'] == check_name:
                metrics.at[index, score_column] = f"{check['score']}"

In [48]:
# Create (or reset) the vulnerabilities column with None as the placeholder value.
metrics['vulnerabilities']= None

In [49]:
# Record, for each repository, the first whitespace-separated token of the
# "Vulnerabilities" check's reason text from scorecard.json.
# NOTE(review): presumably that token is the vulnerability count - confirm
# against the scorecard output format.
#
# scorecard is reset to None for every row, so a repository without
# scorecard.json keeps None instead of receiving the previous repository's
# value (or raising NameError on the first row).
for index, row in metrics.iterrows():
    owner = row['owner']
    repo = row['repo']
    scorecard_path = f"{repos_data_path}/{owner}_{repo}/scorecard.json"

    scorecard = None
    if os.path.exists(scorecard_path):
        with open(scorecard_path, 'r') as f:
            scorecard = json.load(f)

    # Nothing to record when the file is absent, empty, or has no checks.
    if not scorecard or not scorecard.get('checks'):
        continue

    for check in scorecard['checks']:
        if check['name'] != "Vulnerabilities":
            continue
        reason_tokens = check['reason'].split()
        # An empty reason has no first token; leave the cell as None.
        if reason_tokens:
            metrics.at[index, 'vulnerabilities'] = f"{reason_tokens[0]}"

In [None]:
# Create (or reset) the dependents columns with None as the placeholder value.
for dependents_column in ('dependent_repositories', 'dependent_packages'):
    metrics[dependents_column] = None

In [None]:
# Copy the 'repositories' and 'packages' values from each repository's
# dependents.json into the metrics table; rows without that file are left untouched.
for index, row in metrics.iterrows():
    owner, repo = row['owner'], row['repo']
    dependents_path = f"{repos_data_path}/{owner}_{repo}/dependents.json"

    if not os.path.exists(dependents_path):
        continue

    with open(dependents_path, 'r') as f:
        dependents = json.load(f)

    # (metrics column, key in dependents.json), written in this order.
    for dependents_column, json_key in (
        ('dependent_repositories', 'repositories'),
        ('dependent_packages', 'packages'),
    ):
        metrics.at[index, dependents_column] = f"{dependents[json_key]}"