#### The goal of this notebook is to enumerate all the repository metrics that are potentially useful for trustworthiness assessment

In [13]:
import numpy as np
import pandas as pd

In [14]:
### Final metric criteria:
### 1. We do not include metrics that are already easy to access on GitHub pages.
### 2. We do not include metrics that require significant resources to build. An initial set of simple metrics can go a long way.
### 3. We include even metrics that are not yet available; we will remove them at the end.
### 4. We focus on quantifiable metrics.

In [15]:
## Github

# 1. Star
# 2. Fork
# 3. Watching
# 4. Commits
# 5. Issue
# 6. PL (Programming Language used by the OSS)
# 7. Pull Request
# 8. Actions (I assume that unit tests are a part of Actions, but I may be wrong.)
# 9. Docs (Readme.md, Governance.md, Open source license.md, Security.md)
# 10. Contributor
# 11. Code

In [16]:
# Raw metric categories.

# Basic repository features; these are the starting point.
# NOTE(review): "folks" is presumably a typo for GitHub "forks". It is left
# unchanged because the string is also used to build derived metric names;
# confirm before renaming.
basic_metrics = [
    "stars",
    "folks",
    "subscribers",
    "commits",
    "issues",
    "PL",
    "pulls",
    "actions",
    "docs",
    "contributors",
    "code",
]

# Sources of mostly cosmetic information, e.g. whether commit messages are
# well formatted and detailed, or whether the code has enough comments.
# "code" here means in-code comments.
cosmetic_metrics = [
    "commits",
    "issues",
    "pulls",
    "docs",
    "code",
]

# Metrics that involve code.
code_metrics = [
    "commits",
    "pulls",
]

# Metrics that involve timestamps.
longitudinal_metrics = [
    "commits",
    "issues",
    "pulls",
    "actions",
]

# Metrics that involve contributors.
contributor_metrics = [
    "stars",
    "folks",
    "subscribers",
    "commits",
    "issues",
    "pulls",
]

# Special metrics. More need to be enumerated; all external metrics possibly
# belong here as well.
special_metrics = [
    "languages",
]


In [17]:
# Enumeration suffixes ("combos") that get attached to the raw metrics.
# The first letter encodes the data type: "b" = binary, "n" = numeric.
cosmetic_combos = [
    "b_msg_format",
    "n_msg_length",
]

code_combos = [
    "n_total_code_length",
    "n_avg_code_length",
    "n_avg_num_of_involved_files",
]

# "n_percentage_succeed" refers to whether an issue is closed, whether a pull
# request is merged, etc.
longitudinal_combos = [
    "n_earliest_time",
    "n_most_recent_time",
    "n_totalnum",
    "n_percentage_succeed",
]

# Contributors are defined broadly here: a participant involved in an issue
# discussion also counts as a contributor.
contributor_combos = [
    "n_total_contributor",
    "n_avg_contributor",
]



In [19]:
# Build every "<metric>_<combo>" name by crossing each raw-metric category
# with its matching combo list. Categories are processed in the order
# cosmetic, code, longitudinal, contributor; within a category the raw metric
# varies slowest and the combo fastest.
metric_groups = [
    (cosmetic_metrics, cosmetic_combos),
    (code_metrics, code_combos),
    (longitudinal_metrics, longitudinal_combos),
    (contributor_metrics, contributor_combos),
]

all_metrics = []
for raw_names, combo_names in metric_groups:
    all_metrics.extend(
        raw_name + "_" + combo_name
        for raw_name in raw_names
        for combo_name in combo_names
    )

# TODO: Add special metrics here.
print(all_metrics)
print(len(all_metrics))

['commits_b_msg_format', 'commits_n_msg_length', 'issues_b_msg_format', 'issues_n_msg_length', 'pulls_b_msg_format', 'pulls_n_msg_length', 'docs_b_msg_format', 'docs_n_msg_length', 'code_b_msg_format', 'code_n_msg_length', 'commits_n_total_code_length', 'commits_n_avg_code_length', 'commits_n_avg_num_of_involved_files', 'pulls_n_total_code_length', 'pulls_n_avg_code_length', 'pulls_n_avg_num_of_involved_files', 'commits_n_earliest_time', 'commits_n_most_recent_time', 'commits_n_totalnum', 'issues_n_earliest_time', 'issues_n_most_recent_time', 'issues_n_totalnum', 'pulls_n_earliest_time', 'pulls_n_most_recent_time', 'pulls_n_totalnum', 'actions_n_earliest_time', 'actions_n_most_recent_time', 'actions_n_totalnum', 'stars_n_total_contributor', 'stars_n_avg_contributor', 'folks_n_total_contributor', 'folks_n_avg_contributor', 'subscribers_n_total_contributor', 'subscribers_n_avg_contributor', 'commits_n_total_contributor', 'commits_n_avg_contributor', 'issues_n_total_contributor', 'issues_