In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import json
from collections import defaultdict
from datetime import datetime, timezone
from scipy import stats

In [2]:
# Yujin's calculations for the base percentile metrics

def _read_percentile_metrics(csv_path):
    """Read a percentile-metrics CSV and make columns 3..N numeric.

    The first two columns are left exactly as ``pd.read_csv`` parsed them.
    Every later column is passed through ``pd.to_numeric`` with its default
    ``errors="raise"``, so a non-numeric value raises instead of being
    silently coerced.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the converted columns.
    """
    frame = pd.read_csv(csv_path)
    value_cols = list(frame.columns[2:])
    if value_cols:
        # Assign by column label instead of ``frame.iloc[:, 2:] = ...``:
        # on pandas >= 2.0 a positional slice assignment writes into the
        # existing arrays, so the converted dtype would not be kept.
        frame[value_cols] = frame.iloc[:, 2:].apply(pd.to_numeric)
    return frame


roughviz_metrics_path = "../metrics/roughViz_percentiles.csv"
roughviz_metrics = _read_percentile_metrics(roughviz_metrics_path)

twopasswords_metrics_path = "../metrics/twopasswords_percentiles.csv"
twopasswords_metrics = _read_percentile_metrics(twopasswords_metrics_path)

voicelistener_metrics_path = "../metrics/voice-listener_percentiles.csv"
voicelistener_metrics = _read_percentile_metrics(voicelistener_metrics_path)


In [3]:
# Yujin's calculations for the Trustee label (MCPC)

def _read_prototype_metrics(csv_path):
    """Read a Trustee prototype CSV and make columns 3..N numeric.

    The first two columns are left exactly as ``pd.read_csv`` parsed them.
    Every later column is passed through ``pd.to_numeric`` with its default
    ``errors="raise"``, so a non-numeric value raises instead of being
    silently coerced.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the converted columns.
    """
    frame = pd.read_csv(csv_path)
    value_cols = list(frame.columns[2:])
    if value_cols:
        # Assign by column label instead of ``frame.iloc[:, 2:] = ...``:
        # on pandas >= 2.0 a positional slice assignment writes into the
        # existing arrays, so the converted dtype would not be kept.
        frame[value_cols] = frame.iloc[:, 2:].apply(pd.to_numeric)
    return frame


roughviz_trustee_path = "../metrics/roughViz_prototype.csv"
roughviz_trustee = _read_prototype_metrics(roughviz_trustee_path)

twopasswords_trustee_path = "../metrics/twopasswords_prototype.csv"
twopasswords_trustee = _read_prototype_metrics(twopasswords_trustee_path)

voicelistener_trustee_path = "../metrics/voice-listener_prototype.csv"
voicelistener_trustee = _read_prototype_metrics(voicelistener_trustee_path)


In [4]:
# Per-repository lookup tables. Both dicts share the same keys in the same
# order; the key is the repo label that the processing loop later iterates
# over and embeds in each output file name.

# Trustee (prototype) frames, one per repo.
repos_trustee = dict(
    roughViz=roughviz_trustee,
    twopasswords=twopasswords_trustee,
    voice_listener=voicelistener_trustee,
)

# Base percentile-metric frames, one per repo.
repos_metrics = dict(
    roughViz=roughviz_metrics,
    twopasswords=twopasswords_metrics,
    voice_listener=voicelistener_metrics,
)

In [5]:
# Rank-based letter grades, lowest to highest.
#
# An earlier (now unused) scheme expressed the same idea as equal 20-point
# percentile bands:
#     F: 0-19, D: 20-39, C: 40-59, B: 60-79, A: 80-100
#
# The current scheme bins the rank of each row instead. The edges are handed
# to pd.cut with right=False, so the bands are [0, 200), [200, 400),
# [400, 600), [600, 800) and [800, 1010).
# NOTE(review): these edges look sized for roughly 1,000 ranked rows -
# confirm against the actual row count of the metrics files.
grades_labels = list("FDCBA")
grades_bins = [*range(0, 1000, 200), 1010]

In [6]:
# Z-score bin edges for the letter grades, used with pd.cut(right=False) and
# the same five labels as the rank-based grades. Each bin includes its lower
# edge and excludes its upper edge; a z-score outside [-100, 100) falls in
# no bin.
z_grades_bins = [
    -100,  # lower bound of the lowest band
    -1.5,
    -0.5,
    0.5,
    1.5,
    100,   # upper bound of the highest band
]

# Calculate Trust Component Label Metrics

In [20]:
# Column layout of the combined prototype summary. These columns come first
# in the output, in this order; any additional columns carried by the
# per-repo frames are appended after them.
combined_columns = [
    'owner',
    'repo',
    'maintenance',
    'issues_maintenance',
    'code_maintenance',
    'community_documentation',
    'maintainer_history',
    'contribution',
    'contributor_participation',
    'code_contribution',
    'contributor_growth',
    'usage_popularity',
    'stars_and_watches',
    'forks',
    'downstream_dependents',
    'project_maturity',
    'code_quality',
    'dependencies_health',
    'testing_quality',
    'review_coverage',
    'community_activity_and_integrity',
    'trustee_avg',
    'popularity_component',
    'community_activity_and_integrity_component',
    'maintenance_and_goodwill_component',
    'code_quality_component',
    'component_avg',
    'component_rank',
    'component_rank_grade',
    'trustee_rank',
    'trustee_rank_grade',
    'component_zscore',
    'component_z_grade',
    'trustee_zscore',
    'trustee_z_grade',
]


def _add_rank_grade(frame, prefix):
    """Add ``<prefix>_rank`` and ``<prefix>_rank_grade`` computed from ``<prefix>_avg``.

    The rank comes from ``Series.rank()`` and is binned with ``grades_bins``
    (left-closed intervals) into ``grades_labels``. ``frame`` is modified in
    place.
    """
    rank_col = prefix + "_rank"
    frame[rank_col] = frame[prefix + "_avg"].rank()
    frame[prefix + "_rank_grade"] = pd.cut(
        frame[rank_col], bins=grades_bins, right=False, labels=grades_labels
    )


def _add_z_grade(frame, prefix):
    """Add ``<prefix>_zscore`` and ``<prefix>_z_grade`` computed from ``<prefix>_avg``.

    The z-score comes from ``scipy.stats.zscore`` and is binned with
    ``z_grades_bins`` (left-closed intervals) into ``grades_labels``.
    ``frame`` is modified in place.
    """
    z_col = prefix + "_zscore"
    frame[z_col] = stats.zscore(frame[prefix + "_avg"])
    frame[prefix + "_z_grade"] = pd.cut(
        frame[z_col], bins=z_grades_bins, right=False, labels=grades_labels
    )


# One summary row per repo is collected here and concatenated once after the
# loop, instead of growing a DataFrame with pd.concat on every iteration.
prototype_rows = []

for repo in repos_trustee:
    # fillna/rename both return new frames, so the frame stored in
    # repos_trustee is left untouched.
    component = (
        repos_trustee[repo]
        .fillna(0)
        .rename(columns={'avg_percentile': 'trustee_avg'})
    )

    # re-calculate component version of popularity
    # The source frame in repos_metrics is not NaN-filled; mean() uses the
    # pandas default of skipping NaN. The assignment aligns on the row index.
    # NOTE(review): this reads "stars_and_watchers" while the summary column
    # list above names "stars_and_watches" - confirm which spelling the CSVs use.
    component["popularity_component"] = repos_metrics[repo][
        ["stars_and_watchers", "forks", "downstream_dependents"]
    ].mean(axis=1)

    # calculate component top-level metrics
    component["community_activity_and_integrity_component"] = component[
        ["popularity_component", "code_contribution", "contributor_participation", "contributor_growth"]
    ].mean(axis=1)
    component["maintenance_and_goodwill_component"] = component[
        ["issues_maintenance", "community_documentation", "code_maintenance", "maintainer_history"]
    ].mean(axis=1)
    component["code_quality_component"] = component[
        ["dependencies_health", "review_coverage", "testing_quality", "project_maturity"]
    ].mean(axis=1)
    component["component_avg"] = component[
        ["community_activity_and_integrity_component", "maintenance_and_goodwill_component", "code_quality_component"]
    ].mean(axis=1)

    # rank and grades based on equal 20% distribution rank
    # (component first, then trustee: this fixes the output column order)
    _add_rank_grade(component, "component")
    _add_rank_grade(component, "trustee")

    # z-score and grades based on -1.5/-0.5/0.5/1.5 z-score
    _add_z_grade(component, "component")
    _add_z_grade(component, "trustee")

    output_file = "../metrics/" + repo + "_both_prototypes_v2.csv"
    component.to_csv(output_file, index=False)

    # Only the last row of each frame goes into the combined summary.
    # NOTE(review): presumably the prototype repo itself is the final row of
    # each CSV - verify against the input files.
    prototype_rows.append(component.tail(1))

# all metrics for the 3 prototype repos
if prototype_rows:
    combined_metrics = pd.concat(prototype_rows, ignore_index=True)
    extra_columns = [col for col in combined_metrics.columns if col not in combined_columns]
    # Declared columns first (empty when a repo frame lacks them), extras after.
    combined_metrics = combined_metrics.reindex(columns=combined_columns + extra_columns)
else:
    combined_metrics = pd.DataFrame(columns=combined_columns)
combined_metrics.to_csv("../metrics/prototype_metrics.csv", index=False)
