You need to set up your own notebook/Python environment in order to run this. We do not include a guide on the versions used.

In [None]:
import pymc3 as pm
import arviz as az
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
from scipy.special import expit as logistic
from scipy.special import softmax

In [None]:
# Load the fork/commit export. Later cells read the `commits`, `created_at`,
# `repository` and `owner_login` columns from this frame.
fork_commits_csv = 'forkCommits.csv'
df_fc = pd.read_csv(fork_commits_csv)
df_fc.head()

In [None]:
import ast
from datetime import datetime

df_dr = pd.read_csv('datareps.csv')

listofavergaes = []  # not used by this cell
# Running totals per repository, each stored as {"count": n, "sum": total}.
repo_day_avg = {}
line_additions_avg = {}
aditions = 0


def _accumulate(totals, repository, value):
    """Add one observation for `repository` to a count/sum running total."""
    entry = totals.setdefault(repository, {"count": 0, "sum": 0})
    entry["count"] = entry["count"] + 1
    entry["sum"] = entry["sum"] + value


for _, fork in df_fc.iterrows():
    # `commits` is stored as the text of a Python literal (a list of dicts).
    commits = ast.literal_eval(fork.commits)
    if not commits:
        continue

    first_commit_at = datetime.fromisoformat(commits[0]['date'])
    forked_at = datetime.fromisoformat(fork.created_at)
    days_until_first_commit = (first_commit_at - forked_at).days
    if days_until_first_commit < 0:
        # First listed commit predates the fork: print the owner so the row
        # can be inspected. The value still goes into the average.
        print(fork.owner_login)

    _accumulate(repo_day_avg, fork.repository, days_until_first_commit)

    for commit in commits:
        aditions = int(commit['lines'])
        _accumulate(line_additions_avg, fork.repository, aditions)

# Write the per-repository means onto the matching rows of df_dr.
for repository, totals in repo_day_avg.items():
    matches = df_dr.github_url == repository
    df_dr.loc[matches, "avg_days_until_fc"] = totals["sum"] / totals["count"]

for repository, totals in line_additions_avg.items():
    matches = df_dr.github_url == repository
    df_dr.loc[matches, "avg_lines_comitted"] = totals["sum"] / totals["count"]

In [None]:
# Build the per-fork modelling tables:
#   df_model      - one row per fork with commits: days until the first commit
#   df_line_model - one row per fork with commits: total lines over its commits
# Both are joined with the repository-level columns of df_dr on `github_url`.

# `DataFrame.drop` returns a new frame, so the filtered result has to be
# captured for the filter to take effect. A separate name is used so that
# df_fc itself stays untouched for other cells.
forks_with_commits = df_fc[df_fc["commits"] != "[]"]

arr = []
arrLines = []
for index, row in forks_with_commits.iterrows():
    list_of_commits = ast.literal_eval(row.commits)
    # Kept as a guard for empty lists that are not serialised exactly as "[]".
    if len(list_of_commits) > 0:
        date_of_first_commit = datetime.fromisoformat(list_of_commits[0]['date'])
        fork_day = datetime.fromisoformat(row.created_at)
        days_until_first_commit = (date_of_first_commit - fork_day).days
        arr.append([str(row.repository), days_until_first_commit])
        line_sum = sum(int(commit['lines']) for commit in list_of_commits)
        arrLines.append([str(row.repository), line_sum])

model_input = pd.DataFrame(arr, columns=["github_url", "days"])
model_line_input = pd.DataFrame(arrLines, columns=["github_url", "lines"])

# Make sure the join key has the same dtype on both sides of the merge.
df_dr["github_url"] = df_dr["github_url"].astype(str)
model_input["github_url"] = model_input["github_url"].astype(str)
model_line_input["github_url"] = model_line_input["github_url"].astype(str)

# Exclude the per-repository average from the modelling tables. The result of
# `drop` must be assigned to have any effect; `errors="ignore"` lets this cell
# run even when the averaging cell has not added the column.
df_dr_features = df_dr.drop(columns=["avg_days_until_fc"], errors="ignore")

df_model = pd.merge(model_input, df_dr_features, on="github_url", how="inner")
df_line_model = pd.merge(model_line_input, df_dr_features, on="github_url", how="inner")

In [None]:
# Standardise `days`: subtract the mean and divide by the standard deviation
# (pandas' default `Series.std`).
days_raw = df_model["days"]
df_model["days_std"] = (days_raw - days_raw.mean()) / days_raw.std()
df_model["days_std"]

In [None]:
# Linear regression of the standardised days-until-first-commit on six
# repository documentation columns of df_model.
# NOTE(review): the predictors are presumably 0/1 indicators (a later cell
# feeds in every 0/1 combination) - confirm against datareps.csv.
with pm.Model() as m_days:
    # Predictors are wrapped in pm.Data so a later cell can replace them with
    # pm.set_data before drawing posterior predictions.
    Xcg = pm.Data("CG", df_model["contributor_guidelines"])
    Xpv = pm.Data("PV", df_model["project_vision"])
    Xce = pm.Data("CE", df_model["code_examples"])
    Xad = pm.Data("AD", df_model["architectural_description"])
    Xsp = pm.Data("SP", df_model["setup"])
    Xtd = pm.Data("TD", df_model["test_description"])

    # Priors: intercept `a` and one slope per predictor, all centred on 0.
    a = pm.Normal("a", 0, 0.2)
    bCG = pm.Normal("bCG", 0, 0.3)
    bPV = pm.Normal("bPV", 0, 0.3)
    bCE = pm.Normal("bCE", 0, 0.3)
    bAD = pm.Normal("bAD", 0, 0.3)
    bSP = pm.Normal("bSP", 0, 0.3)
    bTD = pm.Normal("bTD", 0, 0.3)

    # Residual scale and the linear predictor (stored in the trace as "mu").
    sigma = pm.Exponential("sigma", 1)
    mu = pm.Deterministic("mu", a + bCG * Xcg + bPV * Xpv + bCE * Xce + bAD * Xad + bSP * Xsp + bTD * Xtd)

    # Likelihood on the standardised outcome.
    days = pm.Normal(
        "days", mu=mu, sigma=sigma, observed=df_model["days_std"]
    )
    # Prior draws are used by the prior predictive plot cell; the posterior
    # is sampled with return_inferencedata=False.
    prior_samples = pm.sample_prior_predictive()
    m_days_trace = pm.sample(return_inferencedata=False)

In [None]:
# Trace plot for the days model, drawn inside the model context.
with m_days:
    testtrace = az.plot_trace(m_days_trace)

In [54]:
# Posterior summary (90% HDI) of the intercept, slopes and residual scale.
with m_days:
    test = az.summary(
        m_days_trace,
        var_names=["a", "bCG", "bPV", "bCE", "bAD", "bSP", "bTD", "sigma"],
        hdi_prob=.90,
    )

In [None]:
# Display the summary table held in `test`.
test

In [None]:
# Prior predictive check for the days model: for each predictor, draw the
# lines implied by 50 prior draws of the intercept and that predictor's slope
# as the predictor goes from 0 to 1.
fig, ax = plt.subplots(3, 2, figsize=(10, 8))
x = np.linspace(0, 1, 50)

# (x-axis label, slope name in prior_samples, subplot position).
# One table-driven loop replaces six copies of the same plotting code, and the
# loop variables no longer overwrite the notebook-level model variable `a`.
prior_panels = [
    ("CG", "bCG", (0, 0)),
    ("PV", "bPV", (0, 1)),
    ("CE", "bCE", (1, 0)),
    ("AD", "bAD", (1, 1)),
    ("SP", "bSP", (2, 1)),
    ("TD", "bTD", (2, 0)),
]

intercept_draws = prior_samples["a"][:50]
for predictor_label, slope_name, position in prior_panels:
    panel = ax[position]
    for intercept, slope in zip(intercept_draws, prior_samples[slope_name][:50]):
        panel.plot(x, intercept + slope * x, c="black", alpha=0.4)
    panel.set_xlabel(predictor_label)
    # NOTE(review): the y-axis shows intercept + slope * x, not the slope
    # itself; the label is kept as it was - confirm it is the intended one.
    panel.set_ylabel(slope_name)

plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.7)
fig.suptitle("Prior predictive check");

In [None]:
import itertools

with m_days:
    # Every 0/1 setting of the six predictors, shaped (64, 1, 6) so that
    # all_comb[:, 0, k] is the column for predictor k.
    all_comb = np.array(list(itertools.product([0, 1], repeat=6))).reshape(-1, 1, 6)
    print(all_comb[:, 0, 0])
    # Swap the observed predictors for the full grid, then draw predictions.
    data_names = ["CG", "PV", "CE", "AD", "SP", "TD"]
    pm.set_data({name: all_comb[:, 0, col] for col, name in enumerate(data_names)})
    ppc = pm.sample_posterior_predictive(m_days_trace)

In [None]:
# Box plot of the posterior predictive draws of "days", with the y-axis
# limited to [-5, 5].
fig, ax = plt.subplots(figsize=(20, 10))
days_draws = ppc["days"]
sns.boxplot(data=days_draws, showmeans=True, ax=ax)
ax.set(ylim=(-5, 5))

In [None]:
# Compare the mean posterior prediction when one predictor is off vs. on.
# Column 5 of all_comb is the one fed to "TD" (test_description).
flag = all_comb[:, 0, 5]
v0 = (flag == 0).nonzero()
v1 = (flag == 1).nonzero()
# The predictor combinations are expected to lie along the LAST axis of
# ppc["days"] (PyMC3 returns draws x data points), so select along that axis.
# Indexing with `[v0]` directly would pick posterior draws instead.
# NOTE(review): confirm ppc["days"].shape is (n_draws, 64) before relying on
# these numbers; `ppc` is also reassigned by the lines-model cell.
r0 = ppc["days"][..., v0[0]].mean()
r1 = ppc["days"][..., v1[0]].mean()
print(r0,r1)

In [None]:
# The likelihood below is fitted to df_line_model["lines_std"]. Create the
# column when it is missing, using the same centre-and-scale formula as
# `days_std`. The guard leaves an existing column untouched.
if "lines_std" not in df_line_model.columns:
    df_line_model["lines_std"] = (
        df_line_model["lines"] - df_line_model["lines"].mean()
    ) / df_line_model["lines"].std()

# Linear regression of the standardised committed-lines total on the same six
# documentation columns used by the days model.
with pm.Model() as m_lines:
    # Predictors are wrapped in pm.Data so a later cell can replace them with
    # pm.set_data before drawing posterior predictions.
    Xcg = pm.Data("CG", df_line_model["contributor_guidelines"])
    Xpv = pm.Data("PV", df_line_model["project_vision"])
    Xce = pm.Data("CE", df_line_model["code_examples"])
    Xad = pm.Data("AD", df_line_model["architectural_description"])
    Xsp = pm.Data("SP", df_line_model["setup"])
    Xtd = pm.Data("TD", df_line_model["test_description"])

    # Priors: intercept `a` and one slope per predictor, all centred on 0.
    a = pm.Normal("a", 0, 0.2)
    bCG = pm.Normal("bCG", 0, 0.3)
    bPV = pm.Normal("bPV", 0, 0.3)
    bCE = pm.Normal("bCE", 0, 0.3)
    bAD = pm.Normal("bAD", 0, 0.3)
    bSP = pm.Normal("bSP", 0, 0.3)
    bTD = pm.Normal("bTD", 0, 0.3)

    # Residual scale and the linear predictor (stored in the trace as "mu").
    sigma = pm.Exponential("sigma", 1)
    mu = pm.Deterministic("mu", a + bCG * Xcg + bPV * Xpv + bCE * Xce + bAD * Xad + bSP * Xsp + bTD * Xtd)

    # Likelihood on the standardised outcome.
    lines = pm.Normal(
        "lines", mu=mu, sigma=sigma, observed=df_line_model["lines_std"]
    )
    prior_samples_lines = pm.sample_prior_predictive()
    m_lines_trace = pm.sample(return_inferencedata=False)

In [None]:
# Posterior summary (90% HDI) of the lines model's intercept, slopes and
# residual scale. This rebinds `test`.
with m_lines:
    test = az.summary(
        m_lines_trace,
        var_names=["a", "bCG", "bPV", "bCE", "bAD", "bSP", "bTD", "sigma"],
        hdi_prob=.90,
    )

In [None]:
# Display the summary table held in `test`.
test

In [None]:
# Prior predictive check for the lines model: for each predictor, draw the
# lines implied by 50 prior draws of the intercept and that predictor's slope
# as the predictor goes from 0 to 1.
fig, ax = plt.subplots(3, 2, figsize=(10, 8))
x = np.linspace(0, 1, 50)

# (x-axis label, slope name in prior_samples_lines, subplot position).
# One table-driven loop replaces six copies of the same plotting code, and the
# loop variables no longer overwrite the notebook-level model variable `a`.
prior_panels = [
    ("CG", "bCG", (0, 0)),
    ("PV", "bPV", (0, 1)),
    ("CE", "bCE", (1, 0)),
    ("AD", "bAD", (1, 1)),
    ("SP", "bSP", (2, 1)),
    ("TD", "bTD", (2, 0)),
]

intercept_draws = prior_samples_lines["a"][:50]
for predictor_label, slope_name, position in prior_panels:
    panel = ax[position]
    for intercept, slope in zip(intercept_draws, prior_samples_lines[slope_name][:50]):
        panel.plot(x, intercept + slope * x, c="black", alpha=0.4)
    panel.set_xlabel(predictor_label)
    # NOTE(review): the y-axis shows intercept + slope * x, not the slope
    # itself; the label is kept as it was - confirm it is the intended one.
    panel.set_ylabel(slope_name)

plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.7)
fig.suptitle("Prior predictive check");

In [None]:
import itertools

with m_lines:
    # Every 0/1 setting of the six predictors, shaped (64, 1, 6) so that
    # all_comb[:, 0, k] is the column for predictor k.
    all_comb = np.array(list(itertools.product([0, 1], repeat=6))).reshape(-1, 1, 6)
    print(all_comb[:, 0, 0])
    # Swap the observed predictors for the full grid, then draw predictions.
    # This rebinds `ppc`.
    data_names = ["CG", "PV", "CE", "AD", "SP", "TD"]
    pm.set_data({name: all_comb[:, 0, col] for col, name in enumerate(data_names)})
    ppc = pm.sample_posterior_predictive(m_lines_trace)

In [None]:
# Box plot of the posterior predictive draws of "lines", with the y-axis
# limited to [-5, 5].
fig, ax = plt.subplots(figsize=(20, 10))
lines_draws = ppc["lines"]
sns.boxplot(data=lines_draws, showmeans=True, ax=ax)
ax.set(ylim=(-5, 5))