# Step 3.1: GitHub Status Analysis

In [1]:
import os
from os import listdir
from os.path import isfile, join
from collections import Counter
import pandas as pd
import json

In [2]:
df = pd.read_csv('GitHubStatusCheckerResults.csv')
df

Unnamed: 0,Project,Buildable,Success,Fail,No,TotalCommits,% SUCCESS
0,Apktool,1637,918,929,0,1847,56.08
1,Bukkit,1431,1404,105,0,1509,98.11
2,Hystrix,2108,988,1121,0,2109,46.87
3,Jest,1152,1084,69,0,1153,94.10
4,Mekanism,5530,178,8177,0,8355,3.22
...,...,...,...,...,...,...,...
75,swagger-core,3982,2721,1262,0,3983,68.33
76,thymeleaf,1686,1404,283,0,1687,83.27
77,webmagic,1132,999,133,0,1132,88.25
78,wiremock,1780,306,1933,0,2239,17.19


In [3]:
df[['% SUCCESS']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
% SUCCESS,80.0,41.669375,33.17802,0.0,10.5625,36.25,70.625,100.0


In [4]:
df[['TotalCommits']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
TotalCommits,80.0,3760.9125,2404.921711,1132.0,1974.5,2980.0,4847.0,10594.0


In [36]:
# Sum of each field
df[["Buildable","Success","Fail","TotalCommits"]].sum()

Buildable       281487
Success          98488
Fail            202385
TotalCommits    300873
dtype: int64

In [37]:
# Average
df[["Success","Fail","TotalCommits","% SUCCESS"]].mean()

Success         1231.100000
Fail            2529.812500
TotalCommits    3760.912500
% SUCCESS         41.669375
dtype: float64

In [35]:
def getStats(df, name):
    aux_df_summary = df[['% SUCCESS']].describe().T
    aux_df_summary = aux_df_summary.rename(index={'% SUCCESS':name})
    aux_df_summary = aux_df_summary.rename(columns={
        '25%': '1st Qu.', 
        '50%': 'Median',
        '75%': '3rd Qu.',
        'mean': 'Mean', 
        'max': 'Max', 
        'min': 'Min', 
        'std': 'SD',
    })
    aux_df_summary = aux_df_summary[['Min', '1st Qu.', 'Median', 'Mean', '3rd Qu.', 'Max', 'SD']]
    aux_df_summary['with build conf.'] = df['Buildable'].sum()
    aux_df_summary['Build success'] = df['Success'].sum()
    aux_df_summary['Fraction built'] = (df['Success'].sum() / df['Buildable'].sum())*100
    return aux_df_summary

In [69]:
def createSummary(df, q1, q3):
    # Short proyects (< Q1)
    short_df = df[ df['TotalCommits']< q1 ]
    short_df_summary = getStats(short_df, 'Short history')
    # Medium proyects (> Q1 commits & < Q3)
    medium_df = df.query('TotalCommits >= %d and TotalCommits < %d'%(q1,q3))
    medium_df_summary = getStats(medium_df, 'Medium history')
    # Large proyects (> Q3)
    large_df = df[ df['TotalCommits'] >= q3 ]
    large_df_summary = getStats(large_df, 'Long history')
    # All
    all_df_summary = getStats(df, 'All')
    print("Q1 %d"%q1)
    print("Q3 %d"%q3)
    print("Short projects: %d"%len(short_df))
    print("Medium projects: %d"%len(medium_df))
    print("Large projects: %d"%len(large_df))
    return pd.concat([short_df_summary,medium_df_summary,large_df_summary, all_df_summary]).round(2)

###  Reproduction Summary

In [70]:
q1 = df['TotalCommits'].quantile(0.25)
q3 = df['TotalCommits'].quantile(0.75)
createSummary(df, q1, q3)

Q1 1974
Q3 4847
Short projects: 20
Medium projects: 40
Large projects: 20


Unnamed: 0,Min,1st Qu.,Median,Mean,3rd Qu.,Max,SD,with build conf.,Build success,Fraction built
Short history,0.0,37.43,55.42,58.22,82.27,99.38,28.13,28312,15948,56.33
Medium history,0.0,10.56,34.94,40.35,68.6,100.0,33.34,114716,44688,38.96
Long history,0.0,4.27,12.8,27.76,48.04,93.52,31.76,138459,37852,27.34
All,0.0,10.56,36.25,41.67,70.62,100.0,33.18,281487,98488,34.99


###  Reproduction Summary (with replication quartiles)

In [66]:
df_replication = pd.read_csv('/home/notebooks/ProjectAnalysis/ApacheProyectsAnalysis/ApacheStatusCheckerResults.csv')
q1 = df_replication['TotalCommits'].quantile(0.25)
q3 = df_replication['TotalCommits'].quantile(0.75)
createSummary(df, q1, q3)

Q1 234
Q3 1898
Short projects: 0
Medium projects: 18
Large projects: 62


  aux_df_summary['Fraction built'] = (df['Success'].sum() / df['Buildable'].sum())*100


Unnamed: 0,Min,1st Qu.,Median,Mean,3rd Qu.,Max,SD,with build conf.,Build success,Fraction built
Short history,,,,,,,,0,0,
Medium history,0.0,40.36,55.42,59.17,82.94,99.38,28.03,24841,14322,57.65
Long history,0.0,6.99,27.24,36.59,64.37,100.0,33.02,256646,84166,32.79
All,0.0,10.56,36.25,41.67,70.62,100.0,33.18,281487,98488,34.99


## Check number of builds of each Build System

In [46]:
projects = [f for f in listdir("/home/results/GitHub/")]
build_systems = []
for project in projects:
    path = join("/home/results/GitHub/", project)
    build_path = join(path, "experiment_1/build_files/")
    for build_file in listdir(build_path):
        with open(join(build_path, build_file)) as f:
            bs = json.load(f)
            build_systems.append((bs['build_system']))

In [47]:
counter = Counter(build_systems)
counter

Counter({'Maven': 191657, 'Ant': 25735, 'NOT_FOUND': 19386, 'Gradle': 64095})

In [48]:
total_commits = df['TotalCommits'].sum()
print("Maven: {:.2f}%".format(counter['Maven']*100/total_commits))
print("Gradle: {:.2f}%".format(counter['Gradle']*100/total_commits))
print("Ant: {:.2f}%".format(counter['Ant']*100/total_commits))

Maven: 63.70%
Gradle: 21.30%
Ant: 8.55%
