# Step 3.2: Apache Log Analysis

In [1]:
import pandas as pd 
import os
import csv
import re
import json
from collections import Counter

We obtain the conversion from exception to action defined in the original experiment

In [2]:
exception_map = {}
with open('/home/previousResults/TufanoResults/maven-common-errors.txt', "r") as f:
    for line in f.readlines():
        exception, action = line.rstrip("\n").split(",")
        exception_map[exception] = action

In [3]:
result_path_template = "/home/notebooks/ProjectAnalysis/ApacheProyectsAnalysis/results/%s.csv"

def errors_summary(project_name, commit_history):
    summary = {}
    summary["project"]=project_name
    summary["total"] = 0
    summary["totalBuildable"] = 0
    summary["same_success"] = 0
    summary["new_errors"]  = []
    summary["new_errors_count"]  = 0
    summary["diff_errors"]  = []
    summary["diff_errors_count"]  = 0
    summary["same_errors"] = []
    summary["same_errors_count"]  = 0
    summary["fix_errors"]  = []
    summary["fix_errors_count"] = 0
    
    original_errors = []
    
    for result in commit_history:

        summary["total"] += 1

        new_exceptions = json.loads(result['NEW_EXCEPTION'].replace("'", '"'))

        if result['OLD_EXCEPTION'] is not '':
            original_errors.append(result['OLD_EXCEPTION'])

        if result['IS_MAVEN'] == 'True':
            if result['OLD_BUILD_SUCCESSFUL'] == 'True' and result['NEW_BUILD_SUCCESSFUL'] == 'True':
                # BOTH BUILT SUCCESSFULLY
                summary["same_success"] += 1
            if result['OLD_BUILD_SUCCESSFUL'] == 'True' and result['NEW_BUILD_SUCCESSFUL'] == 'False':
                # WE FAIL AT BUILD, BUT WORKS ON PAST
                summary["new_errors_count"] += 1
                summary["new_errors"].append(new_exceptions[0])
            if result['OLD_BUILD_SUCCESSFUL'] == 'False' and result['NEW_BUILD_SUCCESSFUL'] == 'False':
                # BOTH FAIL
                if result['OLD_EXCEPTION'] in new_exceptions:
                    # FAIL FOR SAME CAUSE
                    summary["same_errors_count"] += 1
                    summary["same_errors"].append(result['OLD_EXCEPTION'])
                else:
                    # FAIL FOR DIFFERENT ERROR
                    summary["diff_errors_count"] += 1
                    summary["diff_errors"].append(new_exceptions[0])
            if result['OLD_BUILD_SUCCESSFUL'] == 'False' and result['NEW_BUILD_SUCCESSFUL'] == 'True':
                # WE BUILT SUCESSFULLY A BUILD THAT DOES NOT WORK IN PAST (RARE)
                summary["fix_errors_count"] += 1
                summary["fix_errors"].append(result['OLD_EXCEPTION'])
            summary["totalBuildable"] += 1
    return summary, original_errors

In [4]:
projects = pd.read_csv("/home/previousResults/TufanoResults/compilability.csv")
all_projects_summary = []
early_snapshots = []
intermediate_snapshots = []
recent_snapshots = []

original_errors = []
original_errors_early_snapshots = []
original_errors_intermediate_snapshots = []
original_errors_recent_snapshots = []

for row in projects.iterrows():
    repo = row[1]
    if os.path.exists(result_path_template%repo['PROJECT']):
        
        with open(result_path_template%repo['PROJECT']) as csvfile:
            commit_history = list(csv.DictReader(csvfile))
        
        size = len(commit_history)
        q1 = int(0.25*size)
        q3 = int(0.75*size)
        
        project_summary, project_original_errors = errors_summary(repo['PROJECT'],commit_history)
        project_summary_early, project_original_errors_early = errors_summary(repo['PROJECT'],commit_history[:q1])
        project_summary_inter, project_original_errors_inter = errors_summary(repo['PROJECT'],commit_history[q1+1:q3])
        project_summary_recent, project_original_errors_recent = errors_summary(repo['PROJECT'],commit_history[q3+1:])
        
        all_projects_summary.append(project_summary)
        early_snapshots.append(project_summary_early)
        intermediate_snapshots.append(project_summary_inter)
        recent_snapshots.append(project_summary_recent)
        
        original_errors = original_errors + project_original_errors
        original_errors_early_snapshots = original_errors_early_snapshots + project_original_errors_early
        original_errors_intermediate_snapshots = original_errors_intermediate_snapshots + project_original_errors_inter
        original_errors_recent_snapshots = original_errors_recent_snapshots + project_original_errors_recent
        

Next, we will take as a factor of study the location of the failures along the project, together with the general vision of all the historical of commits:

- 1. **All Snapshots** -> 100% of the snaphost of all projects
    - 1.1. Same errors
    - 1.2. Different errors
    - 1.3. New errors
    - 1.4. Errors solved
    - 1.5. All errors (includes: Same errors, Different errors and New errors)
    - 1.6. Original errors
- 2. **Early Snapshots** -> The oldest snapshots (snapshots below quartile Q1)
    - 2.1. Same errors
    - 2.2. Different errors
    - 2.3. New errors
    - 2.4. Errors solved
    - 2.5. All errors (includes: Same errors, Different errors and New errors)
    - 2.6. Original errors
- 3. **Intermediate Snapshots** -> The snapshots located between quartile Q1 and Q3 of the commit historic 
    - 3.1. Same errors
    - 3.2. Different errors
    - 3.3. New errors
    - 3.4. Errors solved
    - 3.5. All errors (includes: Same errors, Different errors and New errors)
    - 3.6. Original errors
- 4. **Recent Snapshots** -> The most recent snapshots (above quartile 3)
    - 4.1. Same errors
    - 4.2. Different errors
    - 4.3. New errors
    - 4.4. Errors solved
    - 4.5. All errors (includes: Same errors, Different errors and New errors)
    - 4.6. Original errors

## 1. Projects Summary - All snapshots

The following table shows, for each project:
- *same_success*: The number of commits in which both experiments succeeded
- *same_errors_count*: The number of commits where the same error was found
- *diff_errors_count*: The number of commits where a different error was found
- *new_errors_count*: The number of commints that previously succeeded and now fail
- *fix_errors_count*: The number of commits that previously failed and now don't
- *totalBuildable*: Number of commits which has pom.xml (uses Maven)
- *total*: Number of commit of the repository

In [5]:
summary_df = pd.DataFrame(all_projects_summary)
stats = summary_df[['project','same_success','same_errors_count','diff_errors_count','new_errors_count', 'fix_errors_count', 'totalBuildable', 'total']]
stats

Unnamed: 0,project,same_success,same_errors_count,diff_errors_count,new_errors_count,fix_errors_count,totalBuildable,total
0,isis,89,1757,4,211,1,2062,4817
1,james-hupa,0,539,42,96,0,677,686
2,james-jdkim,9,114,0,0,0,123,124
3,james-jsieve,0,282,51,140,0,473,527
4,james-jspf,166,280,0,0,0,446,621
5,james-mime4j,70,547,67,38,0,722,733
6,james-postage,0,42,21,0,0,63,74
7,jclouds,0,173,866,0,0,1039,5074
8,jena,328,239,2031,48,1,2647,2680
9,kalumet,5,11,96,58,0,170,172


In [6]:
# Summary of al values
stats.select_dtypes(pd.np.number).sum()

same_success          14480
same_errors_count     67298
diff_errors_count     11592
new_errors_count       8257
fix_errors_count        184
totalBuildable       101811
total                139389
dtype: int64

In [7]:
stats.describe()

Unnamed: 0,same_success,same_errors_count,diff_errors_count,new_errors_count,fix_errors_count,totalBuildable,total
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,183.291139,851.873418,146.734177,104.518987,2.329114,1288.746835,1764.417722
std,347.858288,1825.197041,324.161708,294.889152,9.825035,2139.353919,2694.935048
min,0.0,0.0,0.0,0.0,0.0,3.0,25.0
25%,0.0,43.5,0.0,0.0,0.0,202.5,234.0
50%,25.0,173.0,18.0,0.0,0.0,514.0,726.0
75%,186.0,648.0,96.0,71.5,0.0,1061.5,1898.0
max,1656.0,11040.0,2031.0,2305.0,68.0,11221.0,14818.0


In [8]:
# Totals
print("Total commits: %d"%summary_df["total"].sum())
total_errors = summary_df["same_errors_count"].sum()+summary_df["diff_errors_count"].sum()+summary_df["new_errors_count"].sum()
print("Total errors: %d"%(total_errors))

Total commits: 139389
Total errors: 87147


In [9]:
def getAction(exception):
    if exception in exception_map.keys():
        return exception_map[exception]
    else:
        if exception == 'TIMEOUT':
            return "Resolution"
        else:
            return "Other"

In [10]:
def report(data):
    error_list = []
    for error_tuple in Counter(data).items():
        exception = error_tuple[0]
        count = error_tuple[1]
        error_list.append((exception, getAction(exception),count))
    errors = sorted(error_list, key=lambda i: i[2], reverse=True)
    errors = pd.DataFrame(errors, columns = ["Error", "Action", "Count"])
    errors['%'] = errors.apply (lambda row: (row[2] / total_errors)*100, axis=1)
    print(errors[['Count', '%']].sum())
    return errors    

### 1.1. Same Errors - All snapshots

In [11]:
# Same errors
same_errors_df = report(summary_df["same_errors"].sum())
same_errors_df

Count    67298.000000
%           77.223542
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,21867,25.092086
1,ArtifactResolutionException,Resolution,20959,24.050168
2,ModelParseException,Parsing,7650,8.778271
3,MojoExecutionException,Other,6588,7.559641
4,MojoFailureException,Other,3259,3.739658
5,CompilationFailureException,Compilation,2544,2.919205
6,ProjectBuildingException,Other,2345,2.690856
7,PluginParameterException,Parsing,1095,1.256498
8,PluginDescriptorParsingException,Parsing,698,0.800946
9,PluginContainerException,Other,243,0.278839


In [12]:
same_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2544,2.919205
Other,12443,14.278174
Parsing,9485,10.883909
Resolution,42826,49.142254


### 1.2. Different errors - All snapshots

In [13]:
# Diferent exception from original 
diff_error_df = report(summary_df["diff_errors"].sum())
diff_error_df

Count    11592.000000
%           13.301663
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,4607,5.28647
1,TIMEOUT,Resolution,2901,3.328858
2,MojoFailureException,Other,2695,3.092476
3,ModelParseException,Parsing,635,0.728654
4,OTHER,Other,394,0.45211
5,PluginExecutionException,Other,146,0.167533
6,PluginResolutionException,Resolution,96,0.110159
7,PluginDescriptorParsingException,Parsing,59,0.067702
8,MojoExecutionException,Other,46,0.052784
9,ProjectBuildingException,Other,5,0.005737


In [14]:
diff_error_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,3293,3.778673
Parsing,694,0.796356
Resolution,7605,8.726634


### 1.3. New errors - All snapshots

In [15]:
# New errors 
new_errors_df = report(summary_df["new_errors"].sum())
new_errors_df

Count    8257.000000
%           9.474795
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,3433,3.939321
1,MojoFailureException,Other,1730,1.985152
2,TIMEOUT,Resolution,1435,1.646643
3,MojoExecutionException,Other,1094,1.25535
4,PluginExecutionException,Other,381,0.437192
5,ModelParseException,Parsing,171,0.19622
6,PluginContainerException,Other,11,0.012622
7,InternalErrorException,Other,2,0.002295


In [16]:
new_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,3218,3.692611
Parsing,171,0.19622
Resolution,4868,5.585964


### 1.4. Solved errors - All snapshots

The following are errors from the original experiment that do not exist in the current experiment.

In [17]:
# Errors solved
errors_solved_df = report(summary_df["fix_errors"].sum())
errors_solved_df

Count    184.000000
%          0.211138
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,MojoFailureException,Other,103,0.118191
1,DependencyResolutionException,Resolution,56,0.064259
2,MojoExecutionException,Other,15,0.017212
3,ArtifactResolutionException,Resolution,9,0.010327
4,,Other,1,0.001147


In [18]:
errors_solved_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,119,0.136551
Resolution,65,0.074587


### 1.5. All errors - All snapshots

This section includes all errors reported in the current experiment (*same error*, *different error* and *new error*)

In [19]:
# All Errors
all_errors_df = report(summary_df["same_errors"].sum() + summary_df["diff_errors"].sum() + summary_df["new_errors"].sum())
all_errors_df

Count    87147.0
%          100.0
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,28999,33.275959
1,DependencyResolutionException,Resolution,21867,25.092086
2,ModelParseException,Parsing,8456,9.703145
3,MojoExecutionException,Other,7728,8.867775
4,MojoFailureException,Other,7684,8.817286
5,TIMEOUT,Resolution,4336,4.975501
6,CompilationFailureException,Compilation,2544,2.919205
7,ProjectBuildingException,Other,2350,2.696593
8,PluginParameterException,Parsing,1095,1.256498
9,PluginDescriptorParsingException,Parsing,757,0.868647


In [20]:
print(all_errors_df.groupby(["Action"]).sum().sum())
all_errors_df.groupby(["Action"]).sum()

Count    87147.0
%          100.0
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2544,2.919205
Other,18954,21.749458
Parsing,10350,11.876485
Resolution,55299,63.454852


### 1.6. Original Errors - All snapshots

In [21]:
# Original Errors
ordered_original_errors = sorted(list(Counter(original_errors).items()), key=lambda i: i[1], reverse=True)
total_original_errors = sum(n for _, n in ordered_original_errors)
original_errors_df = pd.DataFrame([ (error[0], getAction(error[0]),error[1], (error[1] / total_original_errors) * 100) for error in ordered_original_errors], 
             columns = ["Error", "Action", "Count", "%"])
original_errors_df

Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,26090,32.994828
1,ArtifactResolutionException,Resolution,21742,27.496111
2,MojoExecutionException,Other,11294,14.283004
3,ModelParseException,Parsing,7740,9.788423
4,MojoFailureException,Other,4505,5.697267
5,CompilationFailureException,Compilation,2807,3.549884
6,ProjectBuildingException,Other,2349,2.970673
7,PluginParameterException,Parsing,1113,1.40756
8,PluginDescriptorParsingException,Parsing,698,0.882729
9,PluginContainerException,Other,383,0.484363


In [22]:
print(original_errors_df.groupby(["Action"]).sum().sum())
original_errors_df.groupby(["Action"]).sum()

Count    79073.0
%          100.0
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2807,3.549884
Other,18839,23.82482
Parsing,9595,12.134357
Resolution,47832,60.490939


## 2. Projects Summary - Early Snapshots

In [23]:
early_snapshots_df = pd.DataFrame(early_snapshots)
early_snapshots_df[['project','same_success','same_errors_count','diff_errors_count','new_errors_count', 'fix_errors_count', 'totalBuildable', 'total']]

Unnamed: 0,project,same_success,same_errors_count,diff_errors_count,new_errors_count,fix_errors_count,totalBuildable,total
0,isis,0,0,0,0,0,0,1204
1,james-hupa,0,162,0,0,0,162,171
2,james-jdkim,0,30,0,0,0,30,31
3,james-jsieve,0,78,16,10,0,104,131
4,james-jspf,1,0,0,0,0,1,155
5,james-mime4j,0,172,0,0,0,172,183
6,james-postage,0,8,0,0,0,8,18
7,jclouds,0,173,866,0,0,1039,1268
8,jena,0,5,620,13,0,638,670
9,kalumet,5,11,25,0,0,41,43


In [24]:
early_snapshots_df.select_dtypes(pd.np.number).sum()

diff_errors_count     3033
fix_errors_count        15
new_errors_count       877
same_errors_count    15334
same_success          2323
total                34821
totalBuildable       21582
dtype: int64

In [25]:
early_snapshots_df.describe()

Unnamed: 0,diff_errors_count,fix_errors_count,new_errors_count,same_errors_count,same_success,total,totalBuildable
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,38.392405,0.189873,11.101266,194.101266,29.405063,440.772152,273.189873
std,146.379943,1.687632,50.80016,486.450609,63.009669,673.745329,540.095055
min,0.0,0.0,0.0,0.0,0.0,6.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,58.5,10.5
50%,0.0,0.0,0.0,9.0,0.0,181.0,58.0
75%,2.0,0.0,0.0,166.5,12.5,474.0,217.5
max,866.0,15.0,405.0,2934.0,293.0,3704.0,2952.0


### 2.1. Same Errors - Early Snapshots

In [26]:
# Same errors
same_errors_df = report(early_snapshots_df["same_errors"].sum())
same_errors_df

Count    15334.000000
%           17.595557
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,5280,6.058728
1,ArtifactResolutionException,Resolution,5238,6.010534
2,ModelParseException,Parsing,3474,3.986368
3,MojoExecutionException,Other,426,0.488829
4,PluginParameterException,Parsing,392,0.449815
5,MojoFailureException,Other,211,0.24212
6,CompilationFailureException,Compilation,126,0.144583
7,ProjectBuildingException,Other,93,0.106716
8,PluginContainerException,Other,56,0.064259
9,MojoNotFoundException,Parsing,32,0.03672


In [27]:
same_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,126,0.144583
Other,792,0.908809
Parsing,3898,4.472902
Resolution,10518,12.069262


### 2.2. Different errors - Early Snapshots

In [28]:
# Diferent exception from original 
diff_error_df = report(early_snapshots_df["diff_errors"].sum())
diff_error_df

Count    3033.000000
%           3.480326
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,1543,1.770572
1,TIMEOUT,Resolution,842,0.966184
2,MojoFailureException,Other,621,0.712589
3,OTHER,Other,20,0.02295
4,MojoExecutionException,Other,4,0.00459
5,PluginExecutionException,Other,2,0.002295
6,PluginVersionResolutionException,Resolution,1,0.001147


In [29]:
diff_error_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,647,0.742424
Resolution,2386,2.737903


### 2.3. New errors - Early Snapshots

In [30]:
# New errors 
new_errors_df = report(early_snapshots_df["new_errors"].sum())
new_errors_df

Count    877.000000
%          1.006346
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,TIMEOUT,Resolution,406,0.465879
1,ArtifactResolutionException,Resolution,180,0.206548
2,MojoFailureException,Other,165,0.189335
3,PluginExecutionException,Other,126,0.144583


In [31]:
new_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,291,0.333919
Resolution,586,0.672427


### 2.4. Solved errors - Early Snapshots

The following are errors from the original experiment that do not exist in the current experiment.

In [32]:
# Errors solved
errors_solved_df = report(early_snapshots_df["fix_errors"].sum())
errors_solved_df

Count    15.000000
%         0.017212
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,MojoExecutionException,Other,15,0.017212


In [33]:
errors_solved_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,15,0.017212


### 2.5. All errors - Early Snapshots

This section includes all errors reported in the current experiment (*same error*, *different error* and *new error*)

In [34]:
# All Errors
all_errors_df = report(early_snapshots_df["same_errors"].sum() + early_snapshots_df["diff_errors"].sum() + early_snapshots_df["new_errors"].sum())
all_errors_df

Count    19244.000000
%           22.082229
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,6961,7.987653
1,DependencyResolutionException,Resolution,5280,6.058728
2,ModelParseException,Parsing,3474,3.986368
3,TIMEOUT,Resolution,1248,1.432063
4,MojoFailureException,Other,997,1.144044
5,MojoExecutionException,Other,430,0.493419
6,PluginParameterException,Parsing,392,0.449815
7,PluginExecutionException,Other,134,0.153763
8,CompilationFailureException,Compilation,126,0.144583
9,ProjectBuildingException,Other,93,0.106716


In [35]:
print(all_errors_df.groupby(["Action"]).sum().sum())
all_errors_df.groupby(["Action"]).sum()

Count    19244.000000
%           22.082229
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,126,0.144583
Other,1730,1.985152
Parsing,3898,4.472902
Resolution,13490,15.479592


### 2.6. Original Errors - Early Snapshots

In [36]:
# Original Errors
ordered_original_errors = sorted(list(Counter(original_errors_early_snapshots).items()), key=lambda i: i[1], reverse=True)
total_original_errors = sum(n for _, n in ordered_original_errors)
original_errors_df = pd.DataFrame([ (error[0], getAction(error[0]),error[1], (error[1] / total_original_errors) * 100) for error in ordered_original_errors], 
             columns = ["Error", "Action", "Count", "%"])
original_errors_df

Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,6454,35.110434
1,ArtifactResolutionException,Resolution,5238,28.495267
2,ModelParseException,Parsing,3524,19.170928
3,MojoExecutionException,Other,1164,6.332282
4,MojoFailureException,Other,885,4.814492
5,PluginParameterException,Parsing,410,2.230443
6,CompilationFailureException,Compilation,323,1.757154
7,PluginContainerException,Other,196,1.06626
8,ProjectBuildingException,Other,93,0.50593
9,PluginExecutionException,Other,38,0.206724


In [37]:
print(original_errors_df.groupby(["Action"]).sum().sum())
original_errors_df.groupby(["Action"]).sum()

Count    18382.0
%          100.0
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,323,1.757154
Other,2399,13.050811
Parsing,3968,21.586334
Resolution,11692,63.605701


## 3. Projects Summary - Intemediate Snapshots

In [38]:
intermediate_snapshots_df = pd.DataFrame(intermediate_snapshots)
intermediate_snapshots_df[['project','same_success','same_errors_count','diff_errors_count','new_errors_count', 'fix_errors_count', 'totalBuildable', 'total']]

Unnamed: 0,project,same_success,same_errors_count,diff_errors_count,new_errors_count,fix_errors_count,totalBuildable,total
0,isis,0,855,2,0,0,857,2407
1,james-hupa,0,226,40,76,0,342,342
2,james-jdkim,0,61,0,0,0,61,61
3,james-jsieve,0,116,29,91,0,236,263
4,james-jspf,119,178,0,0,0,297,309
5,james-mime4j,18,280,67,0,0,365,365
6,james-postage,0,17,19,0,0,36,36
7,jclouds,0,0,0,0,0,0,2536
8,jena,0,43,1289,7,0,1339,1339
9,kalumet,0,0,63,22,0,85,85


In [39]:
intermediate_snapshots_df.select_dtypes(pd.np.number).sum()

diff_errors_count     3786
fix_errors_count        60
new_errors_count      3717
same_errors_count    37642
same_success          5869
total                69613
totalBuildable       51074
dtype: int64

In [40]:
intermediate_snapshots_df.describe()

Unnamed: 0,diff_errors_count,fix_errors_count,new_errors_count,same_errors_count,same_success,total,totalBuildable
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,47.924051,0.759494,47.050633,476.481013,74.291139,881.177215,646.506329
std,163.293253,5.751734,139.985796,1065.897201,185.557728,1347.501256,1143.721264
min,0.0,0.0,0.0,0.0,0.0,11.0,0.0
25%,0.0,0.0,0.0,4.0,0.0,116.0,64.5
50%,0.0,0.0,0.0,72.0,0.0,362.0,247.0
75%,20.5,0.0,12.0,283.5,40.5,948.0,561.5
max,1289.0,51.0,982.0,5741.0,1228.0,7408.0,5904.0


### 3.1. Same Errors - Intermediate Snapshots

In [41]:
# Same errors
same_errors_df = report(intermediate_snapshots_df["same_errors"].sum())
same_errors_df

Count    37642.000000
%           43.193684
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,12813,14.702744
1,ArtifactResolutionException,Resolution,11269,12.931025
2,MojoExecutionException,Other,3981,4.568143
3,ModelParseException,Parsing,3206,3.678841
4,CompilationFailureException,Compilation,2266,2.600204
5,ProjectBuildingException,Other,2091,2.399394
6,MojoFailureException,Other,973,1.116504
7,PluginParameterException,Parsing,572,0.656362
8,PluginDescriptorParsingException,Parsing,275,0.315559
9,PluginContainerException,Other,186,0.213432


In [42]:
same_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2266,2.600204
Other,7231,8.297474
Parsing,4063,4.662237
Resolution,24082,27.633768


### 3.2. Different errors - Intermediate Snapshots

In [43]:
# Diferent exception from original 
diff_error_df = report(intermediate_snapshots_df["diff_errors"].sum())
diff_error_df

Count    3786.000000
%           4.344384
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,1491,1.710902
1,MojoFailureException,Other,1383,1.586974
2,TIMEOUT,Resolution,703,0.806683
3,ModelParseException,Parsing,143,0.164091
4,OTHER,Other,17,0.019507
5,MojoExecutionException,Other,15,0.017212
6,PluginDescriptorParsingException,Parsing,14,0.016065
7,PluginExecutionException,Other,11,0.012622
8,ProjectBuildingException,Other,5,0.005737
9,InternalErrorException,Other,4,0.00459


In [44]:
diff_error_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,1435,1.646643
Parsing,157,0.180155
Resolution,2194,2.517585


### 3.3. New errors - Intermediate Snapshots

In [45]:
# New errors 
new_errors_df = report(intermediate_snapshots_df["new_errors"].sum())
new_errors_df

Count    3717.000000
%           4.265207
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,1877,2.153832
1,MojoExecutionException,Other,478,0.548499
2,MojoFailureException,Other,477,0.547351
3,TIMEOUT,Resolution,460,0.527844
4,PluginExecutionException,Other,254,0.291462
5,ModelParseException,Parsing,171,0.19622


In [46]:
new_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,1209,1.387311
Parsing,171,0.19622
Resolution,2337,2.681676


### 3.4. Solved errors - Intermediate Snapshots

The following are errors from the original experiment that do not exist in the current experiment.

In [47]:
# Errors solved
errors_solved_df = report(intermediate_snapshots_df["fix_errors"].sum())
errors_solved_df

Count    60.000000
%         0.068849
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,54,0.061964
1,ArtifactResolutionException,Resolution,6,0.006885


In [48]:
errors_solved_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Resolution,60,0.068849


### 3.5. All errors - Intermediate Snapshots

This section includes all errors reported in the current experiment (*same error*, *different error* and *new error*)

In [49]:
# All Errors
all_errors_df = report(intermediate_snapshots_df["same_errors"].sum() + intermediate_snapshots_df["diff_errors"].sum() + intermediate_snapshots_df["new_errors"].sum())
all_errors_df

Count    45145.000000
%           51.803275
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,14637,16.795759
1,DependencyResolutionException,Resolution,12813,14.702744
2,MojoExecutionException,Other,4474,5.133854
3,ModelParseException,Parsing,3520,4.039152
4,MojoFailureException,Other,2833,3.250829
5,CompilationFailureException,Compilation,2266,2.600204
6,ProjectBuildingException,Other,2096,2.405132
7,TIMEOUT,Resolution,1163,1.334527
8,PluginParameterException,Parsing,572,0.656362
9,PluginDescriptorParsingException,Parsing,289,0.331624


In [50]:
print(all_errors_df.groupby(["Action"]).sum().sum())
all_errors_df.groupby(["Action"]).sum()

Count    45145.000000
%           51.803275
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2266,2.600204
Other,9875,11.331429
Parsing,4391,5.038613
Resolution,28613,32.833029


### 3.6. Original Errors - Intermediate Snapshots

In [51]:
# Original Errors
ordered_original_errors = sorted(list(Counter(original_errors_intermediate_snapshots).items()), key=lambda i: i[1], reverse=True)
total_original_errors = sum(n for _, n in ordered_original_errors)
original_errors_df = pd.DataFrame([ (error[0], getAction(error[0]),error[1], (error[1] / total_original_errors) * 100) for error in ordered_original_errors], 
             columns = ["Error", "Action", "Count", "%"])
original_errors_df

Unnamed: 0,Error,Action,Count,%
0,DependencyResolutionException,Resolution,15039,36.249036
1,ArtifactResolutionException,Resolution,11553,27.846606
2,MojoExecutionException,Other,4965,11.967316
3,ModelParseException,Parsing,3246,7.823949
4,CompilationFailureException,Compilation,2297,5.536541
5,ProjectBuildingException,Other,2095,5.049653
6,MojoFailureException,Other,1168,2.815272
7,PluginParameterException,Parsing,572,1.378712
8,PluginDescriptorParsingException,Parsing,275,0.662842
9,PluginContainerException,Other,186,0.448322


In [52]:
print(original_errors_df.groupby(["Action"]).sum().sum())
original_errors_df.groupby(["Action"]).sum()

Count    41488.0
%          100.0
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,2297,5.536541
Other,8496,20.478211
Parsing,4103,9.889607
Resolution,26592,64.095642


## 4. Projects Summary - Recent Snapshots

In [53]:
recent_snapshots_df = pd.DataFrame(recent_snapshots)
recent_snapshots_df[['project','same_success','same_errors_count','diff_errors_count','new_errors_count', 'fix_errors_count', 'totalBuildable', 'total']]

Unnamed: 0,project,same_success,same_errors_count,diff_errors_count,new_errors_count,fix_errors_count,totalBuildable,total
0,isis,89,901,2,211,1,1204,1204
1,james-hupa,0,149,2,20,0,171,171
2,james-jdkim,9,21,0,0,0,30,30
3,james-jsieve,0,87,5,39,0,131,131
4,james-jspf,46,101,0,0,0,147,155
5,james-mime4j,52,93,0,38,0,183,183
6,james-postage,0,16,1,0,0,17,18
7,jclouds,0,0,0,0,0,0,1268
8,jena,328,191,121,27,1,668,669
9,kalumet,0,0,7,35,0,42,42


In [54]:
recent_snapshots_df.select_dtypes(pd.np.number).sum()

diff_errors_count     4756
fix_errors_count       109
new_errors_count      3649
same_errors_count    14249
same_success          6260
total                34797
totalBuildable       29023
dtype: int64

In [55]:
recent_snapshots_df.describe()

Unnamed: 0,diff_errors_count,fix_errors_count,new_errors_count,same_errors_count,same_success,total,totalBuildable
count,79.0,79.0,79.0,79.0,79.0,79.0,79.0
mean,60.202532,1.379747,46.189873,180.367089,79.240506,440.468354,367.379747
std,157.832951,7.863098,137.693926,376.405884,187.66823,673.688619,605.425032
min,0.0,0.0,0.0,0.0,0.0,6.0,0.0
25%,0.0,0.0,0.0,9.5,0.0,57.5,54.5
50%,0.0,0.0,0.0,63.0,0.0,181.0,155.0
75%,23.5,0.0,29.5,162.0,46.5,474.0,363.0
max,1008.0,67.0,918.0,2363.0,1041.0,3704.0,3704.0


### 4.1. Same Errors - Recent Snapshots

In [56]:
# Same errors
same_errors_df = report(recent_snapshots_df["same_errors"].sum())
same_errors_df

Count    14249.000000
%           16.350534
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,4425,5.077627
1,DependencyResolutionException,Resolution,3751,4.304222
2,MojoExecutionException,Other,2170,2.490046
3,MojoFailureException,Other,2071,2.376444
4,ModelParseException,Parsing,965,1.107324
5,PluginDescriptorParsingException,Parsing,423,0.485387
6,ProjectBuildingException,Other,159,0.18245
7,CompilationFailureException,Compilation,152,0.174418
8,PluginParameterException,Parsing,131,0.150321
9,PluginExecutionException,Other,2,0.002295


In [57]:
same_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,152,0.174418
Other,4402,5.051235
Parsing,1519,1.743032
Resolution,8176,9.381849


### 4.2. Different errors - Recent Snapshots

In [58]:
# Diferent exception from original 
diff_error_df = report(recent_snapshots_df["diff_errors"].sum())
diff_error_df

Count    4756.000000
%           5.457445
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,1565,1.795816
1,TIMEOUT,Resolution,1352,1.551402
2,MojoFailureException,Other,690,0.791766
3,ModelParseException,Parsing,491,0.563416
4,OTHER,Other,356,0.408505
5,PluginExecutionException,Other,132,0.151468
6,PluginResolutionException,Resolution,96,0.110159
7,PluginDescriptorParsingException,Parsing,44,0.050489
8,MojoExecutionException,Other,27,0.030982
9,PluginContainerException,Other,3,0.003442


In [59]:
diff_error_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,1208,1.386164
Parsing,535,0.613905
Resolution,3013,3.457377


### 4.3. New errors - Recent Snapshots

In [60]:
# New errors 
new_errors_df = report(recent_snapshots_df["new_errors"].sum())
new_errors_df

Count    3649.000000
%           4.187178
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,1369,1.570909
1,MojoFailureException,Other,1086,1.24617
2,MojoExecutionException,Other,614,0.704557
3,TIMEOUT,Resolution,567,0.650625
4,PluginContainerException,Other,11,0.012622
5,InternalErrorException,Other,2,0.002295


In [61]:
new_errors_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,1713,1.965644
Resolution,1936,2.221534


### 4.4. Solved errors - Recent Snapshots

The following are errors from the original experiment that do not exist in the current experiment.

In [62]:
# Errors solved
errors_solved_df = report(recent_snapshots_df["fix_errors"].sum())
errors_solved_df

Count    109.000000
%          0.125076
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,MojoFailureException,Other,103,0.118191
1,ArtifactResolutionException,Resolution,3,0.003442
2,DependencyResolutionException,Resolution,2,0.002295
3,,Other,1,0.001147


In [63]:
errors_solved_df.groupby(["Action"]).sum()

Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Other,104,0.119339
Resolution,5,0.005737


### 4.5. All errors - Recent Snapshots

This section includes all errors reported in the current experiment (*same error*, *different error* and *new error*)

In [64]:
# All Errors
all_errors_df = report(recent_snapshots_df["same_errors"].sum() + recent_snapshots_df["diff_errors"].sum() + recent_snapshots_df["new_errors"].sum())
all_errors_df

Count    22654.000000
%           25.995158
dtype: float64


Unnamed: 0,Error,Action,Count,%
0,ArtifactResolutionException,Resolution,7359,8.444353
1,MojoFailureException,Other,3847,4.41438
2,DependencyResolutionException,Resolution,3751,4.304222
3,MojoExecutionException,Other,2811,3.225584
4,TIMEOUT,Resolution,1919,2.202026
5,ModelParseException,Parsing,1456,1.67074
6,PluginDescriptorParsingException,Parsing,467,0.535876
7,OTHER,Other,356,0.408505
8,ProjectBuildingException,Other,159,0.18245
9,CompilationFailureException,Compilation,152,0.174418


In [65]:
print(all_errors_df.groupby(["Action"]).sum().sum())
all_errors_df.groupby(["Action"]).sum()

Count    22654.000000
%           25.995158
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,152,0.174418
Other,7323,8.403043
Parsing,2054,2.356937
Resolution,13125,15.060759


### 4.6. Original Errors - Recent Snapshots

In [66]:
# Original Errors
ordered_original_errors = sorted(list(Counter(original_errors_recent_snapshots).items()), key=lambda i: i[1], reverse=True)
total_original_errors = sum(n for _, n in ordered_original_errors)
original_errors_df = pd.DataFrame([ (error[0], getAction(error[0]),error[1], (error[1] / total_original_errors) * 100) for error in ordered_original_errors], 
             columns = ["Error", "Action", "Count", "%"])
original_errors_df

Unnamed: 0,Error,Action,Count,%
0,MojoExecutionException,Other,5145,26.918851
1,ArtifactResolutionException,Resolution,4923,25.757338
2,DependencyResolutionException,Resolution,4570,23.910427
3,MojoFailureException,Other,2446,12.797572
4,ModelParseException,Parsing,965,5.04892
5,PluginDescriptorParsingException,Parsing,423,2.213153
6,CompilationFailureException,Compilation,186,0.97316
7,PluginExecutionException,Other,164,0.858055
8,ProjectBuildingException,Other,159,0.831895
9,PluginParameterException,Parsing,131,0.685397


In [67]:
print(original_errors_df.groupby(["Action"]).sum().sum())
original_errors_df.groupby(["Action"]).sum()

Count    19113.0
%          100.0
dtype: float64


Unnamed: 0_level_0,Count,%
Action,Unnamed: 1_level_1,Unnamed: 2_level_1
Compilation,186,0.97316
Other,7915,41.411605
Parsing,1519,7.94747
Resolution,9493,49.667765
