In [33]:
import pandas as pd
import xml.etree.ElementTree as etree
import os
from os import listdir
from os.path import isfile, join

# Configuration Variables
top = 25  # how many of the highest-ranked projects are analyzed

# Directory that holds one XML report per project
dir = os.path.join("results")

# Collect the names of the regular files in `dir` that end in '.xml'
xmlfiles = []
for entry in listdir(dir):
    if entry.endswith('.xml') and isfile(join(dir, entry)):
        xmlfiles.append(entry)

print('Listing files under the directory of %s' % dir)
print('Amount of xml files %d' % len(xmlfiles))
xmlfiles

Listing files under the directory of results
Amount of xml files 211


['a-foundation.xml',
 'aircompressor.xml',
 'airlift.xml',
 'akarnokd-misc.xml',
 'apollo.xml',
 'aprof.xml',
 'artemis-odb.xml',
 'audio-recognition.xml',
 'autohash.xml',
 'autolink-java.xml',
 'beam.xml',
 'benchmark-arraycopy.xml',
 'benchmarks.xml',
 'bloomfilter.xml',
 'blynk-server.xml',
 'bpm.xml',
 'btrace.xml',
 'bucket4j.xml',
 'byte-buddy.xml',
 'cache2k-benchmark.xml',
 'caffeine.xml',
 'cepsim.xml',
 'codec.xml',
 'collections-m0e4.xml',
 'commcare-core.xml',
 'commonmark-java.xml',
 'compactmap.xml',
 'completable-futures.xml',
 'conscrypt.xml',
 'core-ng-project.xml',
 'cqrs-hotel.xml',
 'crate.xml',
 'cryptolib.xml',
 'CubicChunks.xml',
 'cukedoctor.xml',
 'cyclops-react.xml',
 'debop4k.xml',
 'demos.xml',
 'directory-kerby.xml',
 'dispatching.xml',
 'disruptor_benchmark.xml',
 'dropbox-sdk-java.xml',
 'druid.xml',
 'dynamicbean.xml',
 'eclipselink.runtime.xml',
 'entity-essentials.xml',
 'entity-system-benchmarks.xml',
 'es4j.xml',
 'Event4J.xml',
 'expire-threadlocal

In [34]:
# Load the project ranking table (a CSV file, one row per 'owner/repository' project)
projects_ranking_file = os.path.join('jmh-projects-bigquery-fh-201702 - jmh-projects-gh.csv')
projects_ranking = pd.read_csv(projects_ranking_file, sep=',')

# Split 'owner/repository' once, then derive the two helper columns:
#   name -> the part before the first '/'
#   xml  -> the part after it, plus '.xml' (the report file name)
project_parts = projects_ranking['project'].apply(lambda full_name: full_name.split('/'))
projects_ranking['name'] = project_parts.apply(lambda parts: parts[0])
projects_ranking['xml'] = project_parts.apply(lambda parts: parts[1] + '.xml')

projects_ranking.head(3)

Unnamed: 0,project,forked,watchers,stars,forks,subscribers,name,xml
0,ReactiveX/RxJava,False,23558,23558,4143,1754,ReactiveX,RxJava.xml
1,Netflix/feign,False,1716,1716,335,276,Netflix,feign.xml
2,netty/netty,False,9746,9746,4775,1243,netty,netty.xml


In [40]:
# Projects that could not be analyzed are dropped from the ranking
projects_to_remove = [
    'jgrapht/jgrapht',
    'lemire/RoaringBitmap',
    'RoaringBitmap/RoaringBitmap',
]

is_excluded = projects_ranking['project'].isin(projects_to_remove)
projects_ranking = projects_ranking.loc[~is_excluded]

# Order by watchers, then stars, then forks (all descending) and keep the first `top` rows
ranked_projects = projects_ranking.sort_values(by=['watchers', 'stars', 'forks'], ascending=False)
top_projects = ranked_projects.head(top)
top_projects

Unnamed: 0,project,forked,watchers,stars,forks,subscribers,name,xml
0,ReactiveX/RxJava,False,23558,23558,4143,1754,ReactiveX,RxJava.xml
2,netty/netty,False,9746,9746,4775,1243,netty,netty.xml
3,openzipkin/zipkin,False,5627,5627,851,507,openzipkin,zipkin.xml
4,druid-io/druid,False,4743,4743,1132,457,druid-io,druid.xml
5,square/okio,False,3703,3703,601,220,square,okio.xml
6,grpc/grpc-java,False,2631,2631,821,380,grpc,grpc-java.xml
7,ben-manes/caffeine,False,2414,2414,192,176,ben-manes,caffeine.xml
8,h2oai/h2o-3,False,1943,1943,836,283,h2oai,h2o-3.xml
9,requery/requery,False,1841,1841,140,77,requery,requery.xml
1,Netflix/feign,False,1716,1716,335,276,Netflix,feign.xml


In [41]:
import xml.etree.ElementTree as ET


def _bug_count(element, attribute):
    """Return the integer value of ``attribute`` on ``element``, or None if it is absent."""
    raw = element.get(attribute)
    return None if raw is None else int(raw)


def analyze_xml(df, results_dir=None):
    """Attach the bug counts found in a project's XML report to its row.

    The report named by ``df['xml']`` is parsed and the ``FindBugsSummary``
    element is read, together with its ``PackageStats`` children and their
    ``ClassStats`` children.

    Parameters
    ----------
    df : pandas.Series
        A single row of the projects table (the function is meant to be
        used with ``DataFrame.apply(..., axis=1)``). It must contain an
        ``'xml'`` entry holding the report's file name.
    results_dir : str, optional
        Directory that contains the report. When omitted, the notebook-level
        ``dir`` variable is used, as before.

    Returns
    -------
    pandas.Series
        The same row, extended with:

        * ``'Total Bugs'`` - ``total_bugs`` of the summary element,
        * ``'Bugs per Package'`` - list with ``total_bugs`` of every
          ``PackageStats`` element, in document order,
        * ``'Bugs per Class'`` - list with ``bugs`` of every ``ClassStats``
          element, in document order.

        Counts are ``int`` (the XML attributes are text and are converted so
        they can be summed, averaged and sorted numerically); an attribute
        that is missing from the XML yields ``None``.

    Raises
    ------
    ValueError
        If the report has no ``FindBugsSummary`` element, or a count
        attribute is not a valid integer.
    """
    # Fall back to the notebook-level directory only when none was given.
    base_dir = dir if results_dir is None else results_dir
    report_path = os.path.join(base_dir, df['xml'])

    root = ET.parse(report_path).getroot()

    summary = root.find("FindBugsSummary")
    if summary is None:
        # Without this check the failure is an opaque AttributeError on None.
        raise ValueError('No FindBugsSummary element found in %s' % report_path)

    package_stats = []
    clazz_stats = []
    # Per package
    for pkg in summary.findall("PackageStats"):
        package_stats.append(_bug_count(pkg, 'total_bugs'))
        # Per class
        clazz_stats.extend(_bug_count(clazz, 'bugs') for clazz in pkg.findall('ClassStats'))

    df['Total Bugs'] = _bug_count(summary, 'total_bugs')
    df['Bugs per Package'] = package_stats
    df['Bugs per Class'] = clazz_stats

    return df
    
# Run the XML analysis once per project row; each call returns the row
# extended with the bug-count entries, so the result gains those columns.
top_projects = top_projects.apply(func=analyze_xml, axis='columns')

top_projects


Unnamed: 0,project,forked,watchers,stars,forks,subscribers,name,xml,Total Bugs,Bugs per Package,Bugs per Class
0,ReactiveX/RxJava,False,23558,23558,4143,1754,ReactiveX,RxJava.xml,267,"[132, 0, 0, 0, 0, 0, 0, 0, 126, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 7, 4, 15, 0, 0, 0, 0, 0, 0,..."
2,netty/netty,False,9746,9746,4775,1243,netty,netty.xml,593,"[0, 0, 9, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,openzipkin/zipkin,False,5627,5627,851,507,openzipkin,zipkin.xml,121,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,druid-io/druid,False,4743,4743,1132,457,druid-io,druid.xml,633,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,square/okio,False,3703,3703,601,220,square,okio.xml,51,"[19, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[12, 0, 0, 0, 1, 0, 6, 0, 0, 6, 0, 0, 0, 6, 0,..."
6,grpc/grpc-java,False,2631,2631,821,380,grpc,grpc-java.xml,52,"[21, 0, 18, 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 4, 0, 0, ..."
7,ben-manes/caffeine,False,2414,2414,192,176,ben-manes,caffeine.xml,188,"[38, 0, 36, 0, 66, 0, 0, 0, 0, 0, 0, 48, 0, 0,...","[6, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,h2oai/h2o-3,False,1943,1943,836,283,h2oai,h2o-3.xml,82,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,requery/requery,False,1841,1841,140,77,requery,requery.xml,0,"[0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Netflix/feign,False,1716,1716,335,276,Netflix,feign.xml,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
