# Step 1: Project Mining - Apache

In [1]:
import csv
import json
import os
import pickle
import subprocess

import pandas as pd
# Base directory under which the inputs of the previous experiment are stored.
root = "/home/"

# Inputs: per-project analysis logs of the previous experiment and the local git clones.
analysis_path = root + "previousResults/TufanoResults/analyzedProjects/"
projects_path = root + "projects"

# Outputs: per-project pickle results (relative to the notebook) and generated config files.
results_path = "results/"
config_path = root + "configFiles/ApacheProjects"
import concurrent.futures

This notebook tries to recover the projects used in the previous experiment and, for each project that is recovered, checks that all the studied commits are still available.

**NOTE:** Some projects that were not found in the first iteration were searched for in Software Heritage and are included in the results shown below.

In [2]:
def _run_quiet(command, cwd=None):
    """Run ``command`` (an argument list) discarding its output and return its exit status."""
    return subprocess.run(
        command,
        cwd=cwd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    ).returncode


def check_git_project(project_name):
    """Check that a project and the commits studied for it are still available.

    The project is looked up in ``projects_path``; when it is not there, it is
    cloned from ``https://github.com/apache/<project_name>``. Every ``COMMIT_ID``
    listed in the project's ``compilation.log.csv`` (under ``analysis_path``) is
    then resolved with ``git log`` and the exit status is recorded.

    The outcome is pickled to ``<results_path>/<project_name>.pickle`` as a dict
    with the keys ``project``, ``commits`` (DataFrame with ``Commit`` and
    ``Status`` columns, ``Status`` being the git exit code), ``success`` and
    ``errors``. A project that already has a pickle file is skipped.

    Nothing is saved when the project is not found on GitHub or cannot be
    cloned, so that it is retried on the next run.

    Args:
        project_name: Name of the project directory / Apache GitHub repository.

    Raises:
        FileNotFoundError: if the commits CSV is missing, or if the ``curl`` /
            ``git`` executables cannot be started.
    """
    print("Project: %s"%project_name)

    pickle_file = "%s/%s.pickle"%(results_path, project_name)
    if os.path.exists(pickle_file):
        print(" -> Project already checked")
        return

    folder  = "%s/%s/" % (projects_path, project_name)

    # CHECK IF PROJECT EXISTS
    if os.path.exists(folder):
        print(" -> Project exist in local folder!")
    else:
        print(" -> Project does not exist in local folder!")
        url="https://github.com/apache/%s"%project_name
        # Compare the exit status by value ("!="), not by identity ("is not").
        if _run_quiet(["curl", "--head", "--fail", url]) != 0:
            print(" -> Project %s does not exist in GitHub!"%project_name)
            return

        # CLONE PROJECT
        git_url = "https://github.com/apache/%s.git" % project_name
        print(" -> Downloading project from GitHub ...")
        if _run_quiet(["git", "clone", git_url, folder]) != 0:
            # Do not record a result: a failed download must not be cached as
            # "all commits missing" and then skipped as already checked.
            print(" -> Project %s could not be downloaded from GitHub!"%project_name)
            return
        print(" -> Project downloaded from GitHub")

    # OPEN COMMITS FILE (PREVIOUS EXPERIMENT)
    with open(analysis_path+project_name+"/compilation.log.csv", newline='') as csvfile:
        commits = [row['COMMIT_ID'] for row in csv.DictReader(csvfile)]

    # CHECK IF ALL COMMITS IN FILE EXIST IN GIT REPOSITORY
    commit_results = []
    for n, sha in enumerate(commits, start=1):
        print(" -> Commits checked : %d/%d"%(n, len(commits) ), end="\r")
        # "-1" stops after the requested commit; only the exit status matters here.
        status = _run_quiet(["git", "log", "-1", sha], cwd=folder)
        commit_results.append((sha, status))
    commits_df = pd.DataFrame(commit_results, columns=['Commit', "Status"])
    print("")

    # SAVE RESULTS (LOCAL)
    project = {
        'project': project_name,
        'commits': commits_df,
        'success': commits_df[commits_df['Status']==0]['Commit'].count(),
        'errors': commits_df[commits_df['Status']!=0]['Commit'].count()
    }

    # SAVE RESULTS (PICKLE FILE)
    with open(pickle_file, 'wb') as f:
        pickle.dump(project, f)

In [3]:
# Make sure the folder that receives the per-project pickle files is present
if os.path.exists(results_path):
    print("results/ folder exists")
else:
    print("Create folder results/")
    os.mkdir(results_path)

results/ folder exists


In [4]:
# Check every project directory left by the previous experiment.
# The directory listing is filtered up front, before any project is processed.
for project_name in list(filter(
        lambda entry: os.path.isdir(analysis_path+entry),
        os.listdir(analysis_path))):
    check_git_project(project_name)

Project: roller
 -> Project already checked
Project: myfaces-portlet-bridge
 -> Project exist in local folder!
 -> Commits checked : 27/27
Project: myfaces-html5
 -> Project already checked
Project: jena
 -> Project already checked
Project: maven-surefire
 -> Project already checked
Project: log4j-extras
 -> Project already checked
Project: servicemix4-bundles
 -> Project already checked
Project: qpid-proton
 -> Project already checked
Project: maven-enforcer
 -> Project already checked
Project: tuscany-sca-2.x
 -> Project already checked
Project: myfaces
 -> Project already checked
Project: webservices-commons-xmlschema
 -> Project already checked
Project: kalumet
 -> Project already checked
Project: mahout
 -> Project already checked
Project: oltu
 -> Project already checked
Project: uima-addons
 -> Project already checked
Project: maven-doxia
 -> Project already checked
Project: tika
 -> Project already checked
Project: oozie
 -> Project already checked
Project: streams
 -> Project 

In [5]:
def getCommitInfo(project_name, commit_hash):
    """Return ``(abbreviated_hash, author_date, subject)`` for one commit.

    The values come from ``git log -1`` run inside the project's clone under
    ``projects_path``; the date is printed with ``--date=iso8601``.

    Args:
        project_name: Name of the project directory inside ``projects_path``.
        commit_hash: Commit to describe (full or abbreviated hash).

    Returns:
        A 3-tuple of strings: abbreviated hash, author date, subject line.

    Raises:
        ValueError: if git cannot resolve the commit or prints no output.
    """
    folder  = "%s/%s/" % (projects_path, project_name)
    result = subprocess.run(
        ["git", "log", "-1", "--pretty=format:%h|=|%ad|=|%s", "--date=iso8601", commit_hash],
        cwd=folder,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        encoding="utf-8",
        errors="replace",  # commit subjects are not guaranteed to be valid UTF-8
    )
    first_line = result.stdout.split("\n")[0]
    if result.returncode != 0 or not first_line:
        raise ValueError(
            "Could not read commit %s of project %s" % (commit_hash, project_name)
        )
    # Split at most twice so a subject that itself contains "|=|" stays in one piece.
    return tuple(first_line.split("|=|", 2))

In [6]:
def createConfigFile(project):
    """Write the JSON config file of a project whose commits were checked.

    Args:
        project: Result dict as pickled by ``check_git_project``; the keys read
            here are ``project`` (project name) and ``commits`` (DataFrame with
            a ``Commit`` column holding the commit hashes).

    The file ``<config_path>/<project name>-config.json`` is written with the
    project name, its GitHub URL, the list of commits (8-character hash, date
    and comment) and the hash of the last commit of that list.
    """
    project_name = project['project']

    commits = []
    # Read the commits from the argument, not from a notebook-level global.
    for sha in project['commits']['Commit']:
        # Look the commit up by the hash exactly as stored, so an ambiguous
        # 8-character prefix cannot make git fail; only the stored c_hash is shortened.
        _, date, comment = getCommitInfo(project_name, sha)
        commits.append({
            "c_hash": sha[0:8],
            "date": date,
            "comment": comment
        })

    config={
        'project': project_name,
        'git_url': "https://github.com/apache/%s.git" % project_name,
        # An empty commit list has no last commit.
        'last_commit': commits[-1]["c_hash"] if commits else None,
        'experiment': 1,
        'commits': commits
    }

    with open('%s/%s-config.json'%(config_path,project_name), 'w') as outfile:
        json.dump(config, outfile, indent=4)

In [7]:
# Create the config folder if it does not exist yet. config_path is nested,
# so makedirs is used to create any missing parent folder as well.
if not os.path.exists(config_path):
    print("Create folder %s"%config_path)
    os.makedirs(config_path, exist_ok=True)

In [8]:
# Collect one summary row per checked project and create the config file of
# every project whose commits were all found.
projects = []
for project_name in [p for p in os.listdir(analysis_path) if os.path.isdir(analysis_path+p)]:
    pickle_file = results_path+project_name+".pickle"
    if not os.path.exists(pickle_file):
        continue

    # Open the same path that was just checked (built from results_path).
    # NOTE(review): pickle.load must only be used on trusted files; these are
    # expected to be the ones written by check_git_project.
    with open(pickle_file, 'rb') as f:
        data = pickle.load(f)

    percent_of_success = (data['success']/data['commits']['Commit'].count())*100
    projects.append((data['project'], data['success'], data['errors'],len(data['commits']), percent_of_success))

    if os.path.exists('%s/%s-config.json'%(config_path,project_name)):
        print("Config file of %s project already exist"%project_name)
    elif percent_of_success == 100:
        print("Created config file of project %s"%project_name)
        createConfigFile(data)
    else:
        print("Project %s has not all the commits defined"%project_name)

Config file of roller project already exist
Config file of myfaces-portlet-bridge project already exist
Config file of myfaces-html5 project already exist
Config file of jena project already exist
Config file of maven-surefire project already exist
Config file of log4j-extras project already exist
Config file of servicemix4-bundles project already exist
Config file of qpid-proton project already exist
Config file of maven-enforcer project already exist
Config file of tuscany-sca-2.x project already exist
Project myfaces has not all the commits defined
Config file of webservices-commons-xmlschema project already exist
Config file of kalumet project already exist
Config file of mahout project already exist
Config file of oltu project already exist
Config file of uima-addons project already exist
Config file of maven-doxia project already exist
Project tika has not all the commits defined
Config file of oozie project already exist
Project streams has not all the commits defined
Config fil

In [10]:
# View a quick summary of this step
df = pd.DataFrame(projects, columns = ['Project', 'Success', 'Fail', 'Total', '% SUCCESS'])
# Number of projects per success percentage. It is printed explicitly because
# only the last expression of a cell is rendered automatically.
print(df.groupby("% SUCCESS")['Project'].count())
df

Unnamed: 0,Project,Success,Fail,Total,% SUCCESS
0,roller,3130,0,3130,100.000000
1,myfaces-portlet-bridge,27,0,27,100.000000
2,myfaces-html5,77,0,77,100.000000
3,jena,2680,0,2680,100.000000
4,maven-surefire,1677,0,1677,100.000000
5,log4j-extras,213,0,213,100.000000
6,servicemix4-bundles,7228,0,7228,100.000000
7,qpid-proton,1364,0,1364,100.000000
8,maven-enforcer,411,0,411,100.000000
9,tuscany-sca-2.x,4310,0,4310,100.000000
