In [None]:
import ijson
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
from matplotlib.dates import MonthLocator

# Input datasets, passed to the plots as automated, semi-manual and manual
# patches respectively.
fileType1 = 't1.json'
fileType2 = 't2.json'
fileType3 = 't3.json'

# Values of the ``type`` column that get their own line in the
# semi-manual plot.
tools = [
    'checkpatch',
    'sparse',
    'linux driver verification',
    'smatch',
    'coverity',
    'cppcheck',
    'strace',
    'syzkaller',
    'kasan',
]

def datasetReader(filename):
    """Load every element of the top-level JSON array stored in *filename*.

    The file is opened in binary mode because ijson parses bytes; passing a
    text-mode file object is deprecated in current ijson releases.

    Returns a list holding one parsed object per array element.
    """
    with open(filename, 'rb') as f:
        # ijson.items is lazy, so the list has to be built while f is open.
        rows = list(ijson.items(f, 'item'))
    return rows
def parse_full_date(row):
    """Return ``row["authorDate"]`` (epoch seconds) as a UTC ``YYYY-MM`` string."""
    epoch_seconds = int(row["authorDate"])
    return datetime.datetime.utcfromtimestamp(epoch_seconds).strftime("%Y-%m")

def prepareDataFrame(ds):
    """Count commits per author month.

    Builds a DataFrame from *ds*, rewrites its ``authorDate`` column to
    ``YYYY-MM`` labels via ``parse_full_date`` and tallies the labels.

    Returns ``(x, y)``: the month labels in ascending order and the number of
    rows carrying each label. The number of distinct months is printed.
    """
    frame = pd.DataFrame.from_dict(ds)

    # Replace every raw timestamp with its month label.
    frame["authorDate"] = frame.apply(parse_full_date, axis=1)

    # Occurrences per label, ordered by label.
    per_month = frame["authorDate"].value_counts().sort_index()
    print(len(per_month))

    return per_month.index.values, per_month.values

def prepareDataFrameSemiManual(ds):
    """Build one monthly commit-count series per entry of ``tools``.

    *ds* is converted to a DataFrame and split on its ``type`` column.

    Returns a list parallel to ``tools``. Each element is ``[x, y]`` where
    ``x`` holds the ``YYYY-MM`` labels in ascending order and ``y`` the number
    of rows of that tool per label. A tool without any row yields two empty
    arrays instead of failing.
    """
    commits = pd.DataFrame.from_dict(ds)

    xy = list()
    for tool in tools:
        # Copy the selection so the column rewrite below works on an
        # independent frame rather than on a slice of ``commits``.
        subset = commits[commits.type == tool].copy()

        # Row-wise apply on an empty frame does not return a Series, so the
        # conversion is skipped when there is nothing to convert.
        if not subset.empty:
            subset["authorDate"] = subset.apply(parse_full_date, axis=1)

        # Occurrences per month label, ordered by label.
        counts = subset["authorDate"].value_counts().sort_index()
        xy.append([counts.index.values, counts.values])
    return xy

def parse_full_year(row):
    """Return ``row["authorDate"]`` (epoch seconds) as a UTC ``YYYY`` string."""
    epoch_seconds = int(row["authorDate"])
    return datetime.datetime.utcfromtimestamp(epoch_seconds).strftime("%Y")

def prepareDataFrame2(ds,total):
    """Express the yearly commit counts of *ds* as percentages of *total*.

    Parameters:
        ds: records carrying an ``authorDate`` field in epoch seconds.
        total: the denominator. Either a single number used for every year,
            or a mapping from ``YYYY`` label to that year's total (years
            missing from the mapping count as 0).

    Returns ``(x, y)``: the year labels in ascending order and, for each, the
    value ``100 * commits_in_year / total``. A zero denominator yields 0.
    The number of distinct years is printed.
    """
    # This cell does not import numpy at the top, so bring it in locally.
    import numpy as np

    commits = pd.DataFrame.from_dict(ds)
    if commits.empty:
        return np.asarray([]), np.asarray([])

    # Same conversion as parse_full_year, applied to the column directly.
    years = commits["authorDate"].map(
        lambda ts: datetime.datetime.utcfromtimestamp(int(ts)).strftime("%Y"))

    # Occurrences per year label, ordered by label.
    counts = years.value_counts().sort_index()
    print(len(counts))

    x = counts.index.values
    per_year_totals = hasattr(total, 'get')
    yList = list()
    for year, count in zip(x, counts.values):
        denominator = total.get(year, 0) if per_year_totals else total
        if denominator != 0:
            # 100.0 forces true division on Python 2 as well.
            yList.append(100.0 * int(count) / denominator)
        else:
            yList.append(0)
    return x, np.asarray(yList)

def temporalDistOfCommits(ds1,ds2,ds3):
    """Plot the monthly commit counts of the three datasets on one axis.

    Each dataset is reduced with ``prepareDataFrame`` and drawn as one line
    (automated, semi-manual, manual). The figure is written to
    ``plots/TemporalDistCommits.pdf``.
    """
    series = (
        (ds1, 'b', 'Automated Patches'),
        (ds2, 'y', 'Semi-manual Patches'),
        (ds3, 'r', 'Manual Patches'),
    )
    # Reduce all three datasets before any figure is created.
    monthly = [prepareDataFrame(dataset) for dataset, _, _ in series]

    plt.figure()
    axes = plt.subplot2grid((1, 1), (0, 0))

    for (months, counts), (_, colour, caption) in zip(monthly, series):
        axes.plot_date(months, counts, 'b-', color=colour, label=caption,
                       linewidth=10)

    for tick_label in axes.xaxis.get_ticklabels():
        tick_label.set_rotation(90)

    # One major tick every six months, labelled as year-month.
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    axes.xaxis.set_major_locator(MonthLocator(interval=6))

    plt.xlabel('Author Dates', fontsize=20)
    plt.ylabel('# commits', fontsize=20)
    plt.tick_params(labelsize=16)

    plt.subplots_adjust(left=0.03, bottom=0.15, right=0.98, top=0.95,
                        wspace=0.1, hspace=0)
    plt.legend(bbox_to_anchor=(0.25, 1), loc=0, borderaxespad=0., fontsize=20)
    plt.ion()
    figure = plt.gcf()
    figure.set_size_inches(20, 10, forward=True)
    plt.savefig("plots/TemporalDistCommits.pdf", dpi=100)
    
def temporalDistOfSemiManual(ds2):
    """Plot the monthly commit counts of the semi-manual dataset per tool.

    ``prepareDataFrameSemiManual`` supplies one ``[x, y]`` pair per entry of
    ``tools``; each pair becomes one line labelled with the tool name. The
    figure is written to ``plots/TemporalDistCommitsSemiManual.pdf``.
    """
    xy = prepareDataFrameSemiManual(ds2)

    plt.figure()
    ax1 = plt.subplot2grid((1,1),(0,0))
    otherColors = ['#00A8F0','#C0D800','#CB4B4B','#4DA74D','#9440ED','#800080','#737CA1','#E4317F','#7D0541','#4EE2EC'
                   ,'#6698FF','#437C17','#7FE817','#FBB117']

    # One line per tool, however many tools are configured. The palette is
    # reused from the start if there are more tools than colours.
    for position, (tool, (x, y)) in enumerate(zip(tools, xy)):
        ax1.plot_date(x, y, 'b-', color=otherColors[position % len(otherColors)],
                      label=tool, linewidth=5)

    for label in ax1.xaxis.get_ticklabels():
        label.set_rotation(90)

    # One major tick every six months, labelled as year-month.
    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax1.xaxis.set_major_locator(MonthLocator(interval=6))

    plt.xlabel('Author Dates', fontsize=20)
    plt.ylabel('# commits', fontsize=20)
    plt.tick_params(labelsize=16)

    plt.subplots_adjust(left=0.03,bottom=0.15,right=0.98, top=0.95 , wspace= 0.1, hspace=0)
    plt.legend(bbox_to_anchor=(0.25, 1), loc=0, borderaxespad=0., fontsize=20)
    plt.ion()
    fig = plt.gcf()
    fig.set_size_inches(20,10, forward=True)
    plt.savefig("plots/TemporalDistCommitsSemiManual.pdf",dpi=100)
    

    
if __name__ == '__main__':
    # Read the three datasets in order, then draw both temporal plots.
    dsType1, dsType2, dsType3 = map(
        datasetReader, (fileType1, fileType2, fileType3))

    #git rev-list --count v2.6.12..v4.8
    total = 616291

    temporalDistOfCommits(dsType1, dsType2, dsType3)
    temporalDistOfSemiManual(dsType2)


In [None]:
import ijson
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticker
import re
import numpy as np
from scipy import arange

fileType1= 't1.json'
fileType2= 't2.json'
fileType3= 't3.json'


'''dirList = ['arch','block','certs','crypto','Documentation',
           'drivers','firmware','fs','include','init',
           'ipc','kernel','lib','mm','net','samples',
           'scripts','security','sound','tools','usr','virt','staging']'''


# Bar-chart categories: the directories of dirListNew plus the two derived
# buckets 'staging' and 'other'.
dirList = ['arch','drivers','fs','include',
           'kernel','net','sound','staging','other']

# Directories that are counted individually.
dirListNew = ['arch','drivers','fs','include',
           'kernel','net','sound']

# The entries were written without separating commas, which made Python
# concatenate them into a single string; they are separate names.
xList = ['arch','block','certs','crypto','Documentation',
           'drivers','firmware','fs','include','init',
           'ipc','kernel','lib','mm','net','samples',
           'scripts','security','sound','tools','usr','virt']
'''
# structure extraction code for ijson
def structureExtractor(filename):
    with open(filename, 'r') as f:
        objects = ijson.parse(f)
        for prefix, event, value in objects:
            print(prefix)
            print(event)
            print(value)
            print('end')
'''
def datasetReader(filename):
    """Load every entry of every record's ``paths`` array from *filename*.

    The ijson prefix ``item.paths.item`` selects the elements of the ``paths``
    array inside each element of the top-level array. The file is opened in
    binary mode because ijson parses bytes; passing a text-mode file object
    is deprecated in current ijson releases.

    Returns a flat list of the selected objects.
    """
    with open(filename, 'rb') as f:
        # ijson.items is lazy, so the list has to be built while f is open.
        rows = list(ijson.items(f, 'item.paths.item'))
    return rows

def prepareDataFrame(ds):
    """Count the records of *ds* per directory bucket.

    *ds* is converted to a DataFrame whose ``rootDir`` and ``path`` columns
    are read as strings.

    Returns ``(labels, counts)``. ``labels`` is ``dirList`` as an array.
    ``counts`` holds, in order: one count per name in ``dirListNew`` (rows
    under ``drivers/staging`` are left out of the ``drivers`` count), the
    number of rows whose path starts with ``drivers/staging``, and the number
    of rows whose ``rootDir`` matches none of ``dirListNew``.
    The intermediate results are printed.
    """
    commits = pd.DataFrame.from_dict(ds)

    # Rows whose root directory matches none of the individually counted ones.
    patNew = '|'.join(map(re.escape, dirListNew))
    others = commits[commits.rootDir.str.contains(patNew) == False]
    others = others["rootDir"].count()
    print(others)

    # Computed once on the full frame so every mask below shares its index.
    in_staging = commits.path.str.contains('^drivers/staging')

    yList = list()
    for name in dirListNew:
        in_dir = commits.rootDir.str.contains(name)
        if name == 'drivers':
            # Staging drivers get their own bucket further down.
            in_dir = in_dir & (in_staging == False)
        yList.append(commits[in_dir]['rootDir'].count())

    stagDf = commits[in_staging]['path'].count()

    xNew = np.asarray(dirList)

    # Append the two derived buckets in the order dirList names them.
    yList = np.append(yList, [stagDf])
    yList = np.append(yList, [others])
    print(xNew)
    print(yList)
    return xNew,yList

def rangeForLabels(x, offset=1):
    """Return the integers from *offset* up to and including ``len(x)``."""
    upper_bound = len(x) + 1
    return np.arange(offset, upper_bound, 1)
 

def spatialDistOfCommits(ds1,ds2,ds3):
    """Draw a grouped bar chart of commit counts per directory bucket.

    Each dataset is reduced with ``prepareDataFrame`` and drawn as one bar
    series (automated, semi-manual, manual) over the categories in
    ``dirList``. The figure is written to ``plots/SpatialDistCommits.pdf``.
    """
    series = [
        (prepareDataFrame(ds1), 'b', 'Automated Patches'),
        (prepareDataFrame(ds2), 'y', 'Semi-manual Patches'),
        (prepareDataFrame(ds3), 'r', 'Manual Patches'),
    ]

    def project(labels, values):
        """Re-order *values* to follow dirList; absent categories stay 0."""
        first_seen = {}
        for label, value in zip(labels.tolist(), values):
            first_seen.setdefault(label, value)
        projected = np.zeros(len(dirList))
        for position, name in enumerate(dirList):
            if name in first_seen:
                projected[position] = int(first_seen[name])
        return projected

    plt.figure()
    ax = plt.subplot2grid((1,1),(0,0))

    # Each series is looked up against its own labels. The third series used
    # to be guarded by the labels of the second one.
    positions = rangeForLabels(dirList)
    for slot, ((labels, values), colour, caption) in enumerate(series):
        # Shift every series by one bar width so the bars sit side by side.
        ax.bar(positions + .2 * slot, project(labels, values), width=0.2,
               color=colour, label=caption, alpha=.5)

    ax.set_xticks(positions)
    ax.set_xticklabels(dirList)
    ax.set_yticks([0,100,500,1000,2000,3000,4000,5000, 6000, 7000,8000])

    for label in ax.xaxis.get_ticklabels():
        label.set_rotation(45)

    plt.xlabel('Folders',fontsize=20)
    plt.ylabel('# commits',fontsize=20)
    plt.tick_params(labelsize=16)

    plt.subplots_adjust(left=0.05,bottom=0.18,right=0.98, top=0.95 , wspace= 0.1, hspace=0)
    plt.legend(bbox_to_anchor=(0.25, 1), loc=0, borderaxespad=0., fontsize=20)
    plt.ion()
    fig = plt.gcf()
    fig.set_size_inches(20,10, forward=True)

    plt.savefig("plots/SpatialDistCommits.pdf",dpi=100)
    
def spatialDistOfCommitsPercen(ds1,ds2,ds3):
    """Draw a grouped bar chart of each dataset's share per directory bucket.

    Each dataset is reduced with ``prepareDataFrame``, projected onto the
    categories in ``dirList`` and scaled so that its bars add up to 100. A
    dataset without any counted row is drawn as all zeros. The figure is
    written to ``plots/SpatialDistCommitsPercen.pdf``.
    """
    series = [
        (prepareDataFrame(ds1), 'b', 'Automated Patches'),
        (prepareDataFrame(ds2), 'y', 'Semi-manual Patches'),
        (prepareDataFrame(ds3), 'r', 'Manual Patches'),
    ]

    def project(labels, values):
        """Re-order *values* to follow dirList; absent categories stay 0."""
        first_seen = {}
        for label, value in zip(labels.tolist(), values):
            first_seen.setdefault(label, value)
        projected = np.zeros(len(dirList))
        for position, name in enumerate(dirList):
            if name in first_seen:
                projected[position] = int(first_seen[name])
        return projected

    def as_percentages(counts):
        """Scale *counts* to percentages of their sum; all zeros if the sum is 0."""
        whole = np.sum(counts, axis=0)
        if whole == 0:
            return np.zeros(len(counts))
        return counts * 100 / whole

    plt.figure()
    ax = plt.subplot2grid((1,1),(0,0))

    # Each series is looked up against its own labels. The third series used
    # to be guarded by the labels of the second one.
    positions = rangeForLabels(dirList)
    for slot, ((labels, values), colour, caption) in enumerate(series):
        # Shift every series by one bar width so the bars sit side by side.
        ax.bar(positions + .2 * slot, as_percentages(project(labels, values)),
               width=0.2, color=colour, label=caption, alpha=.5)

    ax.set_xticks(positions)
    ax.set_xticklabels(dirList)

    for label in ax.xaxis.get_ticklabels():
        label.set_rotation(45)

    plt.xlabel('Folders',fontsize=20)
    plt.ylabel('Percentages',fontsize=20)
    plt.tick_params(labelsize=16)

    plt.subplots_adjust(left=0.05,bottom=0.18,right=0.98, top=0.95 , wspace= 0.1, hspace=0)
    plt.legend(bbox_to_anchor=(0.25, 1), loc=0, borderaxespad=0., fontsize=20)
    plt.ion()
    fig = plt.gcf()
    fig.set_size_inches(20,10, forward=True)

    plt.savefig("plots/SpatialDistCommitsPercen.pdf",dpi=100)
        
if __name__ == '__main__':
    # Read the three datasets in order, then draw both spatial plots.
    dsType1, dsType2, dsType3 = map(
        datasetReader, (fileType1, fileType2, fileType3))

    spatialDistOfCommits(dsType1, dsType2, dsType3)
    spatialDistOfCommitsPercen(dsType1, dsType2, dsType3)


In [None]:
def prepareDataFrameSha(ds):
    """Count the records of *ds* per directory bucket, printing each bucket.

    Works like the bucket counting done for the spatial plots, but also
    prints the rows (and their ``commits`` attribute) selected for every
    bucket.

    Returns ``(labels, counts)``. ``labels`` is ``dirList`` as an array.
    ``counts`` holds one count per name in ``dirListNew`` (rows under
    ``drivers/staging`` are left out of ``drivers``), then the
    ``drivers/staging`` count, then the count of rows whose ``rootDir``
    matches none of ``dirListNew``.
    """
    commits = pd.DataFrame.from_dict(ds)

    # Rows whose root directory matches none of the individually counted ones.
    patNew = '|'.join(map(re.escape, dirListNew))
    others = commits[commits.rootDir.str.contains(patNew) == False]
    print(others)
    # NOTE(review): a later cell selects a column named 'commit' (singular);
    # confirm that 'commits' really exists on this frame.
    print(others.commits)
    others = others["rootDir"].value_counts().sort_index().values.sum()

    yList = list()
    for i in dirListNew:
        drivers = commits[commits.rootDir.str.contains(i)]
        if i == 'drivers':
            # Build the mask from the frame it is applied to, so both share
            # one index; staging drivers get their own bucket further down.
            drivers = drivers[drivers.path.str.contains('^drivers/staging') == False]
        print(i)
        print(drivers.commits)
        yList.append( drivers['rootDir'].value_counts().sort_index().values.sum() )

    # Rows whose path lies under drivers/staging.
    stag = commits[commits.path.str.contains('^drivers/staging')]
    stagDf = stag['path'].value_counts().sort_index()
    stagDf = stagDf.values.sum()

    xNew = np.asarray(dirList)

    # Append the two derived buckets in the order dirList names them.
    yList = np.append(yList, [stagDf])
    yList = np.append(yList, [others])
    return xNew,yList

In [None]:
def datasetReader(filename):
    """Load the records of *filename* with their ``paths`` entry flattened.

    ``ijson.items`` returns an iterator over the records, so it cannot be
    subscripted directly; every record is handled on its own instead. The
    ``paths`` entry is removed from each record and, when it is non-empty,
    replaced by the keys that ``common_entries`` derives from it.

    Returns the list of updated records.
    """
    rows = list()
    with open(filename, 'rb') as f:
        for o in ijson.items(f, 'item'):
            paths = o.pop('paths', None)
            if paths:
                o.update(common_entries(*paths))
            rows.append(o)
    return rows

In [None]:
def datasetReader(filename):
    """Load every element of the top-level JSON array stored in *filename*.

    The file is opened in binary mode because ijson parses bytes; passing a
    text-mode file object is deprecated in current ijson releases.

    Returns a list holding one parsed object per array element.
    """
    with open(filename, 'rb') as f:
        # ijson.items is lazy, so the list has to be built while f is open.
        rows = list(ijson.items(f, 'item'))
    return rows

In [None]:
# Read the first dataset file into ds1.
ds1 = datasetReader(fileType1)

In [None]:
# Turn the loaded records into a DataFrame for inspection.
commits = pd.DataFrame.from_dict(ds1)

In [None]:
# Bare expression: shows the DataFrame as the cell's output.
commits

In [None]:
import pickle
import csv
def saveList(aFile,aList):
    """Pickle *aList* into the file named *aFile*, replacing its contents."""
    serialized = pickle.dumps(aList)
    with open(aFile, "wb") as handle:
        handle.write(serialized)
def loadList(aFile):
    """Return the object unpickled from the file named *aFile*."""
    with open(aFile, "rb") as handle:
        return pickle.load(handle)

In [None]:
# saveList takes the file name first and the object to pickle second.
saveList('RQ0-Spatial', dsAllN)

In [None]:
# Export only the commit, path and rootDir columns as CSV.
commits[['commit','path','rootDir']].to_csv('RQ0-Spatial')

In [None]:
# Read all three dataset files first, then normalise each of them.
ds1, ds2, ds3 = map(datasetReader, (fileType1, fileType2, fileType3))

ds1N, ds2N, ds3N = map(normJson, (ds1, ds2, ds3))

In [None]:
# Concatenate the three normalised datasets and export them as one CSV table.
# (A stray bare name ``save`` used to sit here and raised NameError.)
dsAllN = ds1N + ds2N +ds3N
commits = pd.DataFrame.from_dict(dsAllN)
commits.to_csv('RQ0-Spatial')

In [None]:
def normJson(ds):
    """Flatten the ``paths`` entry of every record in *ds*, in place.

    For each record the ``paths`` entry is removed and the keys that
    ``common_entries`` derives from its elements are merged into the record.
    Records whose ``paths`` entry is missing or empty are left without it.

    Returns *ds* itself (the same list, with its records modified).
    """
    for record in ds:
        paths = record.pop('paths', None)
        if not paths:
            # Nothing to merge, and common_entries indexes its first argument.
            continue
        record.update(common_entries(*paths))
    return ds
    

In [None]:
    # Scratch copy of the bucket-counting steps, run against the
    # cell-level ``commits`` frame.

    # Rows whose root directory matches none of the individually counted ones.
    patNew = '|'.join(map(re.escape, dirListNew))
    others = commits[commits.rootDir.str.contains(patNew) == False]
    print(others)
    # NOTE(review): an earlier cell selects a column named 'commit'
    # (singular); confirm that 'commits' really exists on this frame.
    print(others.commits)
    others = others["rootDir"].value_counts().sort_index().values.sum()

    yList = list()
    for i in dirListNew:
        drivers = commits[commits.rootDir.str.contains(i)]
        if i == 'drivers':
            # Build the mask from the frame it is applied to, so both share
            # one index; staging drivers are counted separately below.
            drivers = drivers[drivers.path.str.contains('^drivers/staging') == False]
        print(i)
        print(drivers.commits)
        yList.append( drivers['rootDir'].value_counts().sort_index().values.sum() )

    # Rows whose path lies under drivers/staging.
    stag = commits[commits.path.str.contains('^drivers/staging')]
    stagDf = stag['path'].value_counts().sort_index()
    stagDf = stagDf.values.sum()

    # Category labels for the x axis.
    xList = dirList
    xNew = np.asarray(xList)

In [None]:
def common_entries(*dcts):
    """Classify the ``path`` of every given dict into a directory bucket.

    Parameters:
        *dcts: dicts that are expected to share a ``path`` key.

    Returns ``{'folder': [...]}`` with one bucket name per dict, in argument
    order, when ``path`` is a key of every dict; otherwise an empty dict
    (also when called without arguments). The bucket is ``'staging'`` for
    paths starting with ``drivers/staging``, the first path component when
    it is listed in ``dirListNew``, and ``'others'`` for everything else.
    """
    if not dcts:
        return dict()

    # Only keys present in every dict are considered.
    shared_keys = set(dcts[0]).intersection(*dcts[1:])
    if 'path' not in shared_keys:
        return dict()

    folderList = list()
    for entry in dcts:
        path = entry['path']
        if path.startswith('drivers/staging'):
            # Staging is checked first because its first component is
            # 'drivers' as well.
            folderList.append('staging')
            continue
        top_level = path.split('/', 1)[0]
        if top_level in dirListNew:
            folderList.append(top_level)
        else:
            folderList.append('others')
    return {'folder': folderList}
        