In [20]:
import subprocess
import pandas as pd
from datetime import datetime
import re
import math

In [2]:
import os

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np
import seaborn as sns

import matplotlib.colors as colors

%matplotlib inline

<h1>Low-level utilities</h1>

In [2]:
# Persists the 'dataset_df' dataframe as a CSV file under the given 'directory'. The file is treated as a
# point-in-time snapshot: its name is today's date (formatted 'yymmdd'), a space, and the given 'snapshot_name'.
# Example: if today is August 5, 2019 and the 'snapshot_name' parameter is "ticket_enriched_commits", then the
# dataframe is saved to "<directory>/190805 ticket_enriched_commits.csv"
#
# -dataset_df: the dataframe to persist. If it is None, nothing is written.
# -directory: a string, the path of the folder that receives the CSV file
# -snapshot_name: a string, the logical name of the snapshot
def snapshot_dataset(dataset_df, directory, snapshot_name):
    if dataset_df is None:
        return # Nothing to persist

    # We prefix filenames with strings like '190628' for June 28, 2019 (date of extract)
    date_tag = datetime.today().strftime('%y%m%d')
    csv_name = date_tag + ' ' + snapshot_name + '.csv'
    dataset_df.to_csv('/'.join([directory, csv_name]))

In [10]:
# Returns a dataframe for a dataset snapshot previously saved in the given directory, or None if no snapshot
# exists for any of the date prefixes in the 'dates' list.
#
# -directory: a string, for the full path for the directory where the dataset was previously saved as a snapshot
# -dates: a list of date prefixes (a prefix is a string like '190805' for an August 5, 2019 timestamp). These
#         define the acceptable timestamps for the dataset we seek. Only one dataset is returned: the prefixes
#         are tried in the order given and the first one for which a file exists wins. So to get the most
#         recent snapshot, list the prefixes from latest to earliest.
# -snapshot_name: a string, corresponding to the logical name of the snapshot we seek.
def load_dataset_snapshot(directory, dates, snapshot_name):
    for date_prefix in dates:
        filename = directory + '/' + date_prefix + ' ' + snapshot_name + '.csv'
        if not os.path.isfile(filename):
            continue # No snapshot for this date, try the next one

        # When reading CSV file, explicitly state dtype for columns where the act of reading the
        # CSV file might trigger a warning of "mixed types". Each time we get such a warning,
        # add one more column here where we specify the dtype for that column so warning is avoided
        df = pd.read_csv(filename, dtype={'Body': str,
                                          'Symbolic Link': str})

        # 'Unnamed: 0' is the index column that to_csv() writes by default. Drop it when present, but do not
        # fail on a CSV that was written without it.
        return df.drop(columns=['Unnamed: 0'], errors='ignore')

    # If we get this far, we didn't find a snapshot for any of the date_prefixes
    return None

In [13]:
# Displays a barchart plot. The x-axis labels are taken from the first column of the 'commits_df' parameter, and
# the y-axis from the values in the second column of 'commits_df'. However, there is a twist: the bars are
# sorted in order of decreasing y-values, and then rendered. Thus the bar chart is decreasing unless the
# 'cumulative' parameter is set to True, in which case each bar shows the running total divided by the grand
# total (so the last bar is 1.0).
#
# -commits_df: a dataframe with two columns: developer (a string), and an integer column for the number of commits
#  done by that developer.
# -cumulative: boolean to determine whether the y-axis represents the cumulative share of commits
def chartCommits(commits_df, cumulative=False):
    label_col, value_col = list(commits_df.columns)[:2]

    # Index the values by the first column: a bar plot labels its bars with the index, so without this the
    # x-axis would show the dataframe's row index instead of the developers.
    ranked = commits_df.set_index(label_col)[value_col].sort_values(ascending=False)
    if cumulative:
        ranked = ranked.cumsum() / ranked.sum()
    ranked.plot(kind='bar', figsize=[8, 4])

In [17]:
# Renders 'df' as an annotated seaborn heatmap on a new 10x6 figure.
#
# -df: dataframe holding the data. The column and index headers will be used as the x and y axis values for
#      the heatmap.
# -title: text placed at the top of the figure
# -bounds: an np.array of numbers, passed as the 'boundaries' of a BoundaryNorm over 256 colors; that norm
#          decides which color each cell value gets
def plotHeatmap(df, title, bounds):
    color_norm = colors.BoundaryNorm(boundaries=bounds, ncolors=256)
    figure, axes = plt.subplots(figsize=(10, 6))

    # Cell values are printed with no decimals ('.0f')
    sns.heatmap(df,
                annot=True,
                fmt='.0f',
                linewidths=.05,
                norm=color_norm,
                ax=axes)

    figure.subplots_adjust(top=0.93) # Leave room for the title
    figure.suptitle(title, fontsize=14)

In [19]:
# Searches whether a string appears in the requirements behind a dataframe of commits. Specifically, this
# returns two lists and a dataframe:
#    1) a list of the distinct 'Subject' fields for rows in 'commits_df' that contain the given 'word'
#    2) a sorted list of the distinct 'Ticket(s)' values of those rows
#    3) a dataframe obtained by filtering 'commits_df' down to those rows
#
# Rows without a ticket (the 'Ticket(s)' value is missing) never match, and neither do rows whose 'Subject'
# is not a string (for example a missing subject, which pandas represents as NaN).
#
# The lists may have different lengths since a given ticket may appear in the subject of multiple commits
#
# -commits_df: a dataframe with commit information per row, including columns called 'Subject' and 'Ticket(s)'
# -word: a string that we are searching for in the 'commits_df'
def searchInReqs(commits_df, word):
    subjects = set()
    tickets = set()
    included_rows = []
    for subject, ticket in zip(commits_df['Subject'], commits_df['Ticket(s)']):
        # A missing ticket is not a real ticket, so such rows are skipped
        has_ticket = not (ticket is None or (isinstance(ticket, float) and math.isnan(ticket)))
        is_match = has_ticket and isinstance(subject, str) and word in subject
        if is_match:
            subjects.add(subject)
            tickets.add(ticket)
        included_rows.append(is_match)

    # Filter with an index-aligned boolean Series. A plain list would be read as a list of column names when
    # it is empty, which would strip every column from an empty 'commits_df'.
    mask = pd.Series(included_rows, index=commits_df.index, dtype=bool)
    filtered_df = commits_df[mask]

    # Sort tickets list, easier to cross-reference
    return list(subjects), sorted(tickets), filtered_df

<h1>Library to ingest data from GIT repos</h1>

In [3]:
class GitLogStatics:
    # Holder for the constants shared by the code that requests a git log and the code that parses it: the
    # labels that prefix each field of a commit record, and the format string built out of those labels.

    def __init__(self):
        pass

    # Marker for the first line of each commit record
    START_COMMIT_RECORD = '______START_COMMIT_RECORD______'

    # Labels for the fields of a commit record, in the order in which they appear in 'FORMAT'
    COMMITID            = 'CommitId'
    SUBJECT             = 'Subject'
    AUTHOR              = 'Author'
    AUTHOR_EMAIL        = 'Author_email'
    AUTHOR_DATE         = 'Author_date'
    COMMITTER           = 'Committer'
    COMMITTER_EMAIL     = 'Committer_email'
    COMMITTER_DATE      = 'Committer_date'
    BODY_STARTS         = 'Body_Starts'
    END_BODY            = 'End_Body'
    FILES_CHANGED       = 'Files_changed'

    # It is important that the 'FORMAT' string contain no spaces. Otherwise the subprocess.communicate() call will
    # fail silently and produce no output. For that reason we use a '.' (dot) in lieu of ' ' (a space).
    # Each '%n' starts a new line; every label is followed by dots and then by the placeholder for its value.
    FORMAT = ''.join((
        '%Cred', START_COMMIT_RECORD, '%Creset',
        '%n', COMMITID,        '..........%H',
        '%n', SUBJECT,         '...........%s',
        '%n', AUTHOR,          '............%an',
        '%n', AUTHOR_EMAIL,    '......%ae',
        '%n', AUTHOR_DATE,     '.......%ad',
        '%n', COMMITTER,       '.........%cn',
        '%n', COMMITTER_EMAIL, '...%ce',
        '%n', COMMITTER_DATE,  '....%cd',
        '%n', BODY_STARTS,     '.......%b',
        '%n', END_BODY,
        '%n', FILES_CHANGED,
    ))

In [4]:
class ChangedLine():
    # Describes how one file was changed: the file's name plus the counts derived from a raw summary of
    # its changes, such as '109 +++++--?'.

    def __init__(self):
        self.filename    = None
        self.loc         = None # leading number of the raw summary (0 if there is none)
        self.loc_added   = None
        self.loc_removed = None
        self.loc_changed = None
        self.other       = None # the raw summary itself, when it does not start with a number

    # Populates this object from a filename and from the raw text describing the changes to that file
    def buildUp(self, filename, raw_loc_changes):
        self.filename = filename.strip()

        parsed = self._parseLocChanges(raw_loc_changes)
        (self.loc, self.loc_added, self.loc_removed, self.loc_changed, self.other) = parsed

    # Returns a tuple (loc, added, removed, changed, other).
    #
    # -raw: a string like '109 +++++--?' or 'Bin 0 -> 5190 bytes'. When it starts with a number, that number is
    #       the 'loc' and it is split among added/removed/changed in the proportions in which '+', '-' and '?'
    #       occur in 'raw' (all three are 0 if none of those symbols occur), and 'other' is None.
    #       Otherwise all the counts are 0 and 'other' is 'raw' itself.
    def _parseLocChanges(self, raw):
        leading_number = re.match('[0-9]+', raw)
        if leading_number is None:
            return 0, 0, 0, 0, raw

        loc = int(leading_number.group(0))
        plus_count, minus_count, query_count = (raw.count(symbol) for symbol in '+-?')
        symbol_total = plus_count + minus_count + query_count
        if symbol_total == 0:
            return loc, 0, 0, 0, None

        # Ratios among '+', '-', '?' indicate what proportion of the 'loc'-many lines were added, removed, or changed
        added   = loc * plus_count / symbol_total
        removed = loc * minus_count / symbol_total
        changed = loc * query_count / symbol_total
        return loc, added, removed, changed, None

In [5]:
class CommitParcel():
    # Holds the information of one commit, parsed out of the lines of one commit record. The record is
    # expected to follow the layout of 'GitLogStatics.FORMAT', followed by one line per changed file.

    def __init__(self):
        self.commitId        = None
        self.subject         = None
        self.author          = None
        self.author_email    = None
        self.author_date     = None
        self.committer       = None
        self.committer_email = None
        self.committer_date  = None
        self.body            = None
        self.files_changed   = [] # one ChangedLine per changed file

    # Populates this object from the lines of one commit record.
    #
    # -commit_lines: a list of strings. Lines 1 to 8 must each start with the label of the field expected at
    #                that position, and line 9 with the label that starts the body; otherwise an
    #                AssertionError is raised. Line 0 is not read (presumably it is the start-of-record marker).
    def buildUp(self, commit_lines):
        self.commitId        = self._stripLabel(commit_lines[1], GitLogStatics.COMMITID)
        self.subject         = self._stripLabel(commit_lines[2], GitLogStatics.SUBJECT)
        self.author          = self._stripLabel(commit_lines[3], GitLogStatics.AUTHOR)
        self.author_email    = self._stripLabel(commit_lines[4], GitLogStatics.AUTHOR_EMAIL)
        self.author_date     = self._stripLabel(commit_lines[5], GitLogStatics.AUTHOR_DATE)
        self.committer       = self._stripLabel(commit_lines[6], GitLogStatics.COMMITTER)
        self.committer_email = self._stripLabel(commit_lines[7], GitLogStatics.COMMITTER_EMAIL)
        self.committer_date  = self._stripLabel(commit_lines[8], GitLogStatics.COMMITTER_DATE)

        next_idx = self._readBody(commit_lines)
        self._readFilesChanged(commit_lines, next_idx)

    # Sets self.body to the text between the BODY_STARTS label and the END_BODY line, joining multiple
    # lines with '\n'. Returns the index of the END_BODY line (or len(commit_lines) if there is none).
    def _readBody(self, commit_lines):
        idx = 9
        body_lines = [self._stripLabel(commit_lines[idx], GitLogStatics.BODY_STARTS)]

        idx += 1
        while idx < len(commit_lines) and not self._hasLabel(commit_lines[idx], GitLogStatics.END_BODY):
            body_lines.append(commit_lines[idx])
            idx += 1

        self.body = '\n'.join(body_lines)
        return idx

    # Appends to self.files_changed one ChangedLine per line, from 'start_idx' onwards, that contains a '|'.
    # Such a line is like ' path/to/file.java | 109 +++++--'. Lines without a '|' are skipped.
    def _readFilesChanged(self, commit_lines, start_idx):
        for line in commit_lines[start_idx:]:
            if '|' not in line:
                continue
            # Split on the LAST '|' only, so that a path that itself contains a '|' is kept whole
            filename, raw_loc_changes = line.rsplit('|', 1)
            change = ChangedLine()
            change.buildUp(filename.strip(), raw_loc_changes.strip())
            self.files_changed.append(change)

    # Returns what is left of 'line' after removing the 'label' it starts with and the dots that follow it,
    # stripped of surrounding whitespace. Raises an AssertionError if 'line' does not start with 'label'.
    def _stripLabel(self, line, label):
        REGEX = re.escape(label) + '[.]*' # like 'Subject...........'
        m = re.match(REGEX, line)
        assert m is not None, line
        return line[m.end():].strip()

    # Returns True if 'line' starts with 'label', and False otherwise
    def _hasLabel(self, line, label):
        return line.startswith(label)

In [6]:
class FilenameParser:
    # Breaks the path of an artifact in a GIT repo into classification fields: layer, submodule, package type,
    # package, classname, artifact type and artifact family. Call parse() to populate them.
    #
    # -filename: a string, for the full path in a GIT repo for an artifact, with '/' separators. Might look like:
    #
    #     'MisysPD/UniversalBanking/BFUBInfrastructure/src/com/misys/ub/fatoms/batch/common/uxEnhancement/BatchUXEnhancementAccumulator.java'
    #
    # -layer_depth: integer stating how many folders in the filename's path correspond to the highest level (self.layer)
    #               as opposed to a submodule. In the above example a 'layer_depth' of 2 results in
    #
    #           self.layer     = 'MisysPD/UniversalBanking'
    #           self.submodule = 'BFUBInfrastructure'
    #
    # -package_folders: a list of strings, each of them being the name used for folders containing packages.
    #              For example, for Java the 'package_folder' is usually '[src]' or possibly '[src, src-gen, lib, dist]'
    #              if there are generated Java source files as well as other artifacts besides Java files
    # -artifact_families: Used to produce a more meaningful artifact_type and artifact_family in cases where:
    #                     1) an additional suffix is appended at the end which might be spurious, like a '.bak' suffix.
    #                     2) or we want to explicitly say what name to use for the artifact_family of some artifact_types
    #       More formally: parameter is a dictionary whose values are lists of strings, such as:
    #
    #                          {'java':         ['java'],
    #                           'BF Artefact':  ['bfg', 'ast', 'bod']}
    #
    #                     In this example, any file that ends in 'java', 'java.bak', 'java.foo.bar' will take those values
    #                     as the 'artifact_type', but their 'artifact_family' would be 'java'.
    #                     Also in this example, any file finishing in 'bfg', 'bfg.store', 'bfg.mo', 'ast', 'bod.bak' would
    #                     take those values as the 'artifact_type', and the 'artifact_family' would be 'BF Artefact'.
    #                     An empty value (such as the default) means that there are no special families.
    # -package_prefixes: a list of strings like 'com/misys'. Only used for paths that have none of the
    #                    'package_folders': the package is then taken to start where one of these prefixes occurs.
    #
    def __init__(self, filename, layer_depth=2, package_folders=['src'], artifact_families=[], package_prefixes=[]):
        self.filename               = filename
        self.layer_depth            = layer_depth
        self.package_folders        = package_folders
        # The default is an empty list, which does not have the dictionary interface that
        # _extractArtifactType relies on. So normalise any empty value to an empty dictionary.
        self.artifact_families      = artifact_families if artifact_families else {}
        self.package_prefixes       = package_prefixes

        self.artifact_type          = None
        self.artifact_family        = None
        self.layer                  = None
        self.submodule              = None
        self.package_type           = None
        self.package                = None
        self.classname              = None
        self.symbolic_link          = None

    # Populates the classification fields of this object out of 'self.filename'
    def parse(self):

        #  Comments below are with regards to this example for 'self.filename':
        #
        # 'MisysPD/UniversalBanking/BFUBInfrastructure/src/com/misys/ub/fatoms/batch/common/
        #                                                 uxEnhancement/BatchUXEnhancementAccumulator.java'
        #
        # If 'self.filename' has neither a package folder (like 'src') nor one of the package prefixes in its
        # path, then 'self.package' is just the folder that directly contains the file, 'self.package_type'
        # is empty, and all the folders before that one make up the layer and the submodule.

        massaged_filename, link = self._stripSymbolicLinks()
        self.symbolic_link      = link
        tokens                  = massaged_filename.split('/')

        self._extractArtifactType(tokens) # 'java' for both self.artifact_type and self.artifact_family

        idx_module_ends         = len(tokens) - 2 # Default value, may be changed below if we have a source file
        idx_package_starts      = idx_module_ends # By default we assume there is no 'package_type', so skipping it
        self.package_type       = ''              # Default value, may be changed below if we have a source file
        self.package            = ''              # Default value, may be changed below if we have a source file

        src = self._searchPackageDivider(tokens)
        if src is not None:
            idx_module_ends    = tokens.index(src)
            idx_package_starts = idx_module_ends + 1 # Shift when package starts by 1, to make room for a package_type
        else: # No package divider, but check if we have a match of package prefixes
            idx = self._findWherePrefixStarts(tokens)
            if idx is not None:
                idx_package_starts = idx
                idx_module_ends    = idx

        self.package = '.'.join(tokens[idx_package_starts:-1]) # like 'com.misys.ub.fatoms.batch.common.uxEnhancement'
        if idx_module_ends < idx_package_starts: # There is room for a package_type
            self.package_type = tokens[idx_module_ends] # like 'src'

        module_tokens  = tokens[:idx_module_ends] # like '[MisysPD, UniversalBanking, BFUBInfrastructure]'
        DEPTH          = self.layer_depth
        self.layer     = '/'.join(module_tokens[:DEPTH]) # like 'MisysPD/UniversalBanking'
        self.submodule = '/'.join(module_tokens[DEPTH:]) # like 'BFUBInfrastructure'

        self.classname = tokens[-1] # like 'BatchUXEnhancementAccumulator.java'

    # Returns the token that divides the module from the package (like 'src'), or None if there is none.
    # If several tokens qualify, the one that appears earliest in 'tokens' is returned.
    def _searchPackageDivider(self, tokens):
        best_idx = len(tokens) # deliberately out of bounds, signifying we don't have any matches yet
        for src in self.package_folders:
            # Match things like 'src', 'src-test', 'test-src', or 'api-src'
            REGEX = '(^' + src + '$)|(^' + src + '-[a-zA-Z]+$)|(^[a-zA-Z]+-' + src + '$)'
            for idx, tok in enumerate(tokens):
                if re.search(REGEX, tok) is not None:
                    best_idx = min(best_idx, idx)
                    break # Any later match for this folder name would have a bigger idx
        # Return the best match, if there is one. Else null
        if best_idx < len(tokens):
            return tokens[best_idx]
        return None

    # Finds where in 'tokens' a prefix like 'com/misys' occurs, if at all. The prefixes are tried in the order
    # given; for the first one that occurs, the index of the token where its first occurrence starts is
    # returned. Returns None if none of the prefixes occurs.
    def _findWherePrefixStarts(self, tokens):
        for prefix in self.package_prefixes: # prefix is like 'com/misys'
            if not prefix: # boundary case - ignore empty prefixes
                continue
            prefix_tokens = prefix.split('/')
            width = len(prefix_tokens)
            # Only try the start positions where the full prefix fits in the space of the remaining tokens
            for idx in range(len(tokens) - width + 1):
                if tokens[idx:idx + width] == prefix_tokens:
                    return idx
        # We got this far without matching any prefix to any consecutive subset of the tokens. So no match
        return None

    # Used to massage filenames that contain symbolic links, by replacing the link by the actual file. For example,
    # if 'self.filename' is:
    #
    #     'MisysPD/UniversalBanking/{BFUBRetail/src-gen => ReferencedBOs/src}/com/trapedza/bankfusion/bo/refimpl/IBOUB_BLK_StaticAccountBlock.java'
    #
    # then the symbolic link portion '{BFUBRetail/src-gen => ReferencedBOs/src}' gets replaced by 'ReferencedBOs/src'
    #
    # Returns the massaged filename and the link portion that was replaced ('' if there was none).
    # NOTE(review): '{old => new}' looks like the notation git uses for renamed paths rather than a file
    # system symbolic link - confirm.
    def _stripSymbolicLinks(self):
        REGEX = '{[ _.0-9a-zA-Z/-]* => [ _.0-9a-zA-Z/-]+}'
        m = re.search(REGEX, self.filename)
        if m is None:
            return self.filename, ''

        link = m.group(0) # Like '{BFUBRetail/src-gen => ReferencedBOs/src}'
        link_tokens = link[1:-1].split(' => ') # like '[BFUBRetail/src-gen, ReferencedBOs/src]'
        massaged_filename = self.filename.replace(link, link_tokens[1])
        return massaged_filename, link

    # Sets self.artifact_type and self.artifact_family based on the suffix of the last token (the file's name).
    # Both are '' if the file's name has no '.' in it.
    def _extractArtifactType(self, tokens):
        pathless_filename = tokens[-1] # Strips the pathname, leaving like 'BatchUXEnhancementAccumulator.java'
        words             = pathless_filename.split('.')

        if len(words) < 2: # There is no suffix to the file
            self.artifact_type   = ''
            self.artifact_family = ''
            return

        # If we get this far, a suffix does exist. First test if this filename is in a designated artifact
        # family. Only what comes after the first '.' is considered: the part before it is the file's name,
        # not a suffix, so a file like 'bod.xml' is not mistaken for a 'bod' artifact.
        suffixes = words[1:]
        for family, possible_types in self.artifact_families.items():
            for candidate in possible_types:
                if candidate in suffixes:
                    idx_candidate        = suffixes.index(candidate)
                    self.artifact_type   = '.'.join(suffixes[idx_candidate:])
                    self.artifact_family = family
                    return

        # If we get this far, there are no special artifact families. Use the default implementation
        self.artifact_type   = words[-1] # like 'java' or, if multiple suffixes are used, e.g. like 'properties.bak', returns 'bak'
        self.artifact_family = self.artifact_type
        

In [3]:
# Catalogues all the contents of a GIT repo into a dataframe, one row per artifact
class GitRepoCataloguer:

    # -repo_directory: a string, the path of the folder where the repo is checked out, with '/' separators.
    #                  Its last path component is used as the name of the repo.
    # -layer_depth, package_folders, artifact_families, package_prefixes: handed over to the FilenameParser
    #                  that classifies each file; see the documentation of that class.
    def __init__(self, repo_directory, layer_depth, package_folders, artifact_families, package_prefixes):
        self.repo_directory    = repo_directory

        self.layer_depth       = layer_depth
        self.package_folders   = package_folders
        self.artifact_families = artifact_families
        self.package_prefixes  = package_prefixes
        self.repo_df           = None # Populated when self.catalogueRepoContents is called

    # Populates and returns self.repo_df, with each row containing information about each artifact in self.repo,
    # such as the filename, loc, artifact type, and some module classification based on the filename
    #
    # It also returns an array of the entries that produced errors and an array of ignored entries
    def catalogueRepoContents(self):
        DIR = self.repo_directory

        # Include the -print0 and -0 options to ensure filenames with spaces don't cause problems
        # DOES NOT WORK: raw_catalogue = subprocess.getoutput('find ' + DIR + ' -type f | xargs wc -l').split('\n')
        raw_catalogue = subprocess.getoutput('find ' + DIR + ' -type f -print0 | xargs -0 wc -l').split('\n')

        df, ERRORS, IGNORED = self._build_df(raw_catalogue)
        self.repo_df = df
        return self.repo_df, ERRORS, IGNORED

    # Builds the catalogue dataframe out of the lines of the 'raw catalogue'. Returns the dataframe (one row per
    # line that was processed successfully), the list of lines that were erroneous, and the list of lines that
    # were ignored.
    def _build_df(self, raw_catalogue):
        COLUMNS = ['Repo', 'Artifact Family', 'Artifact Type', 'Submodule', 'Package Type',
                   'Package', 'Classname', 'Layer', 'Loc', 'Symbolic Link', 'Filename']
        ERRORS  = []
        IGNORED = []
        records = [] # one dictionary per artifact, keyed by the names in COLUMNS

        repo_name = self.repo_directory.split('/')[-1]
        for entry in raw_catalogue:
            filename, loc, status_code = self._extractFilenameAndLoc(entry)

            if status_code == 1: # this tells us its an error
                ERRORS.append(entry)
                continue
            if status_code == 2: # this entry can be ignored
                IGNORED.append(entry)
                continue

            # If we get this far, filename should be good
            assert filename is not None

            fp = FilenameParser(filename,
                                layer_depth         = self.layer_depth,
                                package_folders     = self.package_folders,
                                artifact_families   = self.artifact_families,
                                package_prefixes    = self.package_prefixes)
            fp.parse()

            records.append({'Repo':            repo_name,
                            'Artifact Family': fp.artifact_family,
                            'Artifact Type':   fp.artifact_type,
                            'Submodule':       fp.submodule,
                            'Package Type':    fp.package_type,
                            'Package':         fp.package,
                            'Classname':       fp.classname,
                            'Layer':           fp.layer,
                            'Loc':             loc,
                            'Symbolic Link':   fp.symbolic_link,
                            'Filename':        filename})

        # Passing the columns explicitly keeps them (and their order) even if there are no records
        df = pd.DataFrame(records, columns=COLUMNS)
        return df, ERRORS, IGNORED

    # Returns a filename (string), a loc (int), and a status (int).
    # Status values are:
    #     0: processing successful, and filename and loc have valid values
    #     1: processing erroneous, and filename and loc are null
    #     2: processing ignored, since the input 'entry' is ignorable noise. Returned filename and loc are null.
    #
    # The returned filename is the part of the path that follows the repo directory, so it starts with a '/'.
    #
    # -entry: a string, a line in the 'raw catalogue' produced by the shell command 'find DIR -type f | xargs wc -l'
    def _extractFilenameAndLoc(self, entry):
        DIR = self.repo_directory

        tokens = entry.split() # pair like ['1104', 'C:/Alex/Code/Essence/ansible/Essence/roles/oradb-create/templates/dbca-create-db.rsp.11.2.0.4.j2']
        if len(tokens) < 2:
            return None, None, 1

        if not tokens[0].isdigit(): # this is an error, a bad one
            return None, None, 1
        loc = int(tokens[0])

        full_filename = entry.strip()[len(tokens[0]):].strip() # strip *original* entry to preserve spaces in filename, if any

        # Ignore the underlying .git directory. The match is on '.git' as a whole path component, so that
        # files like '.gitignore' or folders like '.github' still make it to the catalogue.
        GIT_DIR = DIR + '/.git'
        if full_filename == GIT_DIR or full_filename.startswith(GIT_DIR + '/'):
            return None, None, 2

        if full_filename == 'total': # ignore the spurious line with the sum of all the line counts
            return None, None, 2

        # Cut at the first occurrence of DIR only, so that a path that contains the same text again further
        # down is not truncated
        _, found_dir, filename = full_filename.partition(DIR)
        if not found_dir:
            return None, None, 1

        # This is the normal case and if get this far things should work
        return filename, loc, 0

    # Saves the self.repo_df if it has been loaded already, as a snapshot named 'Repo Catalogue' under
    # the given 'directory'
    def save(self, directory):
        snapshot_dataset(self.repo_df, directory, 'Repo Catalogue')

In [4]:
# Catalogues indicative information about the artifacts in each of the given repos, such as filenames. For every
# repo that is not empty, the catalogue is stored in a CSV file in the repo's own folder under 'root_data_dir'
# (created if it does not exist yet), with today's timestamp as a prefix to the CSV file's name.
#
# Returns two dictionaries keyed by repo: the entries that produced errors, and the entries that were ignored.
#
# -repos: a list of strings, the names of the repos' folders under 'root_git_dir'
# -root_data_dir: a string, the folder under which the catalogues get saved
# -root_git_dir: a string, the folder under which the repos are checked out
# -package_folders, artifact_families, package_prefixes: handed over to the GitRepoCataloguer of each repo
def catalogueAllRepos(repos, root_data_dir, root_git_dir,
                         package_folders, artifact_families, package_prefixes):
    errors_by_repo = {}
    ignored_by_repo = {}
    print(''.join(['*** Cataloguing repos under ', root_git_dir, '***']))
    for repo in repos:
        repo_dir = '/'.join([root_git_dir, repo])
        data_dir = '/'.join([root_data_dir, repo])
        if not os.path.isdir(data_dir):
            os.mkdir(data_dir)

        print(''.join(['Cataloguing ', repo, '...']))
        cataloguer = GitRepoCataloguer(repo_dir,
                                       layer_depth=2,
                                       package_folders=package_folders,
                                       artifact_families=artifact_families,
                                       package_prefixes=package_prefixes)
        df, errors, ignored = cataloguer.catalogueRepoContents()
        errors_by_repo[repo] = errors
        ignored_by_repo[repo] = ignored

        error_count = str(len(errors))
        ignored_count = str(len(ignored))
        if df.index.size != 0:
            cataloguer.save(data_dir)
            print(''.join(['...done cataloguing ', repo, '; found ', str(len(df.index)),
                           ' artifacts, ', error_count, ' ERRORS, and another ', ignored_count,
                           ' where safely ignored']))
        else:
            print(''.join(['...', repo, ' is empty; Nothing will be saved. Errors=', error_count,
                           ', Ignored=', ignored_count]))

    return errors_by_repo, ignored_by_repo

In [5]:
class GitLogParser:
    """Runs 'git log --stat' inside a repo and turns the output into two dataframes.

    -git_directory: a string with the full path to the directory in which the git project exists and git
                    commands can be run successfully. For example, 'c:/Alex/Code/Essence/ubrepos'
    -repo: value written to the 'Repo' column of every row produced.
    -after_date: either None, or a string in format MM/DD/YY, such as '06/01/18' for the 1st of June, 2018.
                 Passed to git as '--after' to only include commits after that date.
    -before_date: like 'after_date', but passed to git as '--before'.
    -max_commits: integer used to restrict how many commits to retrieve (git's '-n'). If None, no
                  restriction is imposed.
    -diff_line_width: integer passed as '--stat-width'. Should be big enough that paths of changed files
                      are not truncated, since the full paths are needed to match changed files to
                      entries in the git repo. The default value of 450 should usually be enough.
    -layer_depth: integer stating how many leading folders of a filename's path make up the 'layer' as
                  opposed to the 'submodule'; e.g. with a depth of 2 the layer could be
                  'MisysPD/UniversalBanking' and the submodule 'BFUBInfrastructure'.
    -package_folders, artifact_families, package_prefixes: see documentation of the FilenameParser class.
                  When omitted they default to ['src'], [] and [] respectively; a fresh list is created
                  per instance so that instances never share (and mutate) one default list.
    """

    # Column order of the dataframe built by _build_raw_df (one row per <commit, file> pair).
    _RAW_COLUMNS = ('Repo', 'CommitId(s)', 'Artifact Family', 'Artifact Type',
                    'Submodule', 'Package Type', 'Package', 'Classname',
                    'Loc', 'Loc+', 'Loc-', 'Loc?', 'Loc other', 'Subject', 'Body',
                    'Layer', 'Symbolic Link', 'Filename',
                    'Author(s)', 'Author(s) e-mail', 'Author(s) date',
                    'Committer(s)', 'Committer(s) e-mail', 'Comitter(s) date')

    # Column order of the dataframe built by _build_commits_df (one row per commit).
    _COMMIT_COLUMNS = ('Repo', 'CommitId(s)', '# files changed',
                       'Loc', 'Loc+', 'Loc-', 'Loc?', 'Subject', 'Body',
                       'Author(s)', 'Author(s) e-mail', 'Author(s) date',
                       'Committer(s)', 'Committer(s) e-mail', 'Comitter(s) date')

    def __init__(self, git_directory, repo, after_date, before_date=None, max_commits=None,
                 diff_line_width   =450,
                 layer_depth       =2,
                 package_folders   =None,
                 artifact_families =None,
                 package_prefixes  =None):
        self.git_directory     = git_directory
        self.repo              = repo
        self.after_date        = after_date
        self.before_date       = before_date
        self.max_commits       = max_commits
        self.diff_line_width   = diff_line_width
        self.layer_depth       = layer_depth
        self.package_folders   = ['src'] if package_folders is None else package_folders
        self.artifact_families = [] if artifact_families is None else artifact_families
        self.package_prefixes  = [] if package_prefixes is None else package_prefixes
        # State filled in by parse(); kept on the instance because it is handy when debugging.
        self.log_tokens        = None
        self.git_command       = None
        self.parcels           = None

    def _buildGitCommand(self):
        """Returns the 'git log' command line, as one string, for the configured filters."""
        filters = ''
        if self.after_date is not None:
            filters += ' --after=' + self.after_date
        if self.before_date is not None:
            filters += ' --before=' + self.before_date
        if self.max_commits is not None:
            filters += ' -n ' + str(self.max_commits)

        return ('git log --date=short --format=' + GitLogStatics.FORMAT
                + ' --stat --stat-width=' + str(self.diff_line_width)
                + filters)

    def _generateLogTokens(self):
        """Runs the git command in self.git_directory and stores its output lines in self.log_tokens."""
        self.git_command = self._buildGitCommand()
        # The command is split on whitespace, so none of its parts may contain spaces.
        process = subprocess.Popen(self.git_command.split(), cwd=self.git_directory,
                                   stdout=subprocess.PIPE)
        output, _ = process.communicate()
        # NOTE(review): git's exit status is not checked, so a failing git command looks
        # like an empty log to the caller.
        # errors='replace': a commit message that is not valid UTF-8 must not abort the whole parse.
        self.log_tokens = output.decode('utf-8', errors='replace').split('\n')

    def parse(self):
        """Runs git and returns the pair (raw_df, commits_df).

        raw_df has one row per <commit, changed file> pair; commits_df has one row per commit.
        """
        self._generateLogTokens()
        self.parcels = self._parcelOutCommits(self.log_tokens)
        return self._build_raw_df(), self._build_commits_df()

    def _build_raw_df(self):
        """Builds the dataframe with one row per <commit, changed file> pair in self.parcels.

        Each changed filename is run through FilenameParser to derive the artifact, layer,
        submodule, package and classname columns.
        """
        columns = {name: [] for name in self._RAW_COLUMNS}

        for parcel in self.parcels:
            for changed in parcel.files_changed:
                fp = FilenameParser(changed.filename,
                                    layer_depth         = self.layer_depth,
                                    package_folders     = self.package_folders,
                                    artifact_families   = self.artifact_families,
                                    package_prefixes    = self.package_prefixes)
                fp.parse()

                row = {
                    'Repo':                self.repo,
                    'CommitId(s)':         parcel.commitId,
                    'Artifact Family':     fp.artifact_family,
                    'Artifact Type':       fp.artifact_type,
                    'Submodule':           fp.submodule,
                    'Package Type':        fp.package_type,
                    'Package':             fp.package,
                    'Classname':           fp.classname,
                    'Loc':                 changed.loc,
                    'Loc+':                changed.loc_added,
                    'Loc-':                changed.loc_removed,
                    'Loc?':                changed.loc_changed,
                    'Loc other':           changed.other,
                    'Subject':             parcel.subject,
                    'Body':                parcel.body,
                    'Layer':               fp.layer,
                    'Symbolic Link':       fp.symbolic_link,
                    'Filename':            changed.filename,
                    'Author(s)':           parcel.author,
                    'Author(s) e-mail':    parcel.author_email,
                    'Author(s) date':      parcel.author_date,
                    'Committer(s)':        parcel.committer,
                    'Committer(s) e-mail': parcel.committer_email,
                    'Comitter(s) date':    parcel.committer_date,
                }
                for name in self._RAW_COLUMNS:
                    columns[name].append(row[name])

        return pd.DataFrame(columns)

    def _build_commits_df(self):
        """Builds the dataframe with one row per commit in self.parcels.

        The Loc columns are the sums of the per-file figures over the commit's changed files.
        """
        columns = {name: [] for name in self._COMMIT_COLUMNS}

        for parcel in self.parcels:
            files = parcel.files_changed
            row = {
                'Repo':                self.repo,
                'CommitId(s)':         parcel.commitId,
                '# files changed':     len(files),
                'Loc':                 sum(f.loc for f in files),
                'Loc+':                sum(f.loc_added for f in files),
                'Loc-':                sum(f.loc_removed for f in files),
                'Loc?':                sum(f.loc_changed for f in files),
                'Subject':             parcel.subject,
                'Body':                parcel.body,
                'Author(s)':           parcel.author,
                'Author(s) e-mail':    parcel.author_email,
                'Author(s) date':      parcel.author_date,
                'Committer(s)':        parcel.committer,
                'Committer(s) e-mail': parcel.committer_email,
                'Comitter(s) date':    parcel.committer_date,
            }
            for name in self._COMMIT_COLUMNS:
                columns[name].append(row[name])

        return pd.DataFrame(columns)

    def _parcelOutCommits(self, tokens):
        """Splits the log lines into one CommitParcel per commit, in order of appearance."""
        cursor = 0
        parcelled_commits = []
        while cursor < len(tokens):
            found = self._getNextCommitLines(tokens, cursor)
            if found is None:
                break # We are done, didn't find a parcel
            commit_lines, cursor = found
            parcel = CommitParcel()
            parcel.buildUp(commit_lines)
            parcelled_commits.append(parcel)
        return parcelled_commits

    def _getNextCommitLines(self, tokens, cursor):
        """Finds the next commit record at or after position 'cursor'.

        Returns None when no commit marker remains. Otherwise returns the pair
        (lines, next_cursor): 'lines' starts with the commit marker line and runs up to, but
        excluding, the following marker (or the end of 'tokens'); 'next_cursor' is the index
        just past those lines.
        """
        marker = GitLogStatics.START_COMMIT_RECORD
        total = len(tokens)

        # Skip anything before the next commit marker
        while cursor < total and tokens[cursor] != marker:
            cursor += 1
        if cursor >= total: #No more tokens to see
            return None
        commit_start_idx = cursor

        # Search for where the commit ends. The bound is checked before indexing so that
        # a marker sitting on the very last line does not run off the end of the list.
        cursor += 1
        while cursor < total and tokens[cursor] != marker:
            cursor += 1

        return tokens[commit_start_idx: cursor], cursor

In [6]:
class GitLogAggregationEngine():
    """Aggregates parsed git-log data to file level, module level and a 'volatile modules' view.

    -raw_df: a DataFrame with one row per <commit, filename> pair, as built by GitLogParser.parse
    -commits_df: a DataFrame with one row per commit, as built by GitLogParser.parse

    The aggregations are stored on the instance (artifacts_df, modules_df, vol_df) as they are
    computed; each stays None until its builder method has been called.
    """

    def __init__(self, raw_df, commits_df):
        self.raw_df       = raw_df
        self.commits_df   = commits_df

        self.artifacts_df = None
        self.modules_df   = None
        self.vol_df       = None

    def save_all(self, directory):
        """Saves, as dated CSV snapshots under 'directory', all the dataframes held by 'self'.

        Dataframes that are still None are skipped by snapshot_dataset.
        """
        snapshots = [
            (self.commits_df,   'commits_parsed_git_log'),
            (self.raw_df,       'raw_parsed_git_log'),
            (self.artifacts_df, 'by_artifact_parsed_git_log'),
            (self.modules_df,   'by_module_parsed_git_log'),
            (self.vol_df,       'by_volatility_parsed_git_log'),
        ]
        for dataset_df, snapshot_name in snapshots:
            snapshot_dataset(dataset_df, directory, snapshot_name)

    def _padNullGroupingKeys(self, df, columns):
        """Replaces nulls with '' in the given columns of 'df' (the dataframe itself is updated).

        DataFrame.groupby drops rows for which a grouping dimension is null, so the keys are padded
        first. The column is assigned back explicitly: a chained 'df[col].fillna(..., inplace=True)'
        only updates a temporary object when pandas Copy-on-Write is active, leaving 'df' untouched.
        """
        for col in columns:
            df[col] = df[col].fillna('')

    def aggregateByArtifact(self):
        """Aggregates self.raw_df to one row per <Repo, Filename, Package Type>.

        The result is sorted by '# commits' (descending), stored in self.artifacts_df and returned.
        """
        grouping = ['Repo', 'Filename', 'Package Type']
        self._padNullGroupingKeys(self.raw_df, grouping)

        artifacts_df      = self.raw_df.groupby(grouping).apply(self._collapseFilenameMultiplicities)
        artifacts_df      = artifacts_df.sort_values(by=['# commits'], ascending=False)
        artifacts_df      = artifacts_df.reset_index()
        cols              = list(artifacts_df.columns)
        cols.remove('level_3')        # Left-over inner index of the one-row frames built per group
        cols              = cols[1:] + cols[:1]   # Moves the first column ('Repo') to the end
        self.artifacts_df = artifacts_df[cols]

        return self.artifacts_df

    def aggregateByModule(self):
        """Aggregates to one row per <Repo, Layer, Submodule, Package Type>.

        The result is sorted by '# commits' (descending), stored in self.modules_df and returned.
        """
        if self.artifacts_df is not None: # Optimization: re-use partial aggregation already done to file level
            input_df = self.artifacts_df
        else:
            input_df = self.raw_df # Do from scratch: aggregate from each <commit, file> pair

        grouping = ['Repo', 'Layer', 'Submodule', 'Package Type']
        self._padNullGroupingKeys(input_df, grouping)

        modules_df = input_df.groupby(grouping).apply(self._collapseModuleMultiplicities)
        modules_df = modules_df.reset_index()
        cols = list(modules_df.columns)
        cols.remove('level_4')        # Left-over inner index of the one-row frames built per group
        modules_df = modules_df[cols].sort_values(by=['# commits'], ascending=False)

        self.modules_df = modules_df

        return self.modules_df

    def buildVolatility_df(self, loc_limit):
        """Returns a dataframe, 1 row per module, of the modules whose changed loc exceeds 'loc_limit'.

        Requires self.modules_df to have been computed already (for example, by calling
        self.aggregateByModule); returns None otherwise. The result is sorted by 'Loc' (descending)
        and also stored in self.vol_df.

        -loc_limit: an integer stating the lower bound for how many lines of code (loc) must have
                    changed for a module to qualify as volatile
        """
        m_df = self.modules_df
        if m_df is None:
            return None
        vol_df = m_df[m_df['Loc'] > loc_limit]
        vol_df = vol_df.sort_values(by=['Loc'], ascending=False)

        self.vol_df = vol_df
        return self.vol_df

    def _collapseFilenameMultiplicities(self, df):
        """Collapses the rows of one <Repo, Filename, Package Type> group into a one-row dataframe."""
        result_df = pd.DataFrame()

        # For any columns that have a unique value, take that value
        for col in df.columns:
            # !! Ignore the columns by which we are grouping by, lest they appear twice post-aggregation and cause trouble
            if col in ['Repo', 'Filename', 'Package Type']:
                continue
            if (df[col].unique().size==1):
                result_df[col] = [df[col].iloc[0]]

        # For other columns, we must aggregate the math.
        self._aggregateMetrics(df, result_df)

        return result_df

    def _collapseModuleMultiplicities(self, df):
        """Collapses the rows of one <Repo, Layer, Submodule, Package Type> group into a one-row dataframe."""
        result_df = pd.DataFrame()

        result_df['Artifact Types'] = [list(df['Artifact Type'].unique())]
        result_df['# files changed'] = [df['Filename'].unique().size]
        result_df['files changed'] = [list(df['Filename'])]

        # Unlike _collapseFilenameMultiplicities, single-valued columns are not carried over here:
        # df[col].unique() crashes if a value is not hashable, like a list.

        self._aggregateMetrics(df, result_df)

        return result_df

    def _aggregateMetrics(self, input_df, result_df):
        """Writes the aggregated metrics of 'input_df' into the one-row 'result_df'.

        Adds the commit and author counts, the summed Loc columns and the merged (duplicate-free)
        'Loc other', 'CommitId(s)', 'Author(s)' and 'Author(s) e-mail' values.
        """
        author_list                   = self._mergeLists(list(input_df['Author(s)'])) #Avoid duplicates
        commit_ids                    = self._mergeLists(list(input_df['CommitId(s)']))

        result_df['# commits']        = [self._countMerged(commit_ids)]
        result_df['# authors']        = [self._countMerged(author_list)]
        result_df['Loc']              = [input_df['Loc'].sum()]
        result_df['Loc+']             = [input_df['Loc+'].sum()]
        result_df['Loc-']             = [input_df['Loc-'].sum()]
        result_df['Loc?']             = [input_df['Loc?'].sum()]

        result_df['Loc other']        = [self._mergeLists(list(input_df['Loc other']))] #Avoid duplicates
        result_df['CommitId(s)']      = [commit_ids]
        result_df['Author(s)']        = [author_list]
        result_df['Author(s) e-mail'] = [self._mergeLists(list(input_df['Author(s) e-mail']))] #Avoid duplicates

    def _countMerged(self, merged):
        """Number of distinct values represented by a result of _mergeLists.

        _mergeLists hands back a bare scalar when there is a single distinct value, so anything
        that is not a list counts as 1 (taking len() of a lone name would count its characters).
        """
        if type(merged) is list:
            return len(merged)
        return 1

    def _mergeLists(self, list_of_elts):
        """Merges a list of elements, defaulting to scalars for empty or singleton results.

        Elements which are themselves lists are concatenated into the result, whereas elements
        that are not lists are treated as scalars and inserted into the result. Duplicates are
        dropped, keeping the order in which values were first seen. If a single distinct value
        remains, that value itself is returned; if nothing remains, '' is returned; otherwise the
        list of distinct values is returned.

        -list_of_elts: a list where each element is either a string or another list
        """
        raw_merge = []
        for elt in list_of_elts:
            if type(elt)==list:
                raw_merge.extend(elt)
            else:
                raw_merge.append(elt)
        # dict.fromkeys drops duplicates like set() would, but yields the same order on every run
        no_duplicates_merge = list(dict.fromkeys(raw_merge))
        if len(no_duplicates_merge)==0:
            return ''
        if len(no_duplicates_merge)==1:
            return no_duplicates_merge[0]
        return no_duplicates_merge

In [2]:
def parseLogsFromAllRepos(repos, root_data_dir, root_git_dir, after_date, max_commits,
                         package_folders, artifact_families, package_prefixes, volatility_threshold):
    """Per repo, accesses the GIT logs and produces and saves dataframes for later use.

    The dataframes are not returned, just saved. What is returned is the GitLogParser instance
    used for the last repo processed (it contains some state that may be of interest when
    debugging), or None when 'repos' is empty.

    Besides the per-commit dataframe, the frames that are produced and saved are:
     1) a 'raw' dataframe that has one row for each <commit, filename> pair in the GIT logs
     2) an 'artifacts' dataframe that has one row per filename, aggregating the 'raw' dataframe
     3) a 'modules' dataframe with one row per 'submodule', where 'submodule' is inferred during
        parsing time based on the filename, by stripping the portion of the filename prior to the
        classname-related portions at the end of the filename, and the first two directories in
        the front of the filename, considered a 'layer'
     4) a 'volatility' dataframe, obtained by filtering the 'modules' dataframe based on a
        threshold of how many LOC changed in the submodule in question

    Nothing is saved for a repo without commits. For a repo whose commits touch no files, no
    aggregation is attempted before saving.

    Parameters:
    -repos: a list of strings, each corresponding to the name of a directory for a GIT repo.
    -root_data_dir: a string corresponding to the parent directory under which the produced
                    dataframes should be saved, organized in subfolders given by the repo names.
                    Missing folders are created.
    -root_git_dir: the absolute path for a directory under which the repos exist. So GIT repos are
                   subdirectories of this one.
    -after_date: a string in format MM/DD/YY, such as '06/01/18' for the 1st of June, 2018.
                 Used to filter log entries to only include commits after that date.
    -max_commits: this is either None (no effect), or an integer. When it is an integer, then it
                  sets the maximum number of commits to be retrieved from GIT logs. This is useful
                  to limit the amount of information received and speed up performance, as for
                  example when testing this function.
    -package_folders, artifact_families, package_prefixes: data structures used when parsing
                  filenames to infer what is the classname, submodule, artifact family, etc. for
                  each. Refer to the documentation of the FilenameParser class.
    -volatility_threshold: an int, defining the level of LOC that must have changed in a module for
                  it to be considered volatile
    """
    print('Parsing logs for repos under ' + root_git_dir + '...')
    glog = None # Stays None when there are no repos to process
    for repo in repos:
        git_dir  = root_git_dir + '/' + repo
        data_dir = root_data_dir + '/' + repo
        print('Processing ' + repo + '...')
        glog = GitLogParser(git_dir, repo, after_date, max_commits=max_commits,
                            package_folders   =package_folders,
                            artifact_families =artifact_families,
                            package_prefixes  =package_prefixes
                           )
        raw_df, commits_df = glog.parse()
        if len(commits_df.index) == 0:
            print('...there are no commits for ' + repo + '; Nothing will be saved')
            continue

        # Something will be saved, so make sure the repo's output folder exists
        os.makedirs(data_dir, exist_ok=True)
        agg = GitLogAggregationEngine(raw_df, commits_df)
        if len(raw_df.index) == 0:
            print('...there are no files in the commits for ' + repo + '; only the commits dataframe will be saved')
            agg.save_all(data_dir)
        else:
            # Each call stores its result on 'agg', which is what save_all persists
            agg.aggregateByArtifact()
            agg.aggregateByModule()
            agg.buildVolatility_df(volatility_threshold)
            agg.save_all(data_dir)
            print('...done processing ' + repo)

    return glog

<h1>Library to enrich ingested GIT data with ticket information</h1>

In [11]:
# -dates: list of date_prefix like '190626' when csv files are time-tagged, listed in order in which to search first for
#         existence of a data file with that tag
def loadLogDataForAllRepos(repos, root_data_dir, dates):
    """Loads, for every repo, the saved snapshot of each known snapshot type.

    Returns a dictionary keyed by snapshot name whose values are dictionaries mapping a repo
    name to the dataframe loaded for it. A repo for which no snapshot is found is left out of
    the inner dictionary and a warning is printed.

    -repos: a list of repo names; the data for each is looked up under '<root_data_dir>/<repo>'
    -root_data_dir: a string, parent directory of the per-repo data folders
    -dates: list of date prefixes, passed to load_dataset_snapshot
    """
    snapshot_types = ['raw_parsed_git_log',
                      'by_artifact_parsed_git_log',
                      'by_volatility_parsed_git_log',
                      'by_module_parsed_git_log',
                      'Repo Catalogue']
    result_dict = {name: {} for name in snapshot_types}
    for snapshot_name, per_repo in result_dict.items():
        for repo in repos:
            loaded = load_dataset_snapshot(root_data_dir + '/' + repo, dates, snapshot_name)
            if loaded is not None:
                per_repo[repo] = loaded
                continue
            print("*** WARNING: FOUND NO '{}' DATA FOR REPO '{}'".format(snapshot_name, repo))
    return result_dict

In [10]:
def mergeDataframesAcrossRepos(dataframe_dict):
    """Stacks the per-repo dataframes into one dataframe with a 'Repo' column.

    -dataframe_dict: a dictionary mapping a repo name to that repo's dataframe

    Returns the concatenation of all the dataframes, where the 'Repo' column of each row is the
    key under which its dataframe was found (an existing 'Repo' column is overwritten in the
    result). The dataframes passed in are left untouched. An empty dictionary results in an
    empty dataframe.
    """
    # assign() returns a tagged copy, so the caller's dataframes do not grow a 'Repo' column
    tagged = [df.assign(Repo=repo) for repo, df in dataframe_dict.items()]
    if not tagged:
        return pd.DataFrame() # pd.concat refuses an empty list
    return pd.concat(tagged, sort=True) #sort=True set to avoid a warning when non-concat axis don't align

In [14]:
# Returns two arrays of strings. First array is the ticket ids, such as: ['FBP-48732', 'FBE 4.2.6.1']
# Second array is the ticket family, such as ['FBP', 'FBE']
#
# -bad_families: an array of string. Used to flag strings that are not valid ticket families, but which the algorithm
#               can't help itself to think it is the family token of a ticket id. For example, consider this
#               commit subject: 
#                                     '[CMFBE-1563]-Updating scripts for 5315 HF1 builds'
# 
#               The algorithm will think there are two ticket ids ('CMFBE-1563' and 'for 5315'), but only the first
#               is valid. So to help the algorithm in a case like this, include the string 'for' in the 'bad_families'
#               list parameter.
#
# -treat_as_equal_families: dictionary, mapping strings to strings. It is used to replace a family for a user
#               ticket when the family is a tweak on what the family should really be. For example, consider this
#               commit subject:
#
#                                   'FBE PaymentsFBPY-4087 User Exit in Inward Payments processing'
#
#               The algorithm will think that 'PaymentsFBPY' is the family for ticket 'PaymentsFBPY-4087', when
#               in reality the ticket should be 'FBPY-4087' in family 'FBPY'. To hint to the algorithm that this
#               is what it should be doing, add an entry in treat_as_equal_families where 'PaymentsFBPY' is the key and
#               'FBPY' is the value
def extractTickets(subject, bad_families, treat_as_equal_families):
    """Extracts the ticket ids mentioned in a commit subject, together with their families.

    Returns a pair of lists of equal length: the ticket ids (such as 'FBP-48732') and, position by
    position, the family of each ticket (such as 'FBP').

    -subject: a string, the subject line of a commit
    -bad_families: a collection of strings that must never be taken for a ticket family; a
                   tentative ticket whose family is in it is discarded.
    -treat_as_equal_families: a dictionary mapping a mis-detected family to the family it should
                   really be; the first occurrence of the key in the ticket is replaced by the value.
    """
    # NOTE(review): inside END_DELIM the sequence ',-:' is a character range (from ',' to ':'), so
    # digits, '.' and '/' also count as end delimiters. Left as is, since existing results and
    # 'bad_families' settings may depend on it.
    END_DELIM     = r'[\]\s,-:]+'
    START_DELIM   = r'[^\[\s,/{]+'
    MIDDLE_DELIM  = r'[:\s-]+'

    # A ticket is terminated by an end delimiter or by the end of the text. Without the '$'
    # alternative a ticket at the very end of the subject would have to give up its last digit to
    # serve as the 'delimiter', and so would come out truncated.
    REGEX = ('(' + START_DELIM + '[a-zA-Z]{2,6}' + MIDDLE_DELIM + '[.0-9]{2,12})'
             + '(?:' + END_DELIM + '|$)')
    INNER_REGEX = '[a-zA-Z]+-[0-9]+'

    tentative_tickets = re.findall(REGEX, subject)

    # First need to loop through the tickets to 'clean them up'. Sometimes the pattern above picks
    # up an alleged 'ticket' that is a big string, bigger than the actual ticket. So we check
    # whether each contains an 'inner ticket', in which case the inner one is considered to be the
    # real one. As an example, if the subject is
    #             '[FBP-46915]FBP-46915-multi-entity-entity-based-permission'
    #
    # the pattern will think that the ticket is 'FBP-46915]FBP-46915'. The following loop cleans
    # this up by replacing the ticket with the inner ticket 'FBP-46915'
    unwrapped_tickets = []
    for ticket in tentative_tickets:
        inners = re.findall(INNER_REGEX, ticket)
        if len(inners) > 0 and len(inners[0]) < len(ticket): # Found a smaller 'inner ticket', so must be the real one
            unwrapped_tickets.append(inners[0])
        else:
            unwrapped_tickets.append(ticket)

    validated_tickets = []
    families = []
    for ticket in unwrapped_tickets:
        family = _extractTicketFamily(ticket)

        if family in bad_families: # This is not a real ticket, we were fooled
            continue

        if family in treat_as_equal_families: # This family is a tweak of the real family, so fix that
            corrected_family = treat_as_equal_families[family]
            ticket = ticket.replace(family, corrected_family, 1)
            family = corrected_family

        # Didn't get a good family, and perhaps we are in a spurious case. If a prior good ticket
        # was already recorded for this subject then this one can be ignored.
        if family == '' and len(validated_tickets) > 0:
            continue

        # There are cases where the above will result in spurious tickets. For example, if the subject is
        #
        #   '[FBLE-9658]:Yoma - 5.3.1 - Lending Non-Paged Queries.',
        #
        # then 'FBLE-9658]:Yoma - 5.3.1' is taken for the ticket and 'FBLE-9658]' for the family.
        # To fix this, check if the alleged 'family' is really the ticket
        suspected_tickets = re.findall(REGEX, family)
        if len(suspected_tickets) == 1: # The suspect is guilty as charged
            ticket = suspected_tickets[0]
            family = _extractTicketFamily(ticket)

        families          .append(family)
        validated_tickets .append(ticket)

    return validated_tickets, families

def _extractTicketFamily(ticket):
        #Try different middle delimeters, until success, or leave family not set
        tokens = ticket.strip().split('-')
        if len(tokens)==2:
            return tokens[0].strip()
        tokens = ticket.strip().split(':')
        if len(tokens)==2:
            return tokens[0].strip()
        tokens = ticket.strip().split(' ')
        if len(tokens)==2:
            return tokens[0].strip()
        
        # If we get this far there was no match.
        return ''   

In [15]:
# Returns a new dataframe with one row per <input row, ticket> pair. The input is supposed to be the raw
# dataframe produced by GitLogParser.parse, and it must have a 'Subject' column since tickets are extracted
# from it. The 'primary key' of the result is the triple of columns <Filename, commitId, ticket>.
#
# The result has all the columns of 'raw_df' plus two more:
#   'Ticket(s)':        a ticket extracted from the row's 'Subject', or None if none was found
#   'Ticket(s) Family': the family of that ticket, or None if no ticket was found
#
# An input row whose subject yields N > 0 tickets is repeated N times (once per ticket). An input row that
# yields no tickets is still kept, as a single row with None for both the ticket and its family.
#
# -raw_df: a dataframe with (at least) a 'Subject' column.
# -bad_families, treat_as_equal_families: these parameters are simply passed to the extractTickets function.
#            Refer to that function's documentation for information and examples on how to set them.
def build_ticket_df(raw_df, bad_families, treat_as_equal_families):

    source_columns = list(raw_df.columns)
    columns_data   = {name: [] for name in source_columns}
    tickets        = []
    families       = []

    for _, record in raw_df.iterrows():
        found_tickets, found_families = extractTickets(record['Subject'], bad_families, treat_as_equal_families)

        # Pair up each ticket with its family (they are parallel lists)
        ticket_family_pairs = []
        for position, found_ticket in enumerate(found_tickets):
            ticket_family_pairs.append((found_ticket, found_families[position]))

        # Boundary case - use a single placeholder with no ticket to avoid losing this row
        if not ticket_family_pairs:
            ticket_family_pairs = [(None, None)]

        # Emit one copy of the input row per <ticket, family> pair
        for found_ticket, found_family in ticket_family_pairs:
            for name in source_columns:
                columns_data[name].append(record[name])
            tickets  .append(found_ticket)
            families .append(found_family)

    columns_data['Ticket(s)']        = tickets
    columns_data['Ticket(s) Family'] = families

    return pd.DataFrame(columns_data)

<h2>Filter tickets to those for a particular release</h2>

In [14]:
# Returns a dictionary, where for the keys are the various Jira extract names and the values are a list of the Jira tickets
# identifiers (strings like 'IBF_17223')
#
# -root_data_dir: a string, for the root directory where data lives, as an absolute path
# -jira_extracts_folder: a string, for the folder under 'root_data_dir' where the Jira extract text files reside.
# -jira_extracts: list of the names of the Jira extract text files to be parsed.
def _parseJiraExtracts(root_data_dir, jira_extracts_folder, jira__extracts):
    result_dict = {}
    
    for extract_filename in jira__extracts:
        text_file = open(root_data_dir + '/' + jira_extracts_folder + '/' + extract_filename,'r')
        lines = text_file.readlines()
        text_file.close()
        # Should be 5 lines:
        #     lines[0] has the Jira URL for the dashboard. 
        #     lines[1] has the number of user stories. This is manually entered from the dashboard by author of extract file.
        #              This is used to reconcile that we correctly parse the extract and don't miss any Jira tickets.
        #     lines[2] has  different Jira URL that is reached from the dashboard, by selecting the hypelink in the dashboard
        #              that has the total number of closed tickets/stories
        #     lines[3] is empty, for readability so prior lines are segregated from the blob of text that follows
        #     lines[4] is the 'extract' from the web page whose URL is in lines[2]. This extract was obtained by doing
        #              'view source' on the web page and copying-and-pasting a section that is delineated by square brackes
        #              which corresponds to the list of closed Jira tickets displayed on the webpage'
        #              It is a long string that starts like
        #
        #                   '[&quot;IBF-17223&quot;,&quot;IBF-17222&quot; ....' 
        #
        #              and finishes like 
        #
        #                   '... &quot;FBBT-19990&quot;,&quot;FBBT-19970&quot;]'
        #        
        # Extract the user stories and reconcile. 
        REGEX = '[a-zA-Z]+-[0-9]+'
        jira_tickets = re.findall(REGEX, lines[4])
        expected_number_of_tickets = re.findall('[0-9]+', lines[1])
        
        # Test we parsed the expected number of ticketss
        assert(len(jira_tickets) == int(expected_number_of_tickets[0]))
        result_dict[extract_filename] = jira_tickets
    return result_dict

In [15]:
# Returns a dataframe whose primary key is the triple <commitId, ticketId, filename>, corresponding to all the
# commits and tickets for a given release. This is done by filtering the 'global_tickets_df' input to only those
# tickets that are pertinent to a release, as identified from the Jira extract text files provided as inputs: these
# files contain the tickets that apply to the release of interest.
#
# The result is a new dataframe (the input is not modified) with a fresh 0..N-1 index and an additional column
# 'Ticket Type', set to 'User Story' if the row's ticket appears in any of the story extracts, or else to 'Defect'
# if it appears in any of the bug extracts. A ticket present in both kinds of extracts is labeled 'User Story'.
#
# -global_tickets_df: a dataframe built from the function 'build_ticket_df', i.e., it has all the tickets for all the
#                    work present in the data extracted from GIT logs in the current global context of analysis.
#                    It must have a 'Ticket(s)' column.
# -root_data_dir: a string, for the root directory where data lives, as an absolute path
# -jira_extracts_folder: a string, for the folder under 'root_data_dir' where the Jira extract text files reside.
# -jira_story_extracts: list of the names of the Jira extract text files that list the release's user stories.
# -jira_bug_extracts: list of the names of the Jira extract text files that list the release's bugs.
def build_release_df(global_tickets_df, root_data_dir, jira_extracts_folder, jira_story_extracts, jira_bug_extracts):
    all_extracts = list(set(jira_story_extracts).union(set(jira_bug_extracts)))
    jira_tickets_dict = _parseJiraExtracts(root_data_dir, jira_extracts_folder, all_extracts)

    # Gather the tickets of each kind into sets, so membership checks below don't scan lists for every row
    story_tickets = set()
    for extract_key in jira_story_extracts:
        story_tickets.update(jira_tickets_dict[extract_key])
    bug_tickets = set()
    for extract_key in jira_bug_extracts:
        bug_tickets.update(jira_tickets_dict[extract_key])

    # Every parsed extract is either a story extract or a bug extract, so the release's tickets are the union
    release_tickets = story_tickets.union(bug_tickets)

    in_release = global_tickets_df['Ticket(s)'].isin(release_tickets)
    release_df = global_tickets_df[in_release].reset_index(drop=True)

    # Now figure out if each row's 'Ticket(s)' is a story or a bug, to set up the 'Ticket Type' column.
    is_story = release_df['Ticket(s)'].isin(story_tickets)
    is_bug   = release_df['Ticket(s)'].isin(bug_tickets)

    release_df['Ticket Type'] = None
    release_df.loc[is_bug, 'Ticket Type'] = 'Defect'
    # Stories are assigned last so that they take precedence over bugs when a ticket is listed as both
    release_df.loc[is_story, 'Ticket Type'] = 'User Story'
    return release_df

<h1>[DEPRECATED - SAVE THE USEFUL SOMEWHERE] Library to analyze data</h1>
These routines won't access GIT. Instead, they will rely on loading CSV files previously saved by the other library that does access GIT and parses the logs.