### data loader and parser

In [1]:
import os
import json

def find_files(name, path):
    """Collect the paths of every file called ``name`` beneath ``path``.

    The directory tree rooted at ``path`` is traversed with ``os.walk``; each
    directory that directly contains a file named exactly ``name`` contributes
    one joined path to the result.

    Args:
        name: Exact file name to look for (compared by equality, no globbing).
        path: Directory at which the walk starts.

    Returns:
        A list of matching file paths in the order ``os.walk`` visits the
        directories; empty when nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    ]

def parse_commits(commit_files):
    """Load the commit records from a collection of JSON log files.

    Each file is parsed as a single JSON document, and the items obtained by
    iterating over that document are appended, in order, to one flat list.

    Args:
        commit_files: Iterable of paths to JSON files.

    Returns:
        A list holding the items of every parsed document, in file order.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    all_commits = []
    for file_path in commit_files:
        # The context manager closes the handle as soon as the file is parsed,
        # instead of leaving it open until the garbage collector gets to it.
        with open(file_path, 'r', encoding='utf-8') as handle:
            all_commits.extend(json.load(handle))
    return all_commits

# The Main Event
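This cell computes the per-commit statistics: `extract_stats` turns one parsed commit record into a flat dictionary of counts and percentages. If you want to extend the processed statistics, this is where to do it: add the new entries to the dictionary that `extract_stats` returns.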

In [2]:
def percent(numerator, denominator):
    """Return ``numerator`` as a percentage of ``denominator``, cut to two decimals.

    Args:
        numerator: The part.
        denominator: The whole.

    Returns:
        ``0`` when either argument is zero; otherwise
        ``100 * numerator / denominator`` truncated (not rounded) to two
        decimal places, as a float.
    """
    # A zero denominator would raise ZeroDivisionError; treat an empty total
    # the same way the zero-numerator case is already treated.
    if numerator == 0 or denominator == 0:
        return 0
    # Scale by 100 * 100 so that int() drops everything past the second decimal.
    truncated = int(100 * 100 * numerator / denominator)
    return truncated / 100

def extract_stats(commit):
    """Summarise one parsed commit record as a flat dictionary of statistics.

    Args:
        commit: Mapping with a ``'commit'`` entry, a file count under
            ``'files'`` (or ``'file'`` when ``'files'`` is absent) and a
            ``'fileTypes'`` mapping from a file-type key to a dict holding
            ``'occurrences'``, ``'textLineCount'`` and ``'binByteCount'``.
            The file-type keys are compared against names such as ``'js'``
            and ``'rs'`` -- presumably file extensions; confirm against the
            producer of the log files.

    Returns:
        A dict with the commit id, file/line/byte totals, and the JavaScript
        and Rust shares computed with ``percent``.  Byte figures for text are
        estimates: every text line is weighted as ``chars_per_text_line``
        bytes.
    """
    js_types = ['js', 'jsx', 'ts', 'tsx']
    rust_types = ['rs', 'toml']
    chars_per_text_line = 30  # heuristic weight of one text line, in bytes

    # The file count lives under 'files', falling back to 'file'.
    file_count = commit['files'] if 'files' in commit else commit['file']
    per_type = commit['fileTypes']

    # Partition the per-type stats: by content kind, then by language family.
    text_stats = [stats for stats in per_type.values() if stats['textLineCount'] > 0]
    bin_stats = [stats for stats in per_type.values() if stats['binByteCount'] > 0]
    js_stats = [stats for kind, stats in per_type.items() if kind in js_types]
    rust_stats = [stats for kind, stats in per_type.items() if kind in rust_types]

    def tally(group, field):
        # Sum one numeric field across a group of per-type stats.
        return sum(stats[field] for stats in group)

    summary = {'commit': commit['commit'], 'totalFiles': int(file_count)}
    total_files = summary['totalFiles']
    summary['binFiles'] = tally(bin_stats, 'occurrences')
    summary['textFiles'] = tally(text_stats, 'occurrences')

    bin_bytes = tally(bin_stats, 'binByteCount')
    text_lines = tally(text_stats, 'textLineCount')
    total_bytes = text_lines * chars_per_text_line + bin_bytes
    summary.update({
        'binBytes': bin_bytes,
        'textLines': text_lines,
        'totalBytes': total_bytes,
        'pctBinBytes': percent(bin_bytes, total_bytes),
        'pctTextBytes': percent(text_lines * chars_per_text_line, total_bytes),
    })

    js_files = tally(js_stats, 'occurrences')
    js_lines = tally(js_stats, 'textLineCount')
    summary.update({
        'jsFiles': js_files,
        'jsLines': js_lines,
        'pctJsFiles': percent(js_files, total_files),
        'pctJsLines': percent(js_lines, text_lines),
        'pctJsBytes': percent(js_lines * chars_per_text_line, total_bytes),
    })

    rust_files = tally(rust_stats, 'occurrences')
    rust_lines = tally(rust_stats, 'textLineCount')
    summary.update({
        'rustFiles': rust_files,
        'rustLines': rust_lines,
        'pctRustFiles': percent(rust_files, total_files),
        'pctRustLines': percent(rust_lines, text_lines),
        'pctRustBytes': percent(rust_lines * chars_per_text_line, total_bytes),
    })

    return summary

### orchestrator

In [3]:
# Example Project Root Path: /home/bob/projects/Web3HackerNetwork
def get_stats_for_all_commits(project_root_path, dataset_name="2022-04-07-marine-search"):
    """Compute statistics for every commit recorded in one dataset.

    Looks under ``<project_root_path>/data/github/<dataset_name>/projects``
    for files named ``commit-stat.log.json``, parses them, and runs
    ``extract_stats`` on each commit record found.

    Args:
        project_root_path: Root directory of the project checkout, e.g.
            ``/home/bob/projects/Web3HackerNetwork``.
        dataset_name: Name of the dataset directory under ``data/github``.
            Defaults to the dataset this notebook was written against.

    Returns:
        A list with one statistics dict per commit, in the order the commits
        were parsed.
    """
    # os.path.join copes with a trailing separator on the root and keeps the
    # path construction in one place.
    stats_dir = os.path.join(project_root_path, "data", "github", dataset_name, "projects")

    commit_log_paths = find_files('commit-stat.log.json', stats_dir)
    return [extract_stats(commit) for commit in parse_commits(commit_log_paths)]

### human observer appeasement cell

In [4]:
# Run the whole pipeline, then print the first ten records as a sanity check.
all_commit_stats = get_stats_for_all_commits('/home/bob/projects/Web3HackerNetwork')
for stat in all_commit_stats[:10]:
    print(stat)

{'commit': 'fb528905fb6448e2b502f7de8fba877a2a995f6d', 'totalFiles': 2, 'binFiles': 0, 'textFiles': 2, 'binBytes': 0, 'textLines': 26, 'totalBytes': 780, 'pctBinBytes': 0, 'pctTextBytes': 100.0, 'jsFiles': 2, 'jsLines': 26, 'pctJsFiles': 100.0, 'pctJsLines': 100.0, 'pctJsBytes': 100.0, 'rustFiles': 0, 'rustLines': 0, 'pctRustFiles': 0, 'pctRustLines': 0, 'pctRustBytes': 0}
{'commit': '81e38a0da9ea79835d340a8b4ca8c47f127ddc6f', 'totalFiles': 1, 'binFiles': 0, 'textFiles': 1, 'binBytes': 0, 'textLines': 22, 'totalBytes': 660, 'pctBinBytes': 0, 'pctTextBytes': 100.0, 'jsFiles': 0, 'jsLines': 0, 'pctJsFiles': 0, 'pctJsLines': 0, 'pctJsBytes': 0, 'rustFiles': 0, 'rustLines': 0, 'pctRustFiles': 0, 'pctRustLines': 0, 'pctRustBytes': 0}
{'commit': '1eaa1e1a964f946317297b6e4ee279bf4aced3a1', 'totalFiles': 1, 'binFiles': 0, 'textFiles': 1, 'binBytes': 0, 'textLines': 2, 'totalBytes': 60, 'pctBinBytes': 0, 'pctTextBytes': 100.0, 'jsFiles': 0, 'jsLines': 0, 'pctJsFiles': 0, 'pctJsLines': 0, 'pctJs