### set the base params

In [2]:
# Base locations for this analysis; both directories are derived from `root`.
root = '/home/bob/projects/Web3HackerNetwork'
dataset_dir = '/'.join([root, 'data', 'github', '2022-04-07-marine-search'])
stats_dir = '/'.join([dataset_dir, 'projects'])

### define a method that finds all files with the same name

In [7]:
import os
def find_all(name, path):
    """Return the full path of every file called `name` found under `path`.

    The tree rooted at `path` is traversed with os.walk. Each directory that
    directly contains a file named `name` contributes one joined path, in the
    order os.walk visits the directories. An empty list is returned when
    nothing matches.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, filenames in os.walk(path)
        if name in filenames
    ]

### find all the commit stat JSON files

In [10]:
# Paths of every 'commit-stat.log.json' file found anywhere below stats_dir.
commit_files = find_all(name='commit-stat.log.json', path=stats_dir)

### load all the files

In [16]:
import json

# Parse every commit-stat file into memory. Each file is read as a single
# JSON document; the all_logs[0][0] access below indexes it as a sequence
# of commit records.
all_logs = []
for file in commit_files:
    # The context manager closes the handle as soon as the file is parsed,
    # instead of leaving one open descriptor per project file for the
    # garbage collector to reclaim.
    with open(file, 'r') as stats_file:
        in_stats = json.load(stats_file)
    all_logs.append(in_stats)

# Print the first commit of the first log as a quick look at the record schema.
print(all_logs[0][0])

{'commit': 'fb528905fb6448e2b502f7de8fba877a2a995f6d', 'Author': 'tchataigner <tom.chataigner@yahoo.fr>', 'Date': 'Sun Aug 15 20', 'fileTypes': {'js': {'textLineCount': 26, 'binByteCount': 0, 'inserts': 17, 'deletes': 9, 'occurrences': 2}}, 'files': '2', 'insertions(+)': '17', 'deletions(-)\n': '9'}


In [85]:
# Bucket commits by the languages they touch.
js_commits = []   # commits touching at least one JavaScript/TypeScript file type
rs_commits = []   # commits touching at least one Rust ('rs') file
all_commits = []  # every commit that reports at least one file type

js_types = ['js', 'jsx', 'ts', 'tsx']

for log in all_logs:
    for commit in log:
        file_types = commit['fileTypes']

        # Each commit is added to a bucket at most once. Appending inside a
        # per-file-type loop would add the same commit once for every file
        # type it contains, inflating all_commits (and js_commits whenever a
        # commit touches several JS/TS extensions).
        if file_types:
            all_commits.append(commit)
        if any(file_type in js_types for file_type in file_types):
            js_commits.append(commit)
        if 'rs' in file_types:
            rs_commits.append(commit)

### method to extract statistics from the commit data

In [100]:
js_types = ['js', 'jsx', 'ts', 'tsx']

# Weight applied to each text line so that text lines and binary bytes can be
# combined in one denominator for the pct*Bytes figures.
# NOTE(review): presumably an estimate of the average bytes per text line; confirm.
_EST_BYTES_PER_LINE = 50


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def extract_stats(commit):
    """Summarise one commit record into file/line counts and Rust/JS shares.

    Parameters
    ----------
    commit : dict
        A commit record with a 'fileTypes' mapping (file type -> dict holding
        'textLineCount', 'binByteCount' and 'occurrences') and a file count
        stored under 'files' or, failing that, 'file'.

    Returns
    -------
    dict
        Keys: totalFiles, textLines, binBytes, rustLines, jsLines, textFiles,
        binFiles, rustFiles, jsFiles, pctRustFiles, pctJsFiles, pctRustLines,
        pctJsLines, pctRustBytes, pctJsBytes. Every pct* value is a fraction;
        it is 0.0 when its denominator is zero (for example a commit that
        contains only binary files has no text lines).

    Raises
    ------
    KeyError
        If neither 'files' nor 'file' is present, or 'fileTypes' is missing.
    """
    # The file count is stored under 'files' in most records; fall back to
    # 'file' when that key is absent.
    num_files = commit['files'] if 'files' in commit else commit['file']

    per_type = commit['fileTypes']
    text_stats = [s for s in per_type.values() if s['textLineCount'] > 0]
    bin_stats = [s for s in per_type.values() if s['binByteCount'] > 0]
    js_stats = [s for ext, s in per_type.items() if ext in js_types]
    rs_stats = [s for ext, s in per_type.items() if ext == 'rs']

    def total(entries, field):
        return sum(entry[field] for entry in entries)

    statDict = {}
    statDict['totalFiles'] = int(num_files)
    statDict['textLines'] = total(text_stats, 'textLineCount')
    statDict['binBytes'] = total(bin_stats, 'binByteCount')
    statDict['rustLines'] = total(rs_stats, 'textLineCount')
    statDict['jsLines'] = total(js_stats, 'textLineCount')
    statDict['textFiles'] = total(text_stats, 'occurrences')
    statDict['binFiles'] = total(bin_stats, 'occurrences')
    statDict['rustFiles'] = total(rs_stats, 'occurrences')
    statDict['jsFiles'] = total(js_stats, 'occurrences')

    # Shares by file count and by text line count. The guarded division keeps
    # binary-only or empty commits from raising ZeroDivisionError.
    statDict['pctRustFiles'] = _safe_ratio(statDict['rustFiles'], statDict['totalFiles'])
    statDict['pctJsFiles'] = _safe_ratio(statDict['jsFiles'], statDict['totalFiles'])
    statDict['pctRustLines'] = _safe_ratio(statDict['rustLines'], statDict['textLines'])
    statDict['pctJsLines'] = _safe_ratio(statDict['jsLines'], statDict['textLines'])

    # Shares by weighted size: text lines are scaled by the per-line weight
    # and added to the raw binary byte count.
    weighted_total = statDict['textLines'] * _EST_BYTES_PER_LINE + statDict['binBytes']
    statDict['pctRustBytes'] = _safe_ratio(statDict['rustLines'] * _EST_BYTES_PER_LINE, weighted_total)
    statDict['pctJsBytes'] = _safe_ratio(statDict['jsLines'] * _EST_BYTES_PER_LINE, weighted_total)
    return statDict

In [101]:
# Spot-check: print the raw record and the derived statistics for the first
# ten Rust commits, separated by a blank line.
for commit in rs_commits[:10]:
    print(commit)
    statDict = extract_stats(commit)
    print(statDict, end='\n\n')

{'commit': '643153483bbb7476897cbb4fc17939560c278b0f', 'Author': 'tchataigner <tom.chataigner@yahoo.fr>', 'Date': 'Sat Aug 14 21', 'fileTypes': {'null': {'textLineCount': 2, 'binByteCount': 0, 'inserts': 2, 'deletes': 0, 'occurrences': 1}, 'toml': {'textLineCount': 11, 'binByteCount': 0, 'inserts': 11, 'deletes': 0, 'occurrences': 1}, 'md': {'textLineCount': 22, 'binByteCount': 0, 'inserts': 22, 'deletes': 0, 'occurrences': 1}, 'json': {'textLineCount': 5, 'binByteCount': 0, 'inserts': 5, 'deletes': 0, 'occurrences': 1}, 'wasm': {'textLineCount': 0, 'binByteCount': 82934, 'inserts': 0, 'deletes': 0, 'occurrences': 1}, 'sh': {'textLineCount': 2, 'binByteCount': 0, 'inserts': 2, 'deletes': 0, 'occurrences': 1}, 'rs': {'textLineCount': 26, 'binByteCount': 0, 'inserts': 26, 'deletes': 0, 'occurrences': 1}}, 'files': '7', 'insertions(+)\n': '68'}
{'totalFiles': 7, 'textLines': 68, 'binBytes': 82934, 'rustLines': 26, 'jsLines': 0, 'textFiles': 6, 'binFiles': 1, 'rustFiles': 1, 'jsFiles': 0, 