### set the base params

In [2]:
root = '/home/bob/projects/Web3HackerNetwork'
dataset_dir = root + "/data/github/2022-04-07-marine-search"
stats_dir = dataset_dir + "/projects"

### define a method that finds all files with the same name

In [7]:
import os
def find_all(name, path):
    result = []
    for root, dirs, files in os.walk(path):
        if name in files:
            result.append(os.path.join(root, name))
    return result

### find all the commi stat JSON files

In [10]:
commit_files = find_all('commit-stat.log.json', stats_dir)

### load all the files

In [16]:
import json

all_logs = []
for file in commit_files:
    in_stats = json.loads(open(file, 'r').read());
    all_logs.append(in_stats)
print(all_logs[0][0])

{'commit': 'fb528905fb6448e2b502f7de8fba877a2a995f6d', 'Author': 'tchataigner <tom.chataigner@yahoo.fr>', 'Date': 'Sun Aug 15 20', 'fileTypes': {'js': {'textLineCount': 26, 'binByteCount': 0, 'inserts': 17, 'deletes': 9, 'occurrences': 2}}, 'files': '2', 'insertions(+)': '17', 'deletions(-)\n': '9'}


In [85]:
js_commits = []
rs_commits = []
all_commits = []

js_types = ['js', 'jsx', 'ts', 'tsx']

for log in all_logs:
    #lines.append("\n")
    for commit in log:
        #lines.append(str(commit['commit']))
        #print(commit['fileTypes'])
        for ft in commit['fileTypes']:
            all_commits.append(commit)
            if ft in js_types:
                js_commits.append(commit)
            elif ft == 'rs':
                rs_commits.append(commit)

### method to extract statistics from the commit data

In [111]:
js_types = ['js', 'jsx', 'ts', 'tsx']

def extract_stats(commit):
    if 'files' in commit:
        num_files = commit['files']
    else:
        num_files = commit['file']
    statDict = {}
    statDict['commit'] = commit['commit']
    statDict['totalFiles'] = int(num_files)
    typeDict = commit['fileTypes']
    typeArray = [{'fileType': key, 'stats': typeDict[key]} for key in typeDict.keys()]
    textTypeArray = [typeEntry for typeEntry in typeArray if typeEntry['stats']['textLineCount'] > 0]
    binTypeArray = [typeEntry for typeEntry in typeArray if typeEntry['stats']['binByteCount'] > 0]
    jsTypeArray = [typeEntry for typeEntry in typeArray if typeEntry['fileType'] in js_types]
    rsTypeArray = [typeEntry for typeEntry in typeArray if typeEntry['fileType'] == 'rs']
    statDict['textLines'] = sum([typeEntry['stats']['textLineCount'] for typeEntry in textTypeArray])
    statDict['binBytes'] = sum([typeEntry['stats']['binByteCount'] for typeEntry in binTypeArray])
    statDict['rustLines'] = sum([typeEntry['stats']['textLineCount'] for typeEntry in rsTypeArray])
    statDict['jsLines'] = sum([typeEntry['stats']['textLineCount'] for typeEntry in jsTypeArray])
    statDict['textFiles'] = sum([typeEntry['stats']['occurrences'] for typeEntry in textTypeArray])
    statDict['binFiles'] = sum([typeEntry['stats']['occurrences'] for typeEntry in binTypeArray])
    statDict['rustFiles'] = sum([typeEntry['stats']['occurrences'] for typeEntry in rsTypeArray])
    statDict['jsFiles'] = sum([typeEntry['stats']['occurrences'] for typeEntry in jsTypeArray])
    statDict['totalBytes'] = statDict['textLines'] * 50 + statDict['binBytes']
    statDict['pctRustFiles'] = int(10000 * statDict['rustFiles'] / (.01 + statDict['totalFiles'])) / 100
    statDict['pctJsFiles'] = int(10000 * statDict['jsFiles'] / (.01 + statDict['totalFiles'])) / 100
    statDict['pctRustLines'] = int(10000 * statDict['rustLines'] / (1 + statDict['textLines'])) / 100
    statDict['pctJsLines'] = int(100000 * statDict['jsLines'] / (.01 + statDict['textLines'])) / 100
    statDict['pctRustBytes'] = int(10000 * statDict['rustLines'] * 50 / (.01 + statDict['totalBytes'])) / 100
    statDict['pctJsBytes'] = int(10000 * statDict['jsLines'] * 50 / (.01 + statDict['totalBytes'])) / 100
    statDict['pctBinBytes'] = int(10000 * statDict['binBytes'] / (.01 + statDict['totalBytes'])) / 100
    statDict['pctTextBytes'] = int(10000 * statDict['textLines'] * 50 / (.01 + statDict['totalBytes'])) / 100
    return statDict
    # fileType['textLineCount'] for fileType in commit['fileTypes']]

In [112]:
all_stats = []

for commit in rs_commits:
    statDict = extract_stats(commit)
    all_stats.append(statDict)

for stats in all_stats[slice(10)]:
    print(commit)
    print(statDict)
    print()

{'commit': '717a17b9dcd14d6368f01f314843f8839a595dfd', 'Author': 'fsy412 <fsy412@gmail.com>', 'Date': 'Thu Oct 21 09', 'fileTypes': {'gitignore': {'textLineCount': 1, 'binByteCount': 0, 'inserts': 1, 'deletes': 0, 'occurrences': 1}, 'html': {'textLineCount': 99, 'binByteCount': 0, 'inserts': 2, 'deletes': 0, 'occurrences': 2}, 'md': {'textLineCount': 53, 'binByteCount': 0, 'inserts': 2, 'deletes': 0, 'occurrences': 2}, 'png': {'textLineCount': 0, 'binByteCount': 10997, 'inserts': 0, 'deletes': 0, 'occurrences': 1}, 'lock': {'textLineCount': 11909, 'binByteCount': 0, 'inserts': 26, 'deletes': 0, 'occurrences': 3}, 'toml': {'textLineCount': 50, 'binByteCount': 0, 'inserts': 4, 'deletes': 0, 'occurrences': 4}, 'wasm': {'textLineCount': 0, 'binByteCount': 112164, 'inserts': 0, 'deletes': 0, 'occurrences': 1}, 'json': {'textLineCount': 12074, 'binByteCount': 0, 'inserts': 32, 'deletes': 0, 'occurrences': 7}, 'null': {'textLineCount': 39, 'binByteCount': 0, 'inserts': 6, 'deletes': 0, 'occur