### Load the Commit Statistics

In [1]:
# Repository root, expressed relative to the directory this notebook runs in.
project_root_path = '../../..'

import sys

# Directory added to the import search path so the project-local module
# imported further down in this cell can be resolved.
lib_path = project_root_path + "/jupyter/lib/"
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if lib_path not in sys.path:
    sys.path.append(lib_path)

import commit_stat_engineering
import json

# Snapshot of the GitHub search results this notebook analyses.
dataset_dir = f"{project_root_path}/data/github/2022-04-07-marine-search"
project_stats_dir = f"{dataset_dir}/projects"

# Arguments for find_files: a file name and the directory handed to the helper.
# NOTE(review): presumably find_files locates every file with that name under
# the directory, and get_stats_for_all_commits parses them into one record per
# commit — confirm against commit_stat_engineering.
find_args = ('commit-stat.log.json', project_stats_dir)
all_commit_logs = commit_stat_engineering.find_files(*find_args)
all_stats = commit_stat_engineering.get_stats_for_all_commits(all_commit_logs)

### Load the Manual Labels
`commit_labeling.csv` (tab-separated, despite the extension) is a manual labeling of 160 commits, made with help from the output of the clustering notebook.

In [2]:
import pandas as pd

# Location of the hand-labelled commits file.
labeling_path = f"{project_root_path}/sandbox/data/bob/commit_labeling.csv"

# The file is tab-separated even though it is named .csv; its first column
# becomes the row index of the frame.
data_df = pd.read_csv(labeling_path, index_col=0, sep='\t')
data_df

Unnamed: 0,wtf,commit_type,cluster,binFiles,textFiles,textLines,javascriptLinePct,rustLinePct,markdownLinePct,jsonLinePct,imgFiles,lockLinePct,yarnLinePct,htmlLinePct
12074a45bc9b22dfffbf433bb73dde6aad88e2be,0,minor_commit,7,0,0,0,0.00,0.0,0.00,0.00,2,0.00,0.0,0.00
e190bc9dcda78b5688b25154c1eb6e2e1febfd1c,0,minor_commit,7,0,0,0,0.00,0.0,0.00,0.00,0,0.00,0.0,0.00
e4807d4413c12c4b9a4600736ebd3642a8e8b2ce,0,minor_commit,7,1,0,0,0.00,0.0,0.00,0.00,1,0.00,0.0,0.00
5b1f6608a3ae88026b9f2bf317a80287f382e9b7,0,minor_commit,7,0,1,1,100.00,0.0,0.00,0.00,0,0.00,0.0,0.00
dabe94da0d22f41a555012e6ec4ef689a8c1c6e6,0,minor_commit,7,0,1,1,100.00,0.0,0.00,0.00,0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
768edfc7eb1bd020830f211265da9848de339b3d,0,big_json,2,0,13,10912,5.26,0.0,0.00,78.36,2,15.25,0.0,0.00
e8e033de75164a99af30c3ab997fb6b028a6ab75,0,big_json,2,0,6,13251,0.35,0.0,0.00,99.64,0,0.00,0.0,0.00
c991d0d153f6c7d3f94767359043e93cd2136ea0,0,big_json,2,0,23,16797,2.45,0.0,0.08,97.27,0,0.00,0.0,0.00
f815a4764ab1946106baea984a20fa423e79a8f3,0,big_json,2,3,16,20429,1.20,0.0,0.09,97.56,2,0.00,0.0,0.91


### Link Manual Labels to Feature Stats
Supervised learning traditionally uses X for features and y for targets. Matrices are notated in capital letters, vectors in lowercase.

X is the matrix of input features, with one row per observation and one column per feature.

y is the vector of labeled target values with one value for each observation. In some multiclass and multioutput cases, we would use a matrix Y instead.

In [3]:
import numpy as np

# Names of the per-commit values used as model inputs. The order of this list
# fixes the column order of every feature vector built below.
features = [
    'binFiles', 'textFiles', 'textLines', 'javascriptLines', 'rustLines',
    'lockLines', 'htmlLines', 'jsonLines', 'markdownLines', 'imgFiles', 'yarnLines',
]

def make_feature_vector(commit_stats):
    """Return the log1p-scaled values of `features` for one commit.

    commit_stats: a mapping subscriptable by every name in `features`.
    Returns a list with one np.log1p(value) per feature, in `features` order.
    A name missing from commit_stats raises whatever the lookup raises
    (KeyError for a plain dict).
    """
    scaled = []
    for name in features:
        scaled.append(np.log1p(commit_stats[name]))
    return scaled

# Look-up table from commit id to that commit's stats record; if an id occurs
# more than once, the last record wins.
all_commit_stats = {stats['commit']: stats for stats in all_stats}

# Feature matrix for every commit: one row per record, in all_stats order.
X = [make_feature_vector(stats) for stats in all_stats]

# Training set: one feature vector and one label per manually labelled commit.
X_train = []
y_train = []
# Read the label column by name. Positional `row[1]` on a string-labelled
# Series is deprecated in recent pandas and would silently pick a different
# column if the spreadsheet columns were ever reordered.
for commit_id, target_label in data_df['commit_type'].items():
    # A labelled commit id that is absent from the loaded stats raises KeyError here.
    X_train.append(make_feature_vector(all_commit_stats[commit_id]))
    y_train.append(target_label)

# Number of commits in the full (unlabelled) feature matrix.
print(len(X))

2629


### Reliable Workhorse Classifier
If neural networks are the awesome hotness of ML algorithms, Linear Regression and its classifier sibling, Logistic Regression (sometimes abbreviated as Logit), are the reliable workhorses. Even in data science projects that end with a more advanced classifier, Logit is frequently used to give a baseline score during early analysis.

In cases like the one we have here, with independent features that are each mostly correlated with one or two classes, LogisticRegression gets a good answer with very little compute.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fixed seed passed to the classifier.
random_state = 0

# Fit on the manually labelled commits only (fit returns the estimator itself).
clf = LogisticRegression(max_iter=10000, random_state=random_state).fit(X_train, y_train)

# Predict a class for every commit in X, including the ones used for training.
y_pred = clf.predict(X)
print(len(y_pred))

2629


### Compile Hacker Histories

In [5]:
# Map each author to the list of their commit records, in all_stats order.
hacker_histories = {}
for commit_stats, predicted_class in zip(all_stats, y_pred):
    # The prediction is stored on the shared record itself, so all_stats and
    # all_commit_stats see the new 'predicted_class' key as well.
    commit_stats['predicted_class'] = predicted_class
    hacker_histories.setdefault(commit_stats['author'], []).append(commit_stats)

### Show Hacker Commit Types

In [12]:
import matplotlib.pyplot as plt

# For each author, build rows of {num_commits, commit_type} ordered from the
# most to the least frequent predicted commit type.
hacker_histograms = []
for hacker, commits in hacker_histories.items():
    # Single-pass tally of predicted types for this author.
    type_counts = {}
    for commit in commits:
        commit_type = commit['predicted_class']
        type_counts[commit_type] = type_counts.get(commit_type, 0) + 1
    # Highest count first; ties are broken alphabetically so the result does
    # not depend on set/hash ordering and the exported JSON is reproducible.
    ranked = sorted(type_counts.items(), key=lambda item: (-item[1], item[0]))
    commit_type_rows = [
        dict(num_commits=count, commit_type=commit_type)
        for commit_type, count in ranked
    ]
    hacker_histograms.append(dict(hacker=hacker, commit_stats=commit_type_rows))

# Case-insensitive ordering by author string.
sorted_histograms = sorted(hacker_histograms, key=lambda d: d['hacker'].lower())

# Preview only the first author to keep the cell output short.
for histo in sorted_histograms[:1]:
    print(histo['hacker'])
    for row in histo['commit_stats']:
        print(str(row['num_commits']) + "\t" + row['commit_type'])
    print()

Aleksandar Hadzibabic <48260177+hadzija7@users.noreply.github.com>
1	minor_commit
1	documentation



In [13]:
import json

out_path = project_root_path + '/sandbox/data/bob/hacker_history.json'

# Serialise before opening the file, so a serialisation error cannot leave a
# truncated or empty output file behind.
histograms_json = json.dumps(sorted_histograms, indent=2)

# The context manager flushes and closes the handle even if the write fails.
with open(out_path, "w") as out_handle:
    out_handle.write(histograms_json)