### Load the Commit Statistics

In [1]:
import json
import sys

# All locations are expressed relative to this notebook's directory.
project_root_path = '../../..'
lib_path = f'{project_root_path}/python/lib/'

# The helper modules are imported from lib_path, so it must be on sys.path
# before the two imports below run.
sys.path.append(lib_path)
import commit_stats
import feature_map

dataset_dir = f'{project_root_path}/data/github/2022-04-07-marine-search'
project_stats_dir = f'{dataset_dir}/projects'

# Load the per-project commit statistics, then derive one feature record per commit.
all_commit_stats = commit_stats.load_commit_stats('commit-stat.log.json', project_stats_dir)
all_commit_features = feature_map.get_commit_features(all_commit_stats)
# all_commit_features[:2]  # uncomment to preview the first two feature records

### Load the Manual Labels
`commit_labeling.csv` is a manual labeling of 160 commits, done with assistance from the output of the clustering notebook.

In [2]:
import pandas as pd

# Tab-separated spreadsheet of hand-assigned labels; its first column is used as the row index.
labeling_path = f'{project_root_path}/sandbox/data/bob/commit_labeling.csv'
data_df = pd.read_csv(labeling_path, index_col=0, sep='\t')
data_df

Unnamed: 0,label,clstr,binF,txtF,txt,js,rust,md,json,img,lock,html
ab05ef8f08f34b3b39585c0367afb83b9d658f9c,bulk,1,82,39069,5658518,66.62,0.00,6.43,16.36,25,0.44,0.30
48fb8bb782cd493cb4b41e1e21488ba8acd3b507,bulk,1,0,38948,5636363,66.74,0.00,6.45,16.42,15,0.21,0.30
5629c4b1eae0d9db3dd25e5ada2d91393005a3d0,bulk,1,62,789,402917,2.46,2.09,1.45,91.66,48,0.00,0.84
fdb28fe4084b7f182c3da9a8966d6142dd96dfe1,bulk,1,31,411,220745,2.38,1.92,1.27,92.19,23,0.00,0.87
0730644dc859d5395f1ab8f614bbc83903a5d6e0,bulk,1,28,389,201204,2.46,2.07,1.40,91.76,21,0.00,0.85
...,...,...,...,...,...,...,...,...,...,...,...,...
29f15f2b4e404816ea805b610c7ad3432127fdad,rust_pack,3,0,5,75,0.00,64.00,0.00,0.00,0,29.33,0.00
01cdb26c2cb0207abb915703b203fca8fae114dd,rust_pack,3,0,2,56,0.00,3.57,0.00,0.00,0,96.42,0.00
8eb4a1da7c024d1fd9261b4da1a68f0c40d4e375,rust_pack,3,0,7,52,0.00,42.30,0.00,0.00,0,42.30,0.00
6df4e470d37bdbfeb128e55815e73f7ab34fbaeb,rust_pack,3,0,4,51,0.00,7.84,21.56,0.00,0,70.58,0.00


### Link Manual Labels to Feature Stats
Supervised learning traditionally uses X for features and y for targets. Matrices are notated in capital letters, vectors in lowercase.

X is the matrix of input features, with one row per observation and one column per feature.

y is the vector of labeled target values with one value for each observation. In some multiclass and multioutput cases, we would use a matrix Y instead.

In [3]:
import numpy as np

# Names of the per-commit statistics used as model inputs, in column order.
selected_features = [
    'binFiles', 'textFiles', 'textLines', 'javascriptLines', 'rustLines',
    'lockLines', 'htmlLines', 'jsonLines', 'markdownLines', 'imgFiles',
]

def make_feature_vector(features):
    """Return the log1p-transformed values of `selected_features` for one commit.

    `features` is indexed by feature name. The result is a list with one entry
    per name in `selected_features`, in that order; keys of `features` that are
    not listed there are ignored.
    """
    vector = []
    for feature_name in selected_features:
        vector.append(np.log1p(features[feature_name]))
    return vector

# Feature matrix for every commit in the dataset, plus a commit-id lookup so
# that the manually labelled commits can be joined to their statistics.
X = []
commit_feature_dict = {}
for features in all_commit_features:
    commit_feature_dict[features['commit']] = features
    X.append(make_feature_vector(features))

# Training set: one (feature vector, label) pair per manually labelled commit.
# The label is the first data column of the spreadsheet. It is read by position
# through .iloc, because passing an integer key to Series[...] on a
# string-labelled row is deprecated as a positional lookup in pandas.
X_train = []
y_train = []
for commit_id, target_label in data_df.iloc[:, 0].items():
    # A KeyError here means a labelled commit id is absent from the loaded stats.
    X_train.append(make_feature_vector(commit_feature_dict[commit_id]))
    y_train.append(target_label)

print(len(X))

2629


### Reliable Workhorse Classifier
If neural networks are the awesome hotness of ML algorithms, Linear Regression and its classifier sibling, Logistic Regression (sometimes abbreviated as Logit), are the reliable workhorses. Even in data science projects that end with a more advanced classifier, Logit is frequently used to give a baseline score during early analysis.

In cases like the one we have here, with independent features that are each mostly correlated with one or two classes, LogisticRegression gets a good answer with very little compute.

In [4]:
from sklearn.linear_model import LogisticRegression

# Seed handed to the classifier.
random_state = 0

# Fit on the hand-labelled commits (fit() returns the estimator itself),
# then classify every commit in the dataset.
clf = LogisticRegression(max_iter=10000, random_state=random_state).fit(X_train, y_train)
y_pred = clf.predict(X)

print(len(y_pred))
print(y_pred[:10])

2629
['javascript' 'documentation' 'minor' 'documentation' 'minor' 'minor'
 'json' 'javascript' 'javascript' 'javascript']


### Compile Hacker Histories

In [5]:
# Group the commit feature records by author. Each record is tagged in place
# with the class predicted for it; y_pred was computed from X, which holds one
# row per entry of all_commit_features, so the two sequences line up.
hacker_histories = {}
for features, pred in zip(all_commit_features, y_pred):
    author = features['author']
    features['predicted_class'] = pred
    hacker_histories.setdefault(author, []).append(features)

### Show Hacker Commit Types

In [6]:
import matplotlib.pyplot as plt

from collections import Counter

# For each author, count commits per predicted class in a single pass.
hacker_histograms = []
for hacker, commits in hacker_histories.items():
    type_counts = Counter(commit['predicted_class'] for commit in commits)
    # Most frequent class first. Ties are broken by class name so the ordering
    # does not depend on hash/set iteration order and is therefore identical
    # on every kernel restart (and in the exported JSON).
    ranked = sorted(type_counts.items(), key=lambda item: (-item[1], item[0]))
    rows = [dict(num_commits=count, commit_type=commit_type)
            for commit_type, count in ranked]
    hacker_histograms.append(dict(hacker=hacker, commit_stats=rows))

# Authors in case-insensitive alphabetical order.
sorted_histograms = sorted(hacker_histograms, key=lambda d: d['hacker'].lower())

# Preview only the first author; the complete list stays in sorted_histograms.
for histo in sorted_histograms[:1]:
    print(histo['hacker'])
    for row in histo['commit_stats']:
        print(str(row['num_commits']) + "\t" + str(row['commit_type']))
    print()

Aleksandar Hadzibabic <48260177+hadzija7@users.noreply.github.com>
1	documentation
1	minor



In [7]:
import json

out_path = project_root_path + '/sandbox/data/bob/hacker_history.json'

# Serialize before opening the file so that a serialization error cannot leave
# a truncated output file behind.
serialized_histograms = json.dumps(sorted_histograms, indent=2)

# The context manager flushes and closes the handle even if the write raises.
with open(out_path, "w") as out_handle:
    out_handle.write(serialized_histograms)