### Load the Commit Statistics
For this clustering we examined 39 GitHub projects containing a total of 2,629 commits. The projects all included the Fluence Marine library. Code for Fluence is generally written in React and Rust.

In [1]:
# Repository root, expressed relative to the directory this notebook runs from.
project_root_path = '../../..'

import sys
lib_path = project_root_path + "/jupyter/lib/"
# Only extend sys.path once: without this guard every re-run of the cell
# appends another duplicate entry for the same directory.
if lib_path not in sys.path:
    sys.path.append(lib_path)

# Project-local helper module, importable only after lib_path is on sys.path.
import commit_stat_engineering
import json

# Dataset snapshot to analyse and the sub-directory holding per-project stats.
dataset_dir = project_root_path + '/data/github/2022-04-07-marine-search'
project_stats_dir = dataset_dir + '/projects'
# Presumably collects every 'commit-stat.log.json' found under
# project_stats_dir -- confirm against commit_stat_engineering.find_files.
all_commit_logs = commit_stat_engineering.find_files('commit-stat.log.json', project_stats_dir)
# all_stats is an indexable collection of per-commit dicts; later cells read
# and add keys on these dicts in place.
all_stats = commit_stat_engineering.get_stats_for_all_commits(all_commit_logs)

### Available Fields

In [2]:
# List the statistic names recorded for the first commit; presumably every
# commit carries the same set of keys -- verify if that matters downstream.
first_commit_stats = all_stats[0]
print(first_commit_stats.keys())

dict_keys(['commit', 'totalFiles', 'binFiles', 'textFiles', 'textLines', 'javascriptFiles', 'javascriptLines', 'javascriptFilePct', 'javascriptLinePct', 'rustFiles', 'rustLines', 'rustFilePct', 'rustLinePct', 'markdownFiles', 'markdownLines', 'markdownFilePct', 'markdownLinePct', 'jsonFiles', 'jsonLines', 'jsonFilePct', 'jsonLinePct', 'imgFiles', 'imgFilePct', 'lockFiles', 'lockLines', 'lockFilePct', 'lockLinePct', 'yarnFiles', 'yarnLines', 'yarnFilePct', 'yarnLinePct', 'htmlFiles', 'htmlLines', 'htmlFilePct', 'htmlLinePct', 'clojureFiles', 'clojureLines', 'clojureFilePct', 'clojureLinePct', 'shellFiles', 'shellLines', 'shellFilePct', 'shellLinePct', 'gitignoreFiles', 'gitignoreLines', 'gitignoreFilePct', 'gitignoreLinePct', 'noextbinFiles', 'noextbinFilePct', 'noexttextFiles', 'noexttextLines', 'noexttextFilePct', 'noexttextLinePct'])


### Predict Cluster Membership
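`X` is the standard notation for the feature matrix. Each row is one commit, and each column is the `log1p`-transformed count of lines (or, for images, files) of one file type. This matrix is the input data that is provided to K-Means.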

In [3]:
import numpy as np
from sklearn.cluster import KMeans

# Feature matrix: one row per commit, one column per file type. Every count is
# passed through log1p (log(1 + x)), so zero counts map to 0.
# NOTE(review): only imgFiles is wrapped in np.abs -- presumably that count can
# be negative in the source data; confirm against the stats generator.
X = [[np.log1p(stat['javascriptLines']),
      np.log1p(stat['rustLines']),
      np.log1p(stat['markdownLines']),
      np.log1p(stat['jsonLines']),
      np.log1p(np.abs(stat['imgFiles'])),
      np.log1p(stat['lockLines']),
      np.log1p(stat['yarnLines']),
      np.log1p(stat['clojureLines']),
      np.log1p(stat['htmlLines']),] for stat in all_stats]

num_clusters = 8
# Seed for K-Means. NOTE: this name shadows the stdlib `random` module if that
# module is ever imported in this notebook; kept for backward compatibility.
random = 0
# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto' (a single run for the default k-means++ init), which would silently
# change the clustering -- and therefore the meaning of each cluster id used
# for labeling below -- depending on the installed library version.
y_pred = KMeans(n_clusters=num_clusters, random_state=random, n_init=10).fit_predict(X)

### Reshape the Stats for Easier Handling

In [4]:
# Columns left blank here so they can be filled in during manual labeling.
label_columns = ['wtf','rust_development', 'interface_development', 'big_json', 'rust_packaging',
                 'documentation', 'javascript_development', 'bulk_commit', 'minor_commit']

# Record each commit's predicted cluster id on its stats dict and initialise
# every label column to the empty string (mutates all_stats in place).
for commit_stats, cluster_id in zip(all_stats, y_pred):
    commit_stats['cluster'] = cluster_id
    commit_stats.update(dict.fromkeys(label_columns, ''))

# clusters[k] holds the stats dicts assigned to cluster k, in all_stats order.
clusters = [[commit_stats for commit_stats in all_stats if commit_stats['cluster'] == cluster_id]
            for cluster_id in range(num_clusters)]


### Sample by Cluster ID

In [5]:
import numpy as np
import pandas as pd

# Maximum number of commits drawn from each cluster for manual labeling.
samples_per_cluster = 20
# Seeded generator so the same commits are selected on every run; an unseeded
# draw would hand a different sample to the labeling TSV each time the
# notebook is executed.
sample_rng = np.random.default_rng(0)

to_be_labeled = []
# Columns of the labeling table: the blank label columns first, then the
# cluster id and the statistics shown to the person doing the labeling.
fields = label_columns + ['cluster','binFiles', 'textFiles', 'textLines',
          'javascriptLinePct', 'rustLinePct', 'markdownLinePct', 'jsonLinePct',
          'imgFiles', 'lockLinePct', 'yarnLinePct', 'htmlLinePct']
commit_ids = []
for cluster in clusters:
    # Sampling without replacement cannot draw more items than the cluster
    # holds, so cap the request instead of raising on small clusters.
    sample_size = min(samples_per_cluster, len(cluster))
    if sample_size == 0:
        continue
    # Draw positions rather than the dicts themselves so NumPy never has to
    # coerce the list of dicts into an object array.
    sampled_positions = sample_rng.choice(len(cluster), size=sample_size, replace=False)
    for position in sampled_positions:
        commit = cluster[position]
        to_be_labeled.append([commit[field] for field in fields])
        commit_ids.append(commit['commit'])

### Write a TSV for Manual Labeling

In [6]:
# Build the labeling table (one row per sampled commit, indexed by commit id),
# write it out tab-separated, and display it as the cell's result.
labeling_tsv_path = project_root_path + '/sandbox/data/bob/commit_labeling.tsv'
cluster_df = pd.DataFrame(data=to_be_labeled, columns=fields, index=commit_ids)
cluster_df.to_csv(labeling_tsv_path, sep="\t")
cluster_df

Unnamed: 0,wtf,rust_development,interface_development,big_json,rust_packaging,documentation,javascript_development,bulk_commit,minor_commit,cluster,...,textFiles,textLines,javascriptLinePct,rustLinePct,markdownLinePct,jsonLinePct,imgFiles,lockLinePct,yarnLinePct,htmlLinePct
b1b875db537dfb97185899c465eeb20dfa1117a3,,,,,,,,,,0,...,4,43,0.00,97.67,0.0,0.0,0,0.00,2.32,0.0
07fe0bb39ea0a019bb73e6b78cc94d8d2539ac16,,,,,,,,,,0,...,14,270,0.00,92.22,0.0,0.0,0,2.59,0.00,0.0
305ca68c102b137063af297d042bdd3b19592111,,,,,,,,,,0,...,4,107,0.00,100.00,0.0,0.0,0,0.00,0.00,0.0
6ad2870fafbea1a580320c0835ac122ccaa7736c,,,,,,,,,,0,...,1,84,0.00,100.00,0.0,0.0,0,0.00,0.00,0.0
541a19ff296ae99fd20adabd980370bdf358a2de,,,,,,,,,,0,...,3,21,0.00,100.00,0.0,0.0,0,0.00,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ee7572c36ad46d3b6881ec70a64388108b7b8d57,,,,,,,,,,7,...,1,4,0.00,0.00,0.0,0.0,0,0.00,0.00,0.0
c3e149f8a2a6a8f98fe9da5ac85d252480a4b674,,,,,,,,,,7,...,3,20,0.00,30.00,0.0,0.0,0,70.00,0.00,0.0
e304524f46e164b7ab0535ff60736b0956c400ca,,,,,,,,,,7,...,3,9,55.55,44.44,0.0,0.0,0,0.00,0.00,0.0
d98014bf8e9ef92b28b885c5e08846e62e96233d,,,,,,,,,,7,...,1,2,0.00,100.00,0.0,0.0,0,0.00,0.00,0.0


### Show Median Statistics for Each Cluster
The table generated here has median values for each cluster to facilitate interpretation of the data. The output fields shown were not included directly as input features, though they are dependent variables.

In [7]:
import numpy as np
import pandas as pd

# Row labels for the summary table: '#0', '#1', ... one per cluster.
cluster_names = [f'#{cluster_id}' for cluster_id in range(len(clusters))]
fields = ['numCommits', 'binFiles', 'textFiles', 'textLines',
          'javascriptLinePct', 'rustLinePct', 'markdownLinePct', 'jsonLinePct',
          'imgFilePct', 'lockLinePct', 'yarnLinePct', 'htmlLinePct']

# Stamp every commit with the size of its cluster so that 'numCommits' can be
# aggregated like any other field (the median of a constant is that constant).
for cluster in clusters:
    cluster_size = len(cluster)
    for stats in cluster:
        stats['numCommits'] = cluster_size

# One row per cluster: the median of each field, rounded to the nearest integer.
data = []
for cluster in clusters:
    medians = []
    for field in fields:
        values = [stats[field] for stats in cluster]
        medians.append(int(np.round(np.median(values))))
    data.append(medians)

cluster_df = pd.DataFrame(data, index=cluster_names, columns=fields)
# Transposed for display: fields as rows, clusters as columns.
cluster_df.T

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7
numCommits,601,101,131,364,258,282,40,852
binFiles,0,0,0,0,0,0,3,0
textFiles,4,1,8,10,1,2,48,1
textLines,62,41,4920,312,26,88,30194,4
javascriptLinePct,0,0,2,0,0,100,2,0
rustLinePct,100,0,0,38,0,0,0,0
markdownLinePct,0,0,0,0,100,0,0,0
jsonLinePct,0,0,97,0,0,0,80,0
imgFilePct,0,0,0,0,0,0,4,0
lockLinePct,0,0,0,54,0,0,4,0
