### Load the Commit Statistics
For this clustering we examined 39 GitHub projects containing a total of 2,629 commits. The projects all included the Fluence Marine library. Code for Fluence is generally written in React and Rust.

In [2]:
import json
import sys

# All locations below are expressed relative to the repository root.
project_root_path = '../../..'
bob_lib_path = project_root_path + "/sandbox/python/bob/"
dataset_dir = project_root_path + '/data/github/2022-04-07-marine-search'
project_stats_dir = dataset_dir + '/projects'

# The helper module is imported from bob_lib_path, so that folder has to be
# on sys.path before the import statement below runs.
sys.path.append(bob_lib_path)

import commit_stat_engineering

# Collect the 'commit-stat.log.json' files under the projects directory, then
# turn them into the per-commit stats used by the rest of the notebook
# (all_stats is indexed like a list of dicts by the later cells).
all_commit_logs = commit_stat_engineering.find_files('commit-stat.log.json', project_stats_dir)
all_stats = commit_stat_engineering.get_stats_for_all_commits(all_commit_logs)

### Available Fields

In [6]:
# Show which statistic names are present on the first commit's record.
first_commit_stats = all_stats[0]
print(first_commit_stats.keys())

dict_keys(['commit', 'totalFiles', 'binFiles', 'textFiles', 'textLines', 'javascriptFiles', 'javascriptLines', 'javascriptFilePct', 'javascriptLinePct', 'rustFiles', 'rustLines', 'rustFilePct', 'rustLinePct', 'markdownFiles', 'markdownLines', 'markdownFilePct', 'markdownLinePct', 'jsonFiles', 'jsonLines', 'jsonFilePct', 'jsonLinePct', 'imgFiles', 'imgFilePct', 'lockFiles', 'lockLines', 'lockFilePct', 'lockLinePct', 'yarnFiles', 'yarnLines', 'yarnFilePct', 'yarnLinePct', 'htmlFiles', 'htmlLines', 'htmlFilePct', 'htmlLinePct', 'clojureFiles', 'clojureLines', 'clojureFilePct', 'clojureLinePct', 'shellFiles', 'shellLines', 'shellFilePct', 'shellLinePct', 'gitignoreFiles', 'gitignoreLines', 'gitignoreFilePct', 'gitignoreLinePct', 'noextbinFiles', 'noextbinFilePct', 'noexttextFiles', 'noexttextLines', 'noexttextFilePct', 'noexttextLinePct'])


### Predict Cluster Membership
X is standard notation for the feature matrix. Here you can see the input data that is provided to K-Means.

In [11]:
import numpy as np
from sklearn.cluster import KMeans

num_clusters = 7

# One row per commit, one column per feature. Each count goes through log1p
# (log(1 + x)), which keeps zero counts at zero while compressing large ones.
# NOTE(review): only imgFiles is wrapped in abs(); confirm whether the other
# counts can ever be negative, since log1p of a value below -1 yields NaN.
X = [[np.log1p(stat['javascriptLines']),
      np.log1p(stat['rustLines']),
      np.log1p(stat['jsonLines']),
      np.log1p(np.abs(stat['imgFiles'])),
      np.log1p(stat['lockLines']),
      np.log1p(stat['htmlLines'])] for stat in all_stats]

random_state = 0

# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto' (a single initialisation for k-means++), which can change the cluster
# assignments, and therefore the cluster numbering used by the cells below,
# even with a fixed random_state. 10 matches the historical default.
y_pred = KMeans(n_clusters=num_clusters,
                n_init=10,
                random_state=random_state).fit_predict(X)

### Reshape the Stats for Easier Handling

In [12]:
# Columns added to every commit with an empty-string value; presumably they
# are filled in by hand once the labeling sheet has been exported.
label_columns = ['wtf','is_bulk', 'is_standard', 'is_minor', 'is_artifact',
                 'is_interface', 'is_javascript', 'is_rust']

# Record the predicted cluster on each commit and reset its label columns.
for commit_stats, cluster_id in zip(all_stats, y_pred):
    commit_stats['cluster'] = cluster_id
    commit_stats.update(dict.fromkeys(label_columns, ''))

# clusters[k] holds the stats dicts of every commit assigned to cluster k.
clusters = [[stats for stats in all_stats if stats['cluster'] == cluster_id]
            for cluster_id in range(num_clusters)]


In [13]:
import numpy as np
import pandas as pd

# Number of commits drawn from each cluster for the labeling sheet.
sample_size = 20

# Seeded generator: without a fixed seed every run of this cell overwrites
# commit_labeling.tsv with a different random selection of commits.
sample_rng = np.random.default_rng(0)

to_be_labeled = []
fields = label_columns + ['cluster','binFiles', 'textFiles', 'textLines',
          'javascriptLinePct', 'rustLinePct', 'markdownLinePct', 'jsonLinePct',
          'imgFiles', 'lockLinePct', 'yarnLinePct', 'htmlLinePct']
commit_ids = []
for cluster in clusters:
    # Sampling without replacement raises ValueError when more items are
    # requested than exist, so cap the draw at the cluster size.
    draw_count = min(sample_size, len(cluster))
    # Draw positions rather than the dicts themselves, so the commit records
    # never have to be converted into a NumPy array.
    positions = sample_rng.choice(len(cluster), size=draw_count, replace=False)
    for position in positions:
        commit = cluster[position]
        to_be_labeled.append([commit[field] for field in fields])
        commit_ids.append(commit['commit'])

# One row per sampled commit, indexed by commit hash; saved as a tab-separated
# file and displayed as the cell output.
cluster_df = pd.DataFrame(to_be_labeled, index=commit_ids, columns=fields)
cluster_df.to_csv(project_root_path + '/sandbox/data/bob/commit_labeling.tsv', sep="\t")
cluster_df

Unnamed: 0,wtf,is_bulk,is_standard,is_minor,is_artifact,is_interface,is_javascript,is_rust,cluster,binFiles,textFiles,textLines,javascriptLinePct,rustLinePct,markdownLinePct,jsonLinePct,imgFiles,lockLinePct,yarnLinePct,htmlLinePct
e3b155c53bbddf8e02188306a66732785fbdbfcb,,,,,,,,,0,0,1,5,0.00,0.0,100.00,0.00,0,0.0,0.00,0.0
8a4a4d941b83711b509f57de1f51665e0cfdec06,,,,,,,,,0,0,2,6,0.00,0.0,66.66,0.00,0,0.0,33.33,0.0
c07f19718e8ba11e6079ac5347a7c36b24c6f311,,,,,,,,,0,0,1,5,0.00,0.0,0.00,100.00,0,0.0,0.00,0.0
350b4d3352aaec6e2ad9004cbc5a95403c2dfecd,,,,,,,,,0,0,1,12,0.00,0.0,0.00,0.00,0,0.0,0.00,0.0
19e1e198b4db8d5ac98926c1b80e56163ae6b439,,,,,,,,,0,0,1,2,100.00,0.0,0.00,0.00,0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ed972d32d8e5b68a3e230080cfc6342ef003420c,,,,,,,,,6,0,14,5495,2.76,0.0,0.00,88.98,0,0.0,0.00,0.0
91021d8b4099343049053dc3f0411ed5f59a0167,,,,,,,,,6,0,9,5928,1.46,0.0,0.00,98.06,0,0.0,0.00,0.0
90a25bedfeb738a9c16ea6508be733c91758ecd1,,,,,,,,,6,0,23,6760,32.30,0.0,12.64,46.90,0,0.0,0.00,0.0
6c291073644710476bd212855558ca9f3aae4a5c,,,,,,,,,6,0,7,10803,0.00,0.0,0.00,99.99,0,0.0,0.00,0.0


### Show Median Statistics for Each Cluster
The table generated here shows the median value of each field for each cluster to facilitate interpretation of the data. The output fields shown were not included directly as input features, though they are dependent variables.

In [7]:
import numpy as np
import pandas as pd

# Labels '#0', '#1', ... used as the index of the summary frame, one per cluster.
cluster_names = ['#' + str(cluster_index) for cluster_index in range(len(clusters))]
fields = ['numCommits', 'binFiles', 'textFiles', 'binBytes', 'textLines',
          'javascriptBytePct', 'rustBytePct', 'markdownBytePct', 'jsonBytePct',
          'imgBytePct', 'lockBytePct', 'yarnBytePct', 'htmlBytePct']
# NOTE(review): the key listing printed earlier in this notebook shows
# *Lines / *FilePct / *LinePct keys but no 'binBytes' or '*BytePct' keys.
# Confirm these fields exist in the loaded stats; if not, the lookups below
# raise KeyError.

# Store the cluster size on each member commit so that 'numCommits' can be
# aggregated like any other field (the median of a constant is that constant).
for cluster in clusters:
    numCommits = len(cluster)
    for stats in cluster:
        stats.update(numCommits=numCommits)

# One row per cluster: the median of each field, rounded to an integer.
data = []
for cluster in clusters:
    row = []
    for field in fields:
        field_values = [stats[field] for stats in cluster]
        row.append(int(np.round(np.median(field_values))))
    data.append(row)

cluster_df = pd.DataFrame(data, index=cluster_names, columns=fields)
# Transposed for display: fields become rows and clusters become columns.
cluster_df.T

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6
numCommits,36,1088,130,59,652,366,298
binFiles,4,0,0,3,0,0,0
textFiles,32,1,8,1,4,10,2
binBytes,87230,0,0,154400,0,0,0
textLines,28642,6,5142,12,56,312,82
javascriptBytePct,1,0,1,0,0,0,100
rustBytePct,0,0,0,0,100,31,0
markdownBytePct,0,0,0,0,0,0,0
jsonBytePct,63,0,95,0,0,0,0
imgBytePct,2,0,0,100,0,0,0
