In [1]:
import os

# Root directory of the local Web3HackerNetwork checkout. Every other path in
# this notebook is built by string concatenation on top of this value, so it
# must stay a plain str without a trailing slash.
# Set the WEB3HACKERNETWORK_ROOT environment variable to run the notebook from
# a different checkout; when it is unset the original location is used.
project_root_path = os.environ.get('WEB3HACKERNETWORK_ROOT',
                                   '/home/bob/projects/Web3HackerNetwork')

### Load the Commit Statistics
For this clustering we examined 39 GitHub projects containing a total of 2,629 commits. The projects all included the Fluence Marine library. Code for Fluence is generally written in React and Rust.

In [2]:
import sys

# Make the project-local helper module importable.
bob_lib_path = project_root_path + "/sandbox/python/bob/"
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if bob_lib_path not in sys.path:
    sys.path.append(bob_lib_path)

import commit_stat_engineering
import json

# Location of the dataset and of the per-project statistics inside it.
dataset_dir = project_root_path + '/data/github/2022-04-07-marine-search'
project_stats_dir = dataset_dir + '/projects'

# NOTE(review): presumably find_files collects every 'commit-stat.log.json'
# under project_stats_dir and get_stats_for_all_commits returns one dict of
# statistics per commit -- confirm against commit_stat_engineering.
all_commit_logs = commit_stat_engineering.find_files('commit-stat.log.json', project_stats_dir)
all_stats = commit_stat_engineering.get_stats_for_all_commits(all_commit_logs)

### Data Dipper
This cell exists only to show what a set of commit stats looks like. It doesn't contribute anything to the clustering.

In [3]:
# A commit counts as "complex" here when every one of these counters is
# positive, i.e. it touches Rust, Markdown, JavaScript, JSON and binary content.
_complex_keys = ('rustFiles', 'markdownFiles', 'javascriptFiles',
                 'jsonFiles', 'binBytes')
complex_stats = [
    stat
    for stat in all_stats
    if all(stat[key] > 0 for key in _complex_keys)
]

# Pretty-print the first match as an example of what a stats record holds.
first_complex = complex_stats[0]
print(json.dumps(first_complex, indent=2))

{
  "commit": "5629c4b1eae0d9db3dd25e5ada2d91393005a3d0",
  "totalFiles": 851,
  "binFiles": 62,
  "textFiles": 789,
  "binBytes": 7329844,
  "textLines": 402917,
  "totalBytes": 19417354,
  "pctBinBytes": 37.74,
  "pctTextBytes": 62.25,
  "javascriptFiles": 114,
  "javascriptLines": 9928,
  "javascriptFilePct": 13.39,
  "javascriptLinePct": 2.46,
  "javascriptBytePct": 1.53,
  "rustFiles": 209,
  "rustLines": 8440,
  "rustFilePct": 24.55,
  "rustLinePct": 2.09,
  "rustBytePct": 1.3,
  "markdownFiles": 36,
  "markdownLines": 5858,
  "markdownFilePct": 4.23,
  "markdownLinePct": 1.45,
  "markdownBytePct": 0.9,
  "jsonFiles": 141,
  "jsonLines": 369338,
  "jsonFilePct": 16.56,
  "jsonLinePct": 91.66,
  "jsonBytePct": 57.06,
  "imgFiles": 48,
  "imgBytes": 7160304,
  "imgFilePct": 5.64,
  "imgBytePct": 36.87,
  "lockFiles": 0,
  "lockLines": 0,
  "lockFilePct": 0,
  "lockLinePct": 0,
  "lockBytePct": 0,
  "yarnFiles": 3,
  "yarnLines": 233,
  "yarnFilePct": 0.35,
  "yarnLinePct": 0.05,
  

### Predict Cluster Membership
X is standard notation for the feature matrix. Here you can see the input data that is provided to K-Means.

In [4]:
import numpy as np
from sklearn.cluster import KMeans

num_clusters = 7
random_state = 8675309

# Per-commit size measures used as K-Means input, in column order.
feature_keys = ['javascriptLines', 'rustLines', 'markdownLines', 'jsonLines',
                'imgBytes', 'lockLines', 'yarnLines', 'htmlLines', 'clojureLines']


def feature_row(stat):
    """Return the log1p-scaled feature vector for a single commit-stat dict.

    Every feature is compressed with log1p. Only imgBytes is passed through
    abs() first -- presumably because it can be negative (the labeling table
    shows negative imgBytePct values); TODO confirm.
    """
    return [np.log1p(np.abs(stat[key])) if key == 'imgBytes' else np.log1p(stat[key])
            for key in feature_keys]


# X: one row per commit, one column per entry in feature_keys.
X = [feature_row(stat) for stat in all_stats]

# n_init is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto', which would change the clustering (and therefore the cluster
# numbering used below) depending on the installed version.
y_pred = KMeans(n_clusters=num_clusters, n_init=10,
                random_state=random_state).fit_predict(X)

### Reshape the Stats for Easier Handling

In [5]:
# Empty columns that a human fills in later when labeling sampled commits.
label_columns = ['wtf','is_bulk', 'is_standard', 'is_minor', 'is_artifact',
                 'is_interface', 'is_javascript', 'is_rust']

# Attach the predicted cluster id and blank label columns to every commit.
for stats, cluster_id in zip(all_stats, y_pred):
    # Store a plain Python int: a numpy integer here makes the stats dict
    # non-JSON-serializable, so re-running the json.dumps cell would fail.
    stats['cluster'] = int(cluster_id)
    for column in label_columns:
        stats[column] = ''

# clusters[k] is the list of commit-stat dicts assigned to cluster k, in the
# same relative order as all_stats. Built in a single pass over the data.
clusters = [[] for _ in range(num_clusters)]
for stats in all_stats:
    clusters[stats['cluster']].append(stats)


In [6]:
import numpy as np
import pandas as pd

# Upper bound on how many commits are drawn from each cluster for labeling.
samples_per_cluster = 20

# Seeded generator so the labeling sample is the same on every run of the cell.
rng = np.random.default_rng(random_state)

# Columns written to the labeling sheet: the blank label columns first, then
# the cluster id and the descriptive statistics shown to the labeler.
fields = label_columns + ['cluster','binFiles', 'textFiles', 'binBytes', 'textLines',
          'javascriptBytePct', 'rustBytePct', 'markdownBytePct', 'jsonBytePct',
          'imgBytePct', 'lockBytePct', 'yarnBytePct', 'htmlBytePct']

to_be_labeled = []
commit_ids = []
for cluster in clusters:
    # Sampling without replacement cannot draw more items than the cluster
    # holds, so small clusters contribute all of their commits.
    sample_size = min(samples_per_cluster, len(cluster))
    picked = rng.choice(len(cluster), size=sample_size, replace=False)
    for position in picked:
        commit = cluster[position]
        to_be_labeled.append([commit[field] for field in fields])
        commit_ids.append(commit['commit'])

# One row per sampled commit, indexed by commit hash; saved as TSV for manual labeling.
cluster_df = pd.DataFrame(to_be_labeled, index=commit_ids, columns=fields)
cluster_df.to_csv(project_root_path + '/sandbox/data/bob/cluster_labeling.tsv', sep="\t")
cluster_df

Unnamed: 0,wtf,is_bulk,is_standard,is_minor,is_artifact,is_interface,is_javascript,is_rust,cluster,binFiles,...,binBytes,textLines,javascriptBytePct,rustBytePct,markdownBytePct,jsonBytePct,imgBytePct,lockBytePct,yarnBytePct,htmlBytePct
4d75823232e302db5573c053debb182c21ad7cf9,,,,,,,,,0,9,...,104676,61690,1.01,0.11,0.55,91.83,2.98,0.00,0.00,0.83
f1907df36197dc3d7be452aab244d7652c4c4181,,,,,,,,,0,3,...,18881,12355,0.65,0.00,0.36,0.51,3.85,92.68,0.00,0.72
76ab616360d9651b4444e8933aadb16362b3ff47,,,,,,,,,0,12,...,322490,29574,3.98,0.92,0.16,67.64,25.38,0.00,0.00,0.05
48fb8bb782cd493cb4b41e1e21488ba8acd3b507,,,,,,,,,0,0,...,0,5636363,66.74,0.00,6.45,16.42,-0.12,0.21,0.02,0.30
768edfc7eb1bd020830f211265da9848de339b3d,,,,,,,,,0,0,...,0,10912,5.26,0.00,0.00,78.36,-5.95,15.25,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b7be3aa89f4521b2dc1dc600db82357b85096f1e,,,,,,,,,6,0,...,0,369,82.92,0.00,0.00,0.00,0.00,0.00,0.00,17.07
e55ed70029f41ad1fa59a2cb89b88ebea96bb9f2,,,,,,,,,6,0,...,0,16,100.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
fbf6540ed115934fb9641e9743b67608c978a942,,,,,,,,,6,0,...,0,23,100.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
79a8073960d788230a51bc2c0e6cfac402b8e887,,,,,,,,,6,0,...,0,109,51.37,0.00,1.83,0.00,0.00,0.00,0.00,0.00


### Show Median Statistics for Each Cluster
The table generated here has median values for each cluster to facilitate interpretation of the data. The output fields shown were not included directly as input features, though they are dependent variables.

In [7]:
import numpy as np
import pandas as pd

# Statistics summarized per cluster; 'numCommits' is filled in just below.
fields = ['numCommits', 'binFiles', 'textFiles', 'binBytes', 'textLines',
          'javascriptBytePct', 'rustBytePct', 'markdownBytePct', 'jsonBytePct',
          'imgBytePct', 'lockBytePct', 'yarnBytePct', 'htmlBytePct']
cluster_names = [f'#{position}' for position in range(len(clusters))]

# Record each cluster's size on its members so it can be summarized like any
# other field (the median of a constant is that constant).
for members in clusters:
    member_count = len(members)
    for entry in members:
        entry['numCommits'] = member_count

# One row per cluster: the median of every field, rounded to the nearest int.
data = []
for members in clusters:
    medians = []
    for field in fields:
        median_value = np.median([entry[field] for entry in members])
        medians.append(int(np.round(median_value)))
    data.append(medians)

cluster_df = pd.DataFrame(data, index=cluster_names, columns=fields)
# Show clusters as columns and statistics as rows.
cluster_df.transpose()

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6
numCommits,36,1088,130,59,652,366,298
binFiles,4,0,0,3,0,0,0
textFiles,32,1,8,1,4,10,2
binBytes,87230,0,0,154400,0,0,0
textLines,28642,6,5142,12,56,312,82
javascriptBytePct,1,0,1,0,0,0,100
rustBytePct,0,0,0,0,100,31,0
markdownBytePct,0,0,0,0,0,0,0
jsonBytePct,63,0,95,0,0,0,0
imgBytePct,2,0,0,100,0,0,0
