In [1]:
project_root_path = '/home/bob/projects/Web3HackerNetwork'

### Load the Commit Statistics
For this clustering we examined 39 GitHub projects containing a total of 2,629 commits. The projects all included the Fluence Marine library. Code for Fluence is generally written in React and Rust.

In [2]:
import sys
bob_lib_path = project_root_path + "/sandbox/python/bob/"
sys.path.append(bob_lib_path)

import commit_stat_engineering
import json

dataset_dir = project_root_path + '/data/github/2022-04-07-marine-search'
project_stats_dir = dataset_dir + '/projects'
all_commit_logs = commit_stat_engineering.find_files('commit-stat.log.json', project_stats_dir)
all_stats = commit_stat_engineering.get_stats_for_all_commits(all_commit_logs)

### Commit Stats


In [3]:
complex_stats = [stat for stat in all_stats if stat['rustFiles'] > 0
                 and stat['markdownFiles'] > 0 and stat['javascriptFiles'] > 0
                 and stat['jsonFiles'] > 0 and stat['binBytes'] > 0]
print(json.dumps(complex_stats[0], indent=2))

{
  "commit": "5629c4b1eae0d9db3dd25e5ada2d91393005a3d0",
  "totalFiles": 851,
  "binFiles": 62,
  "textFiles": 789,
  "binBytes": 7329844,
  "textLines": 402917,
  "totalBytes": 19417354,
  "pctBinBytes": 37.74,
  "pctTextBytes": 62.25,
  "javascriptFiles": 114,
  "javascriptLines": 9928,
  "javascriptFilePct": 13.39,
  "javascriptLinePct": 2.46,
  "javascriptBytePct": 1.53,
  "rustFiles": 209,
  "rustLines": 8440,
  "rustFilePct": 24.55,
  "rustLinePct": 2.09,
  "rustBytePct": 1.3,
  "markdownFiles": 36,
  "markdownLines": 5858,
  "markdownFilePct": 4.23,
  "markdownLinePct": 1.45,
  "markdownBytePct": 0.9,
  "jsonFiles": 141,
  "jsonLines": 369338,
  "jsonFilePct": 16.56,
  "jsonLinePct": 91.66,
  "jsonBytePct": 57.06,
  "imgFiles": 48,
  "imgBytes": 7160304,
  "imgFilePct": 5.64,
  "imgBytePct": 36.87,
  "lockFiles": 0,
  "lockLines": 0,
  "lockFilePct": 0,
  "lockLinePct": 0,
  "lockBytePct": 0,
  "yarnFiles": 3,
  "yarnLines": 233,
  "yarnFilePct": 0.35,
  "yarnLinePct": 0.05,
  

### Load Labels

In [10]:
import pandas as pd

data = pd.read_csv(project_root_path + '/sandbox/data/bob/cluster_labeling.tsv', sep='\t')
data

Unnamed: 0.1,Unnamed: 0,is_bulk,is_standard,is_minor,is_artifact,is_javascript,is_rust
0,a7503dcb63af05b1381c8fbc7d1aa989ce61d605,1,0,0,0,0.0,0.0
1,a91521059eeac8bfaa6b81a0333413b0015c8863,1,0,0,0,0.0,0.0
2,e5e0d59835a849eb9c43b2cd20b25046e1ddab3e,1,0,0,0,0.0,0.0
3,cf3c18490aaee04cdee75a3d22d67e8877b8abe7,1,0,0,0,0.0,0.0
4,f1907df36197dc3d7be452aab244d7652c4c4181,1,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...
64,e2560c165fbd05e6bb41a7e9166f5b2283f15de4,0,1,0,0,1.0,0.0
65,fbf6540ed115934fb9641e9743b67608c978a942,0,1,0,0,1.0,0.0
66,e55ed70029f41ad1fa59a2cb89b88ebea96bb9f2,0,0,1,0,1.0,0.0
67,e19efe724f7e0a1054a62a7c3b064882e6048365,0,1,0,0,0.0,0.0


### Link Labels to Stats

In [23]:
import numpy as np
def make_x(commit_stats):
    return [
        np.log1p(commit_stats['binFiles']),
        np.log1p(commit_stats['textFiles']),
        np.log1p(commit_stats['textLines']),
        np.log1p(commit_stats['javascriptFiles']),
        np.log1p(commit_stats['javascriptLines']),
        commit_stats['javascriptLinePct'],
        np.log1p(commit_stats['rustFiles']),
        np.log1p(commit_stats['rustLines']),
        commit_stats['rustLinePct'],
        np.log1p(commit_stats['markdownFiles']),
        np.log1p(commit_stats['markdownLines']),
        commit_stats['markdownLinePct'],
        np.log1p(commit_stats['jsonFiles']),
        np.log1p(commit_stats['jsonLines']),
        commit_stats['jsonLinePct'],
        np.log1p(commit_stats['imgFiles']),
        np.log1p(commit_stats['lockFiles']),
        np.log1p(commit_stats['lockLines']),
        commit_stats['lockLinePct'],
        np.log1p(commit_stats['yarnFiles']),
        np.log1p(commit_stats['yarnLines']),
        commit_stats['yarnLinePct'],
        np.log1p(commit_stats['htmlFiles']),
        np.log1p(commit_stats['htmlLines']),
        commit_stats['htmlLinePct'],
        np.log1p(commit_stats['clojureFiles']),
        np.log1p(commit_stats['clojureLines']),
        commit_stats['clojureLinePct'],
        np.log1p(commit_stats['shellFiles']),
        np.log1p(commit_stats['shellLines']),
        commit_stats['shellLinePct'],
        np.log1p(commit_stats['gitignoreFiles']),
        np.log1p(commit_stats['gitignoreLines']),
        commit_stats['gitignoreLinePct']
    ]


In [50]:
stats = {}
for commit in all_stats:
    stats[commit['commit']] = commit

X = []
Y = []
a = 0
for row in data.to_numpy():
    commit_id = row[0]
    row_labels = row[slice(1, 5)]
    x = make_x(stats[commit_id])
    #print(x)
    #print(a)
    X.append(make_x(stats[commit_id]))
    Y.append(row_labels.tolist())
    a += 1
#print(a)

#print(Y[slice(2)])
#print(Y[0][0])
#print(X[slice(2)])

In [41]:
from sklearn.datasets import make_multilabel_classification

sample_X, sample_y = make_multilabel_classification(n_classes=3, random_state=0)
print(sample_y[slice(2)])
print(Y[slice(2)])

[[0 1 0]
 [0 1 0]]
[[1, 0, 0, 0], [1, 0, 0, 0]]


In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier

print(len(X))
print(len(Y))

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(7, 8), random_state = 1)
clf.fit(X, Y)
score = clf.score(X, Y)
Y_pred = clf.predict(X)
# print(score)
# print(Y_pred)

69
69


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [93]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = \
    train_test_split(X, Y, test_size=0.33, random_state=3)

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(7, 8), random_state = 1)
clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)
# Y_pred = clf.predict(X)
print(score)
# print(Y_pred)

0.6956521739130435


In [105]:
#print(X_test)
#print(Y_pred)
pairs = []
for i in range(0,len(X_test)):
    #print(Y_pred[i].tolist())
    pairs.append(Y_pred[i].tolist() + X_test[i])
# print(pairs)

print(len(X_train))
import pandas as pd
pd.DataFrame(pairs).to_csv("/home/bob/test.tsv", sep="\t")

46
