In [1]:
import json
import numpy as np
import pandas as pd
import pydriller

In [2]:
# Build one row per modified file across every commit of the repository at "..".
# Rows are collected in a plain list and turned into a DataFrame once: growing a
# DataFrame with `append` inside the loop re-copies the whole frame on every
# iteration, and `DataFrame.append` no longer exists in pandas >= 2.0.
records = []
for commit in pydriller.RepositoryMining("..").traverse_commits():
    for m in commit.modifications:
        records.append([commit.hash, m.added, m.removed, m.nloc, m.token_count,
                        commit.committer_date, commit.committer_timezone,
                        commit.in_main_branch, m.complexity])

# Naming the columns up front also keeps the schema intact when no rows were collected.
df = pd.DataFrame(records, columns=['commit_hash', 'linesAdded', 'linesRemoved', 'nloc', 'tokenCount',
                                    'committerDate', 'committerTimezone', 'inMainBranch', 'complexity'])

# Rows with any missing value are discarded first so the integer casts cannot hit NaN/None.
df = df.dropna().astype({'nloc': 'int', 'complexity': 'int', 'tokenCount': 'int'})

In [3]:
# Per-commit totals of nloc and of added/removed lines.
# The columns are selected *before* aggregating so that columns which cannot be
# summed (e.g. committerDate) never reach `sum()`; newer pandas releases raise on
# such columns instead of silently dropping them.
grp1 = df.groupby('commit_hash', as_index=False)[['nloc', 'linesAdded', 'linesRemoved']].sum()

In [4]:
# Per-commit mean of token count and complexity over the commit's modified files.
# Restricting the aggregation to these two columns keeps `mean()` away from
# columns it cannot average, which newer pandas releases reject with an error.
grp2 = df.groupby('commit_hash', as_index=False)[['tokenCount', 'complexity']].mean()

In [5]:
# Per-commit maximum of token count and complexity over the commit's modified files.
# Only the two needed columns are aggregated, so the result does not depend on
# how `max()` treats the remaining (date / boolean) columns.
grp3 = df.groupby('commit_hash', as_index=False)[['tokenCount', 'complexity']].max()

In [6]:
# Non-null value count of every column per commit; df has one row per modified
# file, so these counts are later reused as the number of changed files.
grp4 = (
    df
    .groupby(by='commit_hash', as_index=False)
    .count()
)

In [8]:
# Keep only the per-commit row count, exposed as `changedFiles`.
# `rename` ignores a missing 'linesAdded' key, so re-running this cell after grp4
# has already been reduced is a no-op instead of a KeyError.
grp4 = grp4.rename(columns={'linesAdded': 'changedFiles'})[['commit_hash', 'changedFiles']]

In [9]:
# Attach the per-commit aggregates to the per-file rows, then collapse to a
# single row per commit. The empty left suffix leaves the per-file column names
# untouched, so each overlapping aggregate column arrives as '<name><suffix>'.
final_df = (
    df
    .merge(grp1, on='commit_hash', how='left', suffixes=['', 'total'])
    .merge(grp2, on='commit_hash', how='left', suffixes=['', 'mean'])
    .merge(grp3, on='commit_hash', how='left', suffixes=['', 'max'])
    .merge(grp4, on='commit_hash', how='left', suffixes=['', 'Files'])
    # The per-file metrics are dropped once their aggregates are attached.
    .drop(columns=['linesAdded', 'linesRemoved', 'nloc', 'tokenCount', 'complexity'])
    .rename(columns={
        'nloctotal': 'totalNloc',
        'linesAddedtotal': 'totalLinesAdded',
        'linesRemovedtotal': 'totalLinesRemoved',
        'tokenCountmean': 'meanTokenCount',
        'tokenCountmax': 'maxTokenCount',
        'complexitymean': 'meanComplexity',
        'complexitymax': 'maxComplexity',
    })
    .reset_index(drop=True)
    # Keep the first row of each commit.
    .drop_duplicates(subset=['commit_hash'])
)

In [11]:
# Normalise commit timestamps to UTC and make the timezone offset an integer
# number of seconds.
final_df['committerDate'] = pd.to_datetime(final_df.committerDate, utc=True)
final_df['committerTimezone'] = final_df.committerTimezone.astype('int')

# Shift the UTC timestamp by the stored offset. This is done as one vectorised
# operation instead of a Python-level lambda per row, and it also works on an
# empty frame.
# NOTE(review): GitPython documents `committer_tz_offset` as seconds *west* of
# UTC; if pydriller's `committer_timezone` passes that value through, local time
# would be UTC minus the offset. Confirm before changing the sign, since any
# previously generated features presumably used this same formula.
final_df['committerDateLocal'] = final_df.committerDate + pd.to_timedelta(final_df.committerTimezone, unit='s')

In [13]:
# One-hot encode the weekday of the shifted timestamp (0-6, as returned by
# `.dt.dayofweek`). The dtype is pinned to uint8 so the indicator columns hold
# 0/1 on every pandas version; newer releases default to bool.
week_days = final_df.committerDateLocal.dt.dayofweek
final_df = pd.concat([final_df, pd.get_dummies(week_days, prefix='dayOfWeek', dtype='uint8')], axis=1)

# Hour of day via the vectorised datetime accessor rather than a per-row lambda.
final_df['committerHourOfDay'] = final_df.committerDateLocal.dt.hour

# The raw timestamp columns are only inputs to the features above.
final_df.drop(columns=['committerDate', 'committerDateLocal', 'committerTimezone'], inplace=True)
final_df.dropna(axis=0, how='any', inplace=True)

In [16]:
# A weekday that never occurs in the data produces no dummy column; add it as
# all zeros so the seven weekday indicators are always present.
for day in range(7):
    day_col = 'dayOfWeek_{}'.format(day)
    if day_col not in final_df.columns:
        final_df[day_col] = 0

# Fix the column order of the exported features and renumber the rows from 0.
final_df = final_df[[
    'commit_hash', 'inMainBranch',
    'maxComplexity', 'meanComplexity',
    'totalLinesAdded', 'totalLinesRemoved', 'totalNloc',
    'maxTokenCount', 'meanTokenCount',
    'changedFiles',
    'dayOfWeek_0', 'dayOfWeek_1', 'dayOfWeek_2', 'dayOfWeek_3',
    'dayOfWeek_4', 'dayOfWeek_5', 'dayOfWeek_6',
    'committerHourOfDay',
]].reset_index(drop=True)

In [17]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to their built-in equivalents.

    The stock encoder rejects NumPy integers, NumPy booleans and arrays;
    this subclass maps them to ``int``, ``bool`` and ``list`` (and NumPy
    floats to ``float``) so they can be serialised.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        Called by ``json`` only for objects it cannot serialise itself.
        Anything that is not a NumPy bool, integer, float or array is passed
        to the base class, which raises ``TypeError``.
        """
        # np.bool_ is not an np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
        
def df_to_json(df, json_filename='test.json'):
    """Write ``df`` to ``json_filename`` as a JSON list with one object per row.

    Each object maps column name to the cell value; the ``inMainBranch``
    value is converted with ``int()``. Serialisation goes through
    ``NpEncoder``. The list of row dictionaries is also returned.
    """
    columns = df.columns.tolist()
    records = [
        {
            name: int(df.iloc[row, pos]) if name == 'inMainBranch' else df.iloc[row, pos]
            for pos, name in enumerate(columns)
        }
        for row in range(df.shape[0])
    ]
    with open(json_filename, 'w') as handle:
        json.dump(records, handle, cls=NpEncoder)
    return records

In [19]:
# Export the per-commit feature table as JSON; the returned list of row
# dictionaries is the last expression and is therefore echoed as the cell output.
df_to_json(
    final_df,
    json_filename='../data/processed/inference_test_data.json',
)

[{'commit_hash': 'f61598c75f9d23664fb95badb750cd81b55b4d7a',
  'inMainBranch': 1,
  'maxComplexity': 4,
  'meanComplexity': 1.25,
  'totalLinesAdded': 309,
  'totalLinesRemoved': 0,
  'totalNloc': 75,
  'maxTokenCount': 122,
  'meanTokenCount': 86.5,
  'changedFiles': 4,
  'dayOfWeek_0': 0,
  'dayOfWeek_1': 1,
  'dayOfWeek_2': 0,
  'dayOfWeek_3': 0,
  'dayOfWeek_4': 0,
  'dayOfWeek_5': 0,
  'dayOfWeek_6': 0,
  'committerHourOfDay': 10},
 {'commit_hash': 'c18192f3c488ac26bffec719f89523f82244420c',
  'inMainBranch': 1,
  'maxComplexity': 4,
  'meanComplexity': 3.0,
  'totalLinesAdded': 62,
  'totalLinesRemoved': 2,
  'totalNloc': 69,
  'maxTokenCount': 326,
  'meanTokenCount': 320.0,
  'changedFiles': 2,
  'dayOfWeek_0': 0,
  'dayOfWeek_1': 0,
  'dayOfWeek_2': 0,
  'dayOfWeek_3': 0,
  'dayOfWeek_4': 0,
  'dayOfWeek_5': 0,
  'dayOfWeek_6': 1,
  'committerHourOfDay': 14},
 {'commit_hash': 'e8a48664da1a6af462d4b312df6d0c95ae4632d9',
  'inMainBranch': 1,
  'maxComplexity': 11,
  'meanComplex