In [1]:
import json
from elasticsearch import Elasticsearch, helpers
import pandas as pd

from grimoirelab.toolkit.datetime import str_to_datetime


# URL of the Elasticsearch instance that collect_items() connects to.
HOST = "http://localhost:9200/"


def collect_items(index, es=None):
    """Fetch the `data` payload of every document stored in an index.

    The index is read with a `match_all` query through the scroll API
    (pages of 10 documents, 1-minute scroll window).

    :param index: name of the Elasticsearch index to read
    :param es: optional client object exposing `search` and `scroll`;
        when omitted, a new client connected to `HOST` is created

    :returns: list with the `_source.data` value of each document, in the
        order the pages were returned (empty list for an empty index)
    """
    if es is None:
        es = Elasticsearch([HOST], timeout=120, max_retries=20, retry_on_timeout=True)

    page = es.search(
        index=index,
        scroll="1m",
        size=10,
        body={"query": {"match_all": {}}}
    )

    items = []
    # Drive the loop with the hits actually present in the page instead of
    # `hits.total`: that field is an object (not an int) on Elasticsearch 7+,
    # so comparing it with 0 fails on Python 3.
    while page['hits']['hits']:
        items.extend(hit['_source']['data'] for hit in page['hits']['hits'])
        # Each response carries the scroll id to use for the next page.
        page = es.scroll(scroll_id=page['_scroll_id'], scroll='1m')

    return items


def analysis_effort():
    """Build per-file code-metric deltas for the commits of merged pull requests.

    Reads the "issue", "pull_request", "commit" and "graal-py" indexes with
    `collect_items`. For every issue that is a pull request with a merge date
    and a non-empty `commits_data` list, each snapshot whose commit belongs
    to that pull request is compared, file by file, with the chronologically
    previous snapshot. Only files touched by the pull request commits are
    reported.

    :returns: list of dicts, one per (pull request, snapshot, file), with the
        keys `issue`, `hash`, `prev_hash`, `created_at`, `closed_at`,
        `merged_at`, `merged`, `file_path`, `is_test`, `date`, `funs`, `loc`,
        `ccn`, `prev_date`, `prev_funs`, `prev_loc`, `prev_ccn`, `diff_funs`,
        `diff_loc` and `diff_ccn`. `prev_hash` and `prev_date` are None for
        the oldest snapshot, which has no predecessor; the `prev_*` metrics
        are 0 when the file is absent from the previous snapshot.
    """
    issues = collect_items("issue")
    prs = collect_items("pull_request")
    commits = collect_items("commit")
    snapshots = collect_items("graal-py")
    # Sort on the parsed date so that "previous snapshot" really is the
    # chronologically previous one, whatever the raw string format is.
    snapshots = sorted(snapshots, key=lambda k: str_to_datetime(k['CommitDate']))

    # Index pull requests by number; the first one seen for a number wins,
    # which is the same pick a linear search would make.
    prs_by_number = {}
    for pr in prs:
        prs_by_number.setdefault(pr['number'], pr)

    items = []
    for issue in issues:
        if 'pull_request' not in issue:
            continue

        number = issue['number']
        created_at = issue['created_at']
        closed_at = issue['closed_at']

        pr = prs_by_number.get(number)
        if pr is None:
            # Flagged as a pull request but absent from the pull request index.
            continue

        # A missing or null merge date means the pull request was not merged:
        # there is no date to parse for the item, so skip it.
        merged_at = pr.get('merged_at')
        if not merged_at:
            continue
        merged = 1

        found_commits = pr['commits_data']
        if not found_commits:
            continue

        # Gather the files touched by *all* the commits of the pull request
        # (not just by the last matching commit).
        found_files = set()
        for cc in commits:
            if cc['commit'] in found_commits:
                found_files.update(c['file'] for c in cc['files'])

        if not found_files:
            continue

        for i, snapshot in enumerate(snapshots):
            sha = snapshot['commit']
            if sha not in found_commits:
                continue

            commit_timestamp = str_to_datetime(snapshot['CommitDate']).timestamp()

            if i > 0:
                prev_snapshot = snapshots[i - 1]
                prev_sha = prev_snapshot['commit']
                prev_date = str_to_datetime(prev_snapshot['CommitDate'])
                prev_analysis = prev_snapshot['analysis']
            else:
                # The oldest snapshot has no predecessor; indexing with
                # i - 1 would wrap around to the newest snapshot.
                prev_sha = None
                prev_date = None
                prev_analysis = []

            # Metrics of the previous snapshot by file path (first entry wins).
            prev_by_path = {}
            for pa in prev_analysis:
                prev_by_path.setdefault(pa['file_path'], pa)

            for a in snapshot['analysis']:
                target_file = a['file_path']
                if target_file not in found_files:
                    continue

                num_funs = a['num_funs']
                loc = a['loc']
                ccn = a['ccn']

                # A file that did not exist in the previous snapshot counts
                # as having zero functions, lines and complexity.
                pa = prev_by_path.get(target_file)
                prev_num_funs = pa['num_funs'] if pa is not None else 0
                prev_loc = pa['loc'] if pa is not None else 0
                prev_ccn = pa['ccn'] if pa is not None else 0

                items.append({
                    'issue': number,
                    'hash': sha,
                    'prev_hash': prev_sha,
                    'created_at': str_to_datetime(created_at),
                    'closed_at': str_to_datetime(closed_at),
                    'merged_at': str_to_datetime(merged_at),
                    'merged': merged,
                    'file_path': target_file,
                    'is_test': 1 if 'tests/' in target_file else 0,
                    'date': commit_timestamp,
                    'funs': num_funs,
                    'loc': loc,
                    'ccn': ccn,
                    'prev_date': prev_date,
                    'prev_funs': prev_num_funs,
                    'prev_loc': prev_loc,
                    'prev_ccn': prev_ccn,
                    'diff_funs': num_funs - prev_num_funs,
                    'diff_loc': loc - prev_loc,
                    'diff_ccn': ccn - prev_ccn,
                })

    return items


def analysis_tests():
    """Flatten the "graal-py" snapshots into one record per analysed file.

    Snapshots are read with `collect_items` and visited in chronological
    order of their `CommitDate`.

    :returns: list of dicts with the keys `hash`, `commit_date` (the value
        returned by `str_to_datetime` for the snapshot date), `file_path`,
        `num_funs`, `loc`, `comments`, `ccn` and `is_test` (1 when the file
        path contains 'tests/', 0 otherwise)
    """
    snapshots = collect_items("graal-py")

    # Parse every commit date once, then sort on the parsed value so the
    # order is chronological whatever the raw string format is.
    dated_snapshots = [(str_to_datetime(s['CommitDate']), s) for s in snapshots]
    dated_snapshots.sort(key=lambda pair: pair[0])

    code = []
    for committed_date, snapshot in dated_snapshots:
        for a in snapshot['analysis']:
            file_path = a['file_path']
            code.append({
                'hash': snapshot['commit'],
                'commit_date': committed_date,
                'file_path': file_path,
                'num_funs': a['num_funs'],
                'loc': a['loc'],
                'comments': a['comments'],
                'ccn': a['ccn'],
                'is_test': 1 if 'tests/' in file_path else 0
            })

    return code

ImportError: No module named 'grimoirelab'

In [18]:
# One row per analysed file and snapshot, as returned by analysis_tests().
df_tests = pd.DataFrame(analysis_tests())

In [44]:
# Non-null values per column for non-test (is_test == 0) and test (is_test == 1) files.
df_tests.groupby('is_test').count()

Unnamed: 0_level_0,ccn,comments,commit_date,file_path,hash,loc,num_funs
is_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,34882,34882,34882,34882,34882,34882,34882
1,29809,29809,29809,29809,29809,29809,29809


In [48]:
# Sum the code metrics per (hash, commit_date, is_test) group.
# The metric columns are selected with a list (double brackets): selecting
# several GroupBy columns with a bare tuple of names fails on pandas >= 2.0.
df_groups = (
    df_tests
    .groupby(['hash', 'commit_date', 'is_test'])[['ccn', 'comments', 'loc', 'num_funs']]
    .agg(['sum'])
)

In [78]:
# Order the aggregated rows by `commit_date`, one of the group-by keys.
df_groups = df_groups.sort_values(by=['commit_date'])

In [79]:
# Keep one row out of every 100 (positions 0, 100, 200, ...). A positional
# slice selects the same rows as dropping every other label, because the
# group-by index is unique, and it does not build a list of positions.
df_samplings = df_groups.iloc[::100]

In [80]:
# Number of rows kept by the sampling.
len(df_samplings)

24

In [81]:
# Display the sampled rows.
df_samplings

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ccn,comments,loc,num_funs
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum,sum,sum
hash,commit_date,is_test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
57bc204822832a6c23ac7883e5392f4da6f4ca37,2015-08-18 18:11:40+02:00,1,0,21,6,0
2bada6492ea5c7fcd7762f376a0e515c6b5e0562,2016-01-04 02:27:22+01:00,0,213,604,1119,84
edcbc5817c9bf74134c99a3d513a0d692b4e7405,2016-02-12 19:48:16+01:00,1,144,301,1142,86
496951397ecdf41a0252a73f471015d0a468ec6b,2016-02-25 19:03:12+01:00,1,184,335,1408,112
2eec2cf82d794ea0d0e7c74223e88c3dee78cc5c,2016-03-30 19:24:43+02:00,0,426,1256,2278,174
d36293a4ba3cd4e6c95be900c145c09bf57c6f00,2016-06-03 21:00:07+02:00,0,567,1536,3035,220
8449683b5ee9588c155b1c6a7ba7f68cf5b4b1f0,2016-07-18 13:20:19+02:00,1,730,1132,5741,392
fdf511b0144cb7707cae1a6b8905e83004cf003b,2016-09-26 15:00:01+02:00,0,1083,2633,5733,409
3a9318ccef725cebe6f616918bb2a448c91a1f92,2016-10-04 20:25:25+02:00,0,1157,2797,5989,446
b17e53f5bafd9ccd914ac3aa95d0ac2eb628a1ac,2016-10-25 13:04:13+02:00,1,985,1444,7734,531


In [11]:
# `plt` must be the pyplot module, not the top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): `hash` is one of the group-by keys of df_groups, so it is an
# index level rather than a column, and the columns are two-level
# (metric, 'sum') — confirm that plot(x='hash') resolves it, or reset the
# index before plotting.
ax = df_groups.plot(x='hash')