In [24]:
from git import Repo, Diff, Commit, Actor
import os
from tqdm import tqdm
import pickle
import datetime
import time
import json


In [25]:
def debug(*messages):
    import inspect
    caller = inspect.stack()[1]
    fpath = caller.filename
    ln = caller.lineno
    location = "File \"%s\", line %d " % (fpath, ln) + '\t'
    message = ' '.join([str(m) for m in messages])
    t = time.strftime('%y-%m-%d %H:%M:%S')
    print(location + t + '\t' + message)
  

In [46]:
def download_commit_summaries(
        repo_owner_name, repo_name, time_budget, 
        include_merge_commit=False, max_commits=100000
    ):
    debug('Generating Summary for \"' + repo_owner_name + '/' + repo_name + '\"', ' in last ', time_budget, 'months')
    if not include_merge_commit:
        debug('Ignoring the merge commits!')
    else:
        debug('Including the merge commits!')
    repository_path = 'tmp/' + repo_name
    if not os.path.exists(repository_path):
        repo = Repo.clone_from(
            'https://github.com/' + repo_owner_name + '/' + repo_name + '.git', repository_path)
    else:
        repo = Repo(repository_path)
    author_str_to_id = {}
    file_str_to_id = {}
    all_authors = []
    all_files = []
    all_commit_summaries = []
    time_margin = (datetime.datetime.now() - datetime.timedelta(time_budget * 365/ 12)) \
                        if time_budget != 1 else None
    if time_margin is not None:
        timestamp_margin = time_margin.timestamp()
    commits = list(repo.iter_commits('master'))
    debug('Total commits', len(commits))
    for i, commit in enumerate(commits):
        mx = int(min(max_commits, len(commits)) / 100)
        if i % mx == 0:
            debug("Completed", i/mx, "%")
        if i == max_commits:
            break
        author_time = commit.authored_datetime
        if time_margin is not None and author_time.timestamp() < timestamp_margin:
            break
        author = commit.author
        commit_time = commit.committed_datetime
        if author_time == commit_time:
            post_commit_change = False
        else:
            post_commit_change = True       
        file_ids = []
        if not include_merge_commit and len(commit.parents) > 1:
            continue
        for parent in commit.parents:
            diffs = commit.diff(parent)
            for diff in diffs:
                fp = diff.b_path.strip()
                if fp not in file_str_to_id.keys():
                    file_str_to_id[fp] = len(file_str_to_id.keys())
                    new_file = {
                        'id': file_str_to_id[fp],
                        'file_path': fp
                    }
                    all_files.append(new_file)
                file_ids.append(file_str_to_id[fp])
        author_str = author.name + '-' + author.email
        if author_str not in author_str_to_id:
            author_str_to_id[author_str] = len(author_str_to_id)
            author_dict = {
                'id': '',
                'name': author.name,
                'email': author.email
            }
            author_dict['id'] = author_str_to_id[author_str]
            all_authors.append(author_dict)

        commit_summary = {
            'id': commit.hexsha,
            'author_id': author_str_to_id[author_str],
            'timestamp': author_time.timestamp(),
            'time': str(author_time),
            'files': file_ids,
            'post_commit_change': post_commit_change,
            'is_merge_commit': len(commit.parents) > 1
        }
        all_commit_summaries.append(commit_summary)

    save_dir = repo_owner_name + '_' + repo_name + '_commits'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    author_file = open(os.path.join(save_dir, 'authors.json'), 'w')
    json.dump(all_authors, author_file)
    author_file.close()

    file_path_file = open(os.path.join(save_dir, 'files.json'), 'w')
    json.dump(all_files, file_path_file)
    file_path_file.close()

    commits_file = open(os.path.join(save_dir, 'commits.json'), 'w')
    json.dump(all_commit_summaries, commits_file)
    commits_file.close()
    debug('Total authors: %d\tTotal Unique files: %d\tTotal commits: %d' \
          % (len(all_authors), len(all_files), len(all_commit_summaries)))    

In [47]:
budget = 12

owner = 'torvalds'
name = 'linux' 
download_commit_summaries(owner, name, budget)

owner = 'apache'
name = 'incubator-superset'
download_commit_summaries(owner, name, budget)


File "<ipython-input-46-1e5611f213c9>", line 5 	20-04-07 01:06:59	Generating Summary for "torvalds/linux"  in last  12 months
File "<ipython-input-46-1e5611f213c9>", line 7 	20-04-07 01:06:59	Ignoring the merge commits!
File "<ipython-input-46-1e5611f213c9>", line 26 	20-04-07 01:07:06	Total commits 914038
File "<ipython-input-46-1e5611f213c9>", line 30 	20-04-07 01:07:06	Completed 0.0 %
File "<ipython-input-46-1e5611f213c9>", line 94 	20-04-07 01:07:23	Total authors: 220	Total Unique files: 989	Total commits: 632
File "<ipython-input-46-1e5611f213c9>", line 5 	20-04-07 01:07:23	Generating Summary for "apache/incubator-superset"  in last  12 months
File "<ipython-input-46-1e5611f213c9>", line 7 	20-04-07 01:07:23	Ignoring the merge commits!
File "<ipython-input-46-1e5611f213c9>", line 26 	20-04-07 01:07:23	Total commits 4978
File "<ipython-input-46-1e5611f213c9>", line 30 	20-04-07 01:07:23	Completed 0.0 %
File "<ipython-input-46-1e5611f213c9>", line 30 	20-04-07 01:07:24	Completed 1.0








  0%|          | 0/4978 [00:00<?, ?it/s][A[A[A[A[A[A[A






  0%|          | 4/4978 [00:00<02:15, 36.81it/s][A[A[A[A[A[A[A

File "<ipython-input-28-0cc441947077>", line 20 	20-04-07 00:54:19	Total commits 4978









  0%|          | 9/4978 [00:00<02:07, 38.92it/s][A[A[A[A[A[A[A






  0%|          | 13/4978 [00:00<02:07, 38.98it/s][A[A[A[A[A[A[A






  0%|          | 20/4978 [00:00<01:53, 43.81it/s][A[A[A[A[A[A[A






  1%|          | 26/4978 [00:00<01:46, 46.36it/s][A[A[A[A[A[A[A






  1%|          | 32/4978 [00:00<01:41, 48.94it/s][A[A[A[A[A[A[A






  1%|          | 38/4978 [00:00<01:37, 50.58it/s][A[A[A[A[A[A[A






  1%|          | 45/4978 [00:00<01:32, 53.42it/s][A[A[A[A[A[A[A






  1%|          | 51/4978 [00:00<01:29, 54.82it/s][A[A[A[A[A[A[A






  1%|          | 57/4978 [00:01<01:30, 54.48it/s][A[A[A[A[A[A[A






  1%|▏         | 63/4978 [00:01<01:31, 53.97it/s][A[A[A[A[A[A[A






  1%|▏         | 69/4978 [00:01<01:32, 53.25it/s][A[A[A[A[A[A[A






  2%|▏         | 75/4978 [00:01<01:31, 53.37it/s][A[A[A[A[A[A[A






  2%|▏         | 81/4978 [00:01<01:30, 54.38it/s][A[A[A[A[A[A[A

 14%|█▎        | 683/4978 [00:13<01:07, 63.89it/s][A[A[A[A[A[A[A






 14%|█▍        | 690/4978 [00:13<01:06, 64.92it/s][A[A[A[A[A[A[A






 14%|█▍        | 697/4978 [00:13<01:07, 63.00it/s][A[A[A[A[A[A[A






 14%|█▍        | 704/4978 [00:13<01:07, 63.53it/s][A[A[A[A[A[A[A






 14%|█▍        | 711/4978 [00:13<01:11, 59.81it/s][A[A[A[A[A[A[A






 14%|█▍        | 718/4978 [00:13<01:10, 60.50it/s][A[A[A[A[A[A[A






 15%|█▍        | 725/4978 [00:14<01:11, 59.66it/s][A[A[A[A[A[A[A






 15%|█▍        | 731/4978 [00:14<01:11, 59.46it/s][A[A[A[A[A[A[A






 15%|█▍        | 737/4978 [00:14<01:11, 59.20it/s][A[A[A[A[A[A[A






 15%|█▍        | 743/4978 [00:14<01:18, 54.25it/s][A[A[A[A[A[A[A






 15%|█▌        | 749/4978 [00:14<01:18, 53.90it/s][A[A[A[A[A[A[A






 15%|█▌        | 755/4978 [00:14<01:16, 54.85it/s][A[A[A[A[A[A[A






 15%|█▌        | 762/4978 [00:14<01:14, 56.48it/s][A[A[A[A[

File "<ipython-input-28-0cc441947077>", line 85 	20-04-07 00:54:37	Total authors: 125	Total Unique files: 1935	Total commits: 967









 19%|█▉        | 966/4978 [00:28<01:18, 51.39it/s][A[A[A[A[A[A[A