# PySAL Change Log Statistics

In [1]:
import gitreleases

gitreleases.get_release_info()

gitreleases.clone_releases()

In [3]:
from __future__ import print_function
import os
import json
import re
import sys
import pandas

from datetime import datetime, timedelta
from time import sleep
from subprocess import check_output
try:
    # Python 2 keeps urlopen directly on the urllib module.
    from urllib import urlopen
except ImportError:
    # Python 3 moved it to urllib.request. Only ImportError is caught so
    # that unrelated failures (e.g. KeyboardInterrupt) are not swallowed.
    from urllib.request import urlopen

import ssl
#import yaml

context = ssl._create_unverified_context()

In [4]:
# Build a {package_name: version} mapping from a listing that holds one
# whitespace-separated "name version" pair per line.
with open('package_versions.txt', 'r') as package_list:
    # Blank lines (e.g. a trailing empty line) are skipped; without the
    # filter they produce an empty pair and dict() raises ValueError.
    packages = dict(
        line.split() for line in package_list if line.strip()
    )

In [5]:
packages

{'libpysal': '4.2.1',
 'esda': '2.2.0',
 'giddy': '2.3.0',
 'inequality': '1.0.0',
 'pointpats': '2.1.0',
 'segregation': '1.1.1',
 'spaghetti': '1.4.0',
 'mgwr': '2.1.1',
 'spglm': '1.0.7',
 'spint': '1.0.6',
 'spreg': '1.0.4',
 'spvcm': '0.2.1.post1',
 'tobler': '0.2.0',
 'mapclassify': '2.2.0',
 'splot': '1.1.2'}

In [6]:
release_date = '2020-01-31'
start_date = '2019-07-31'

In [7]:
since_date = '--since="{start}"'.format(start=start_date)
since_date
since = datetime.strptime(start_date+" 0:0:0", "%Y-%m-%d %H:%M:%S")
since

datetime.datetime(2019, 7, 31, 0, 0)

## Total commits by subpackage

- get release info for each package

In [8]:
cmd = ['git', 'log', '--format=* %aN', since_date] 

In [9]:
CWD = os.path.abspath(os.path.curdir)

In [10]:
CWD

'/home/serge/Dropbox/p/pysal/src/pysal/tools'

In [11]:
os.chdir('tmp/esda')
os.getcwd()

'/home/serge/Dropbox/p/pysal/src/pysal/tools/tmp/esda'

In [12]:
ncommits = len(check_output(cmd).splitlines())

In [13]:
ncommits

17

In [14]:
# Canonical author name -> every alias that author is known to commit under.
identities = {'Levi John Wolf': ('ljwolf', 'Levi John Wolf'),
              'Serge Rey': ('Serge Rey', 'Sergio Rey', 'sjsrey', 'serge'),
              'Wei Kang': ('Wei Kang', 'weikang9009'),
              'Dani Arribas-Bel': ('Dani Arribas-Bel', 'darribas')
}

def regularize_identity(string):
    """Turn one ``* <author>`` line of git output into a canonical name.

    Parameters
    ----------
    string : bytes or str
        A single author line, optionally prefixed with ``'* '``. Bytes
        (as returned by ``check_output``) are decoded first.

    Returns
    -------
    str
        The author with the ``'* '`` prefix removed and every alias listed
        in ``identities`` replaced by its canonical name. Names made of
        more than one word are title-cased; single-token names (typically
        usernames) are returned unchanged.
    """
    if isinstance(string, bytes):
        string = string.decode()
    # Drop the list-marker prefix *before* counting words; otherwise the
    # marker itself counts as a word and every name gets title-cased.
    string = string.lstrip('* ')
    for name, aliases in identities.items():
        for alias in aliases:
            if alias in string:
                string = string.replace(alias, name)
    if len(string.split(' ')) > 1:
        string = string.title()
    return string

In [15]:
from collections import Counter
author_cmd = ['git', 'log', '--format=* %aN', since_date]

In [16]:
ncommits = len(check_output(cmd).splitlines())
all_authors = check_output(author_cmd).splitlines()
counter = Counter([regularize_identity(author) for author in all_authors])
#        global_counter += counter
#        counters.update({'.'.join((package,subpackage)): counter})
unique_authors = sorted(set(all_authors))

In [17]:
unique_authors

[b'* James Gaboardi',
 b'* Serge Rey',
 b'* Sergio Rey',
 b'* Wei Kang',
 b'* ljwolf']

In [18]:
counter.keys()

dict_keys(['Serge Rey', 'Levi John Wolf', 'James Gaboardi', 'Wei Kang'])

## Add in meta package diffs

In [19]:
from datetime import datetime, timedelta
ISO8601 = "%Y-%m-%dT%H:%M:%SZ"
PER_PAGE = 100
element_pat = re.compile(r'<(.+?)>')
rel_pat = re.compile(r'rel=[\'"](\w+)[\'"]')


In [20]:

def parse_link_header(headers):
    """Return a ``{rel: url}`` dict built from the ``link`` header.

    The ``<...>`` targets and the ``rel=...`` values are extracted
    independently and paired up by position; a missing header yields an
    empty dict.
    """
    link_value = headers.get('link', '')
    targets = element_pat.findall(link_value)
    relations = rel_pat.findall(link_value)
    return dict(zip(relations, targets))

def get_paged_request(url):
    """Get a full list, handling APIv3's paging.

    Fetches ``url``, extends the result list with the decoded JSON body,
    then keeps following the ``next`` relation from each response's link
    header until no further page is advertised.

    Parameters
    ----------
    url : str
        Address of the first page.

    Returns
    -------
    list
        The concatenated JSON items from every page.
    """
    results = []
    while url:
        f = urlopen(url)
        try:
            results.extend(json.load(f))
            links = parse_link_header(f.headers)
        finally:
            # Release the connection even when decoding fails, so a long
            # pagination run does not accumulate open responses.
            f.close()
        url = links.get('next')
    return results

def get_issues(project="pysal/pysal", state="closed", pulls=False):
    """Fetch every issue (or pull request, when *pulls* is true) of
    *project* in the given *state* from the GitHub API."""
    if pulls:
        which = 'pulls'
    else:
        which = 'issues'
    template = "https://api.github.com/repos/%s/%s?state=%s&per_page=%i"
    return get_paged_request(template % (project, which, state, PER_PAGE))


def _parse_datetime(s):
    """Convert a GitHub API timestamp string into a ``datetime``.

    A falsy value (``None`` or an empty string) maps to
    ``datetime.fromtimestamp(0)`` so that it still compares against
    real dates.
    """
    if not s:
        return datetime.fromtimestamp(0)
    return datetime.strptime(s, ISO8601)


def issues2dict(issues):
    """Index a list of issues by their ``'number'`` entry.

    When two issues share a number, the later one in the list wins.
    """
    return {entry['number']: entry for entry in issues}


def is_pull_request(issue):
    """Return True if the given issue is a pull request.

    GitHub marks a pull request returned by the issues endpoint with a
    non-empty ``pull_request`` object. The older ``pull_request_url`` key
    is still honoured so that records in that shape keep working.

    Parameters
    ----------
    issue : dict
        One decoded issue record.

    Returns
    -------
    bool
    """
    return bool(issue.get('pull_request')) or 'pull_request_url' in issue


def issues_closed_since(period=timedelta(days=365), project="pysal/pysal", pulls=False):
    """Get all issues closed since a particular point in time.

    Parameters
    ----------
    period : datetime or timedelta
        Either the cut-off itself, or a span of time counted back from
        the present. A datetime is used as given and formatted with a
        trailing ``Z`` -- presumably callers pass naive UTC values;
        confirm against the call sites.
    project : str
        Repository in ``owner/name`` form.
    pulls : bool
        Query the pulls endpoint instead of the issues endpoint, and keep
        only records whose ``merged_at`` is set.

    Returns
    -------
    list of dict
        Records whose ``closed_at`` is strictly later than the cut-off.
    """
    # Imported locally so this function does not rely on a file-level import.
    from datetime import timezone

    which = 'pulls' if pulls else 'issues'

    if isinstance(period, timedelta):
        # The cut-off is sent with a literal "Z" (UTC) suffix and compared
        # against parsed "...Z" closing times, so "now" has to be UTC rather
        # than local wall-clock time. tzinfo is dropped to keep comparisons
        # between naive datetimes.
        period = datetime.now(timezone.utc).replace(tzinfo=None) - period
    url = "https://api.github.com/repos/%s/%s?state=closed&sort=updated&since=%s&per_page=%i" % (project, which, period.strftime(ISO8601), PER_PAGE)
    allclosed = get_paged_request(url)
    filtered = [i for i in allclosed if _parse_datetime(i['closed_at']) > period]

    # exclude rejected PRs
    if pulls:
        filtered = [pr for pr in filtered if pr['merged_at']]

    # NOTE(review): with pulls=False the issues endpoint appears to return
    # pull requests as well; they are not filtered out here -- confirm
    # whether callers expect that.
    return filtered


def sorted_by_field(issues, field='closed_at', reverse=False):
    """Return a new list of issues ordered by the value stored under
    *field* (the closing date by default); the input is left untouched."""
    def field_value(issue):
        return issue[field]

    ordered = list(issues)
    ordered.sort(key=field_value, reverse=reverse)
    return ordered


def report(issues, show_urls=False):
    """Summary report about a list of issues, printing number and title.

    Parameters
    ----------
    issues : iterable of dict
        Records providing ``'number'`` and ``'title'``.
    show_urls : bool
        When true, each line is prefixed with a ``:ghpull:`` role if the
        record has a ``'merged_at'`` key, or ``:ghissue:`` otherwise.
    """
    for i in issues:
        title = i['title']
        if str is bytes:
            # Python 2 only: titles may contain non-ASCII characters, so
            # encode them before interpolating into a byte string. On
            # Python 3 encoding here would print a "b'...'" repr instead
            # of the title text.
            title = title.encode('utf-8')
        if show_urls:
            role = 'ghpull' if 'merged_at' in i else 'ghissue'
            print('* :%s:`%d`: %s' % (role, i['number'], title))
        else:
            print('* %d: %s' % (i['number'], title))



In [23]:
since

datetime.datetime(2019, 7, 31, 0, 0)

In [24]:
# Collect, for every package, the issues and pull requests closed since
# `since`, keeping both the raw records and per-package counts.
all_issues = {}  # package -> (issues + pulls, pulls) counts
all_pulls = {}  # initialised here but not filled in by this cell
total_commits = 0  # initialised here but not updated by this cell
issue_details = {}  # package -> list of issue records, newest closed first
pull_details = {}  # package -> list of pull records, newest closed first
for package in packages:
    subpackage = package
    # Whitespace-split of the package's version string; not used further
    # in this cell.
    subpackages = packages[package].split()
    print(package)
    prj = 'pysal/{subpackage}'.format(subpackage=package)
    # Return to the starting directory first so the relative path below
    # resolves the same way on every iteration.
    os.chdir(CWD)
    os.chdir('tmp/{subpackage}'.format(subpackage=package))
    issues = issues_closed_since(since, project=prj,pulls=False)
    pulls = issues_closed_since(since, project=prj,pulls=True)
    issues = sorted_by_field(issues, reverse=True)
    pulls = sorted_by_field(pulls, reverse=True)
    issue_details[subpackage] = issues
    pull_details[subpackage] = pulls
    n_issues, n_pulls = map(len, (issues, pulls))
    n_total = n_issues + n_pulls
    all_issues[subpackage] = n_total, n_pulls

# Leave the notebook in the directory it started from.
os.chdir(CWD)

libpysal
esda
giddy
inequality
pointpats
segregation
spaghetti
mgwr
spglm
spint
spreg
spvcm
tobler
mapclassify
splot


In [25]:
issue_details

{'libpysal': [{'url': 'https://api.github.com/repos/pysal/libpysal/issues/230',
   'repository_url': 'https://api.github.com/repos/pysal/libpysal',
   'labels_url': 'https://api.github.com/repos/pysal/libpysal/issues/230/labels{/name}',
   'comments_url': 'https://api.github.com/repos/pysal/libpysal/issues/230/comments',
   'events_url': 'https://api.github.com/repos/pysal/libpysal/issues/230/events',
   'html_url': 'https://github.com/pysal/libpysal/pull/230',
   'id': 545322537,
   'node_id': 'MDExOlB1bGxSZXF1ZXN0MzU5MjQwMTk5',
   'number': 230,
   'user': {'login': 'ljwolf',
    'id': 2250995,
    'node_id': 'MDQ6VXNlcjIyNTA5OTU=',
    'avatar_url': 'https://avatars3.githubusercontent.com/u/2250995?v=4',
    'gravatar_id': '',
    'url': 'https://api.github.com/users/ljwolf',
    'html_url': 'https://github.com/ljwolf',
    'followers_url': 'https://api.github.com/users/ljwolf/followers',
    'following_url': 'https://api.github.com/users/ljwolf/following{/other_user}',
    'gists_u

The issues are pulled since the last release date of the meta package.
However, each package going into the meta release has a specific release tag that pins the code making it into the release. We don't want to report commits made after a package's tag date, so we have to do some filtering here before building our change log statistics for the meta package.

For now, let's pickle the issue and pull records so we can filter them later without having to hit the GitHub API again.


In [26]:
import pickle 

# Write the fetched records to disk. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes.
with open("issue_details.p", "wb") as issue_file:
    pickle.dump(issue_details, issue_file)

with open("pull_details.p", "wb") as pull_file:
    pickle.dump(pull_details, pull_file)