In [1]:
import psycopg2
import pandas as pd
import pandas.io.sql as sqlio
import datetime
import pytz
import re

In [2]:
try:
    conn = psycopg2.connect("dbname=swhgd-popular-3k-python user=postgres password=postgres")
except psycopg2.Error as e:
    print(type(e))
    print(e)

The authors of "Cheating Death: A Statistical Survival Analysis of Publicly Available Python Projects" state the overall they extracted the author identifier and date for each revision and release made on a project as well as the VCS used to host the project repository.

This translates fairly nicely to an SQL query because the SWHG has relations for both releases and revisions as shown in the scheme below:  
![](dataset-schema.png)

Lets wrangle all the data we need and we will do it in one query

In [12]:
query = """with rev as (select id as rev_id, author as rev_author, date as rev_date from revision),
rel as (select id as rel_id, target, author as rel_author,date as rel_date from release order by rel_date),
origins as (select ov.snapshot_id, o.type, o.url from origin_visit ov join origin o on ov.origin = o.id),
snapshots as (select s.object_id as s_id, sbs.snapshot_id, sbs.branch_id, sb.object_id, sb.target from snapshot s join snapshot_branches sbs on s.object_id = sbs.snapshot_id join snapshot_branch sb on sbs.branch_id = sb.object_id)
select rev_author, rev_date, rel_author, rel_date, type as VCS, origins.url as repo_url from (rev full join rel on rev.rev_id = rel.target full join snapshots on snapshots.target = rev_id full join origins on origins.snapshot_id = snapshots.s_id) where type is not null and url is not null"""
data = sqlio.read_sql_query(query, conn)
data

Unnamed: 0,rev_author,rev_date,rel_author,rel_date,vcs,repo_url
0,96.0,2013-05-12 10:40:53-07:00,96.0,2013-05-12 10:41:06-07:00,git,https://github.com/hylang/hy
1,96.0,2013-12-31 11:24:38-08:00,96.0,2013-12-31 11:25:02-08:00,git,https://github.com/hylang/hy
2,96.0,2013-07-06 16:34:01-07:00,96.0,2013-07-06 16:34:11-07:00,git,https://github.com/hylang/hy
3,28.0,2012-12-31 15:48:51-08:00,28.0,2012-12-31 15:49:00-08:00,git,https://github.com/hylang/hy
4,15.0,2013-03-29 17:38:02-07:00,15.0,2013-03-29 17:38:11-07:00,git,https://github.com/hylang/hy
...,...,...,...,...,...,...
9829591,,,,,deb,deb://Debian/packages/python-monotonic
9829592,,,,,deb,deb://Debian/packages/python-monotonic
9829593,,,,,deb,deb://Debian/packages/python-monotonic
9829594,,,,,deb,deb://Debian/packages/python-monotonic


Now we will narrow down the dataset so that it only includes revisions and releases between 2005 - January 2018

In [14]:
#generate the start and end date so we can query to find start_date <= date <= end_date
start_date = datetime.datetime(2005,1,1)
end_date = datetime.datetime(2018, 1, 31)
#change the naive datetime objects to aware ones
utc = pytz.timezone('UTC')
start_date = utc.localize(start_date)
end_date = utc.localize(end_date)
#query the dateframe to narrow down the dates
data = data.query('@start_date <= rev_date <= @end_date')

In [21]:
grouped = data.groupby('repo_url').agg({'rev_author': 'count', 'vcs': 'first'})
grouped

Unnamed: 0_level_0,rev_author,vcs
repo_url,Unnamed: 1_level_1,Unnamed: 2_level_1
deb://Debian/packages/accerciser,1192,deb
deb://Debian/packages/agtl,1147,deb
deb://Debian/packages/ajaxterm,1248,deb
deb://Debian/packages/alacarte,1242,deb
deb://Debian/packages/angrydd,1254,deb
...,...,...
https://pypi.org/project/youtube_dl/,798,pypi
https://pypi.org/project/zc.lockfile/,7,pypi
https://pypi.org/project/zope.component/,31,pypi
https://pypi.org/project/zope.deprecation/,13,pypi


In [11]:
data = data.assign(authorCount = data['rev_author'].nunique(),
                   hostType = data['vcs'],
                   project = data['repo_url'].str.findall("(.+)/(.+)(/*$)")[0][0][1])
data

Unnamed: 0,rev_author,rev_date,rel_author,rel_date,vcs,repo_url,authorCount,hostType,project
0,96.0,2013-05-12 10:40:53-07:00,96.0,2013-05-12 10:41:06-07:00,git,https://github.com/hylang/hy,76993,git,hy
1,96.0,2013-12-31 11:24:38-08:00,96.0,2013-12-31 11:25:02-08:00,git,https://github.com/hylang/hy,76993,git,hy
2,96.0,2013-07-06 16:34:01-07:00,96.0,2013-07-06 16:34:11-07:00,git,https://github.com/hylang/hy,76993,git,hy
3,28.0,2012-12-31 15:48:51-08:00,28.0,2012-12-31 15:49:00-08:00,git,https://github.com/hylang/hy,76993,git,hy
4,15.0,2013-03-29 17:38:02-07:00,15.0,2013-03-29 17:38:11-07:00,git,https://github.com/hylang/hy,76993,git,hy
...,...,...,...,...,...,...,...,...,...
9800348,401917.0,2011-01-27 08:05:17-08:00,,,pypi,https://pypi.org/project/croniter/,76993,pypi,hy
9800349,20771008.0,2010-01-26 21:30:56-08:00,,,pypi,https://pypi.org/project/croniter/,76993,pypi,hy
9800350,20825400.0,2017-08-31 00:43:50-07:00,,,pypi,https://pypi.org/project/croniter/,76993,pypi,hy
9800351,401917.0,2010-08-09 17:13:08-07:00,,,pypi,https://pypi.org/project/croniter/,76993,pypi,hy


for a dataframe containing a single project
majorReleases = (data.shape[0] - data['rel_date'].isna().sum() > 0)