In [1]:
from pathlib import Path

import pandas.io.sql as sqlio
import psycopg2 as psql

# Connect to and Query the Database

In [2]:
# Open the PostgreSQL connection that the query cells in this notebook use.
# NOTE(review): database name and credentials are hard-coded here; consider
# reading them from environment variables before sharing the notebook.
try:
    conn = psql.connect(dbname="swhgd-popular-3k-python", user="postgres", password="postgres")
except psql.Error as e:
    # Show what went wrong, then re-raise: swallowing the error would leave
    # `conn` undefined and surface later as an unrelated NameError.
    print(type(e))
    print(e)
    raise

![Dataset schema](dataset-schema.png)

In [3]:
# Per-origin activity summary for revisions dated inside the observation
# window [2005-05-09, 2018-01-01): distinct author count, a 0/1 flag for
# whether any release row was joined, first/last revision date, and a
# revision row count.
#
# The upper bound is exclusive because the censoring query treats revisions
# dated '2018-01-01' or later as post-window activity; an inclusive bound
# would count a revision at exactly that instant on both sides.
#
# ORDER BY makes DISTINCT ON deterministic: without it PostgreSQL may keep an
# arbitrary (url, type) group when one URL appears with several host types.
#
# NOTE(review): rev_count is COUNT(rev.date) over the joined rows, so a
# revision that matches several branches/snapshots/visits is presumably
# counted more than once -- confirm whether COUNT(DISTINCT rev.id) is intended.
query = """SELECT DISTINCT ON (o.url) o.url as url,
o.type AS host_type,
COUNT(DISTINCT rev.author) as author_count,
CASE WHEN COUNT(rel.id) >= 1 THEN 1 ELSE 0 END AS major_releases,
MAX(rev.date) as end_date,
MIN(rev.date) as start_date,
COUNT(rev.date) as rev_count
FROM revision rev
FULL JOIN release rel on rev.id = rel.target
FULL JOIN snapshot_branch s_b ON rev.id = s_b.target
FULL JOIN snapshot_branches s_bs ON s_bs.branch_id = s_b.object_id
FULL JOIN snapshot s ON s.object_id = s_bs.snapshot_id
FULL JOIN origin_visit ov ON ov.snapshot_id = s.object_id
FULL JOIN origin o on o.id = ov.origin
WHERE (rev.date >= '2005-05-09'::DATE AND rev.date < '2018-01-01'::DATE)
GROUP BY o.url, o.type
ORDER BY o.url, o.type;"""
data = sqlio.read_sql_query(query, conn)
# Drop every row that has a NULL in any column (e.g. groups with no origin URL).
data = data.dropna(how='any',axis=0)
data

Unnamed: 0,url,host_type,author_count,major_releases,end_date,start_date,rev_count
0,deb://Debian/packages/accerciser,deb,4,0,2017-12-13 05:26:58-08:00,2012-04-18 10:06:21-07:00,1192
1,deb://Debian/packages/agtl,deb,2,0,2015-08-19 13:36:54-07:00,2011-01-22 04:55:12-08:00,1147
2,deb://Debian/packages/ajaxterm,deb,2,0,2015-05-31 10:43:09-07:00,2011-08-19 07:31:06-07:00,1248
3,deb://Debian/packages/alacarte,deb,3,0,2017-12-18 18:44:02-08:00,2012-06-29 03:43:50-07:00,1242
4,deb://Debian/packages/angrydd,deb,3,0,2016-04-24 12:35:20-07:00,2008-10-19 05:19:40-07:00,1254
...,...,...,...,...,...,...,...
2080,https://pypi.org/project/youtube_dl/,pypi,2,0,2017-12-30 13:32:44-08:00,2013-01-19 16:34:50-08:00,794
2081,https://pypi.org/project/zc.lockfile/,pypi,2,0,2016-06-19 09:27:26-07:00,2007-07-18 04:27:21-07:00,7
2082,https://pypi.org/project/zope.component/,pypi,3,0,2017-09-26 04:17:58-07:00,2007-02-19 06:16:09-08:00,31
2083,https://pypi.org/project/zope.deprecation/,pypi,2,0,2017-08-07 12:24:01-07:00,2007-02-18 13:55:54-08:00,13


In [4]:
# Origins that have at least one revision dated on or after 2018-01-01.
# Each such URL is returned once with a constant `censored` value of 1.
# NOTE(review): the `release` join is not referenced by SELECT or WHERE here;
# it looks like a leftover from the summary query -- confirm before removing.
query = (
    "SELECT o.url as url, 1 AS censored\n"
    "FROM revision rev \n"
    "FULL JOIN release rel on rev.id = rel.target\n"
    "FULL JOIN snapshot_branch s_b ON rev.id = s_b.target \n"
    "FULL JOIN snapshot_branches s_bs ON s_bs.branch_id = s_b.object_id\n"
    "FULL JOIN snapshot s ON s.object_id = s_bs.snapshot_id\n"
    "FULL JOIN origin_visit ov ON ov.snapshot_id = s.object_id\n"
    "FULL JOIN origin o on o.id = ov.origin\n"
    "WHERE ('2018-01-01'::DATE <= rev.date)\n"
    "GROUP BY o.url"
)
# Run the query and discard any row containing a NULL (e.g. a NULL url group).
censored = sqlio.read_sql_query(query, conn).dropna(how='any', axis=0)
censored

Unnamed: 0,url,censored
0,deb://Debian/packages/anki,1
1,deb://Debian/packages/ansible,1
2,deb://Debian/packages/apt-listchanges,1
3,deb://Debian/packages/aptfs,1
4,deb://Debian/packages/aqemu,1
...,...,...
1364,https://pypi.org/project/xgboost/,1
1365,https://pypi.org/project/yamllint/,1
1366,https://pypi.org/project/youtube_dl/,1
1367,https://pypi.org/project/zc.lockfile/,1


In [5]:
# Combine the per-origin summary with the censoring flag.
#  - outer merge on `url` keeps URLs from either frame;
#  - URLs absent from `censored` get censored = 0;
#  - only rows with rev_count > 1 are kept, which also removes rows whose
#    rev_count is NaN because the URL appears only in `censored`.
sql_data = (
    data
    .merge(censored, how='outer', on='url')
    .fillna(value={"censored": 0})
    .query('rev_count > 1')
)

In [6]:
# Cast the count columns to integers and the two 0/1 indicator columns to
# booleans (the preceding merge/fillna step presumably left them as floats).
sql_data = sql_data.astype({
    'author_count': int,
    'major_releases': bool,
    'rev_count': int,
    'censored': bool,
})

In [7]:
# Quick overview of the final table, displayed as one tuple:
# release-flag counts, host-type counts, censoring counts, and the number of
# origins with more than 20 distinct authors.
(
    sql_data["major_releases"].value_counts(),
    sql_data["host_type"].value_counts(),
    sql_data['censored'].value_counts(),
    sql_data.loc[sql_data["author_count"] > 20, "author_count"].count(),
)

(False    1840
 True      226
 Name: major_releases, dtype: int64,
 deb     964
 git     739
 pypi    363
 Name: host_type, dtype: int64,
 True     1291
 False     775
 Name: censored, dtype: int64,
 515)

In [8]:
# Persist the merged per-origin table as CSV (without the index column).
# Create the target directory first so the cell also works on a fresh
# checkout where `data/` does not exist yet.
output_path = Path('data/sql_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
sql_data.to_csv(output_path, index=False)

In [9]:
# Earliest and latest revision date in the whole `revision` table,
# with no date filter applied.
query = (
    "SELECT MIN(rev.date), MAX(rev.date)\n"
    "FROM revision rev;"
)
date_range = sqlio.read_sql_query(sql=query, con=conn)
date_range

Unnamed: 0,min,max
0,1980-01-20 22:17:31+00:00,2019-03-22 21:06:44+00:00
