# Variant data

In [1]:
import pandas
import gc

%matplotlib inline

This notebook requires `data/variants.csv.gz` and, as such, needs to be executed after "Detect variants".
Running this notebook requires at least 16 GB of memory. It should be quite easy to adapt it so that 8 GB is enough.

The goal of this notebook is to collect and extract data from the various files located in `data-raw`. The extracted data are a subset restricted to mainlines, variants, and the packages and repositories that depend on them (so the files are smaller, and can be stored and distributed in `data/`).

### Load data

In [2]:
# Mainline/variant pairs; this file lives in `data/`, not `data-raw/`.
df_variants = pandas.read_csv('../data/variants.csv.gz')

In [3]:
# Preview the mainline/variant pairs (pandas elides the middle rows).
df_variants

Unnamed: 0,mainline,mainline_repo,mainline_repoid,variant,variant_repo,variant_repoid
0,wheat,creationix/wheat,162291,11zwheat,sun11/wheat,49882
1,wheat,creationix/wheat,162291,barley,frodare/barley,124697
2,keypair,juliangruber/keypair,110982,akeypair,quartzjer/akeypair,86500
3,keypair,juliangruber/keypair,110982,jh-keypair,johnhaley81/keypair,805497
4,sasl-digest-md5,jaredhanson/js-sasl-digest-md5,149511,alt-sasl-digest-md5,legastero/js-sasl-digest-md5,86665
...,...,...,...,...,...,...
12808,dot-values,bajankristof/dot-values,34049409,dot-values2,bluelovers/dot-values,41256794
12809,kompression,tuananh/kompression,30312975,@nivinjoseph/kompression,nivinjoseph/kompression,41256967
12810,contentful-typescript-codegen,intercom/contentful-typescript-codegen,39168489,@zeusdeux/contentful-typescript-codegen,zeusdeux/contentful-typescript-codegen,41257476
12811,prometheus-gc-stats,SimenB/node-prometheus-gc-stats,13589391,prometheus-gc-stats2,acifani/node-prometheus-gc-stats,41257504


In [4]:
# Raw package table: one row per package with license and repository info.
df_packages = pandas.read_csv('../data-raw/packages.csv.gz')

In [5]:
# Preview the raw package table (pandas elides the middle rows).
df_packages

Unnamed: 0,package,license,repository,repoid
0,0,BSD-2-Clause,,
1,001,,,
2,001_skt,,,
3,001_test,,,
4,007,MIT,https://github.com/btford/007,49873.0
...,...,...,...,...
1275077,urlparser-simple,MIT,https://github.com/dcmox/urlparser,
1275078,red-contrib-samsung-tv-control,MIT,https://github.com/Toxblh/node-red-contrib-sam...,41257604.0
1275079,ss_react_ts_ui,ISC,https://github.com/shjyy1983/ss_react_ts_ui,41257629.0
1275080,dropdown-act,,,


In [6]:
# Raw release table: one row per (package, version) with its date.
df_releases = pandas.read_csv('../data-raw/releases.csv.gz')

In [7]:
# Preview the raw release table (pandas elides the middle rows).
df_releases

Unnamed: 0,package,version,date
0,0,0.0.0,2014-04-01 22:51:11 UTC
1,001,0.0.1,2014-08-08 06:02:45 UTC
2,001_skt,0.0.1,2014-08-08 06:09:50 UTC
3,001_test,0.0.1,2014-08-08 06:36:13 UTC
4,007,0.0.0,2013-07-26 19:46:10 UTC
...,...,...,...
11400697,ss_react_ts_ui,1.0.2,2020-01-13 01:17:56 UTC
11400698,ss_react_ts_ui,1.0.3,2020-01-13 01:18:23 UTC
11400699,ss_react_ts_ui,1.0.5,2020-01-13 01:20:46 UTC
11400700,dropdown-act,0.2.0,2020-01-13 01:18:07 UTC


In [8]:
# Raw package-level dependencies (source release -> target package).
# This is by far the largest table loaded here.
df_dependencies = pandas.read_csv('../data-raw/dependencies.csv.gz')

In [9]:
# Preview the raw dependency table (pandas elides the middle rows).
df_dependencies

Unnamed: 0,source,version,kind,target,constraint
0,007,0.0.1,Development,should,~1.2.2
1,007,0.0.1,Development,mocha,~1.12.0
2,007,0.0.2,Development,should,~1.2.2
3,007,0.0.2,Development,mocha,~1.12.0
4,01,0.0.1,runtime,commander,1.0.x
...,...,...,...,...,...
154471982,dropdown-act,0.2.0,runtime,@progress/kendo-react-charts,^3.9.0
154471983,dropdown-act,0.2.0,runtime,@progress/kendo-react-buttons,^3.9.0
154471984,dropdown-act,0.2.0,runtime,@progress/kendo-react-animation,^3.9.0
154471985,dropdown-act,0.2.0,runtime,@progress/kendo-drawing,^1.6.0


In [10]:
# Raw repository table: one row per repository with its metadata.
df_repositories = pandas.read_csv('../data-raw/repositories.csv.gz')

In [11]:
# Preview the raw repository table (pandas elides the middle rows).
df_repositories

Unnamed: 0,host,repository,repoid,forks,stars,watchers,contributors,license,forked_from,created_at,collected_at,last_push
0,GitHub,brianmhunt/knockout-modal,1,0,7,2.0,1,MIT,,2014-09-15 01:21:34 UTC,2016-12-28 16:33:17 UTC,2016-12-18 18:31:32 UTC
1,GitHub,SteveSanderson/knockout.mapping,2,797,559,62.0,21,Other,,2010-11-01 09:27:43 UTC,2018-11-22 02:20:37 UTC,2017-06-21 22:54:45 UTC
2,GitHub,azman-co/knockout-model,3,0,1,1.0,5,,devco/knockup,2014-09-13 03:14:07 UTC,2017-03-18 22:40:02 UTC,2015-01-14 02:01:03 UTC
3,GitHub,zonuexe/aozora-ruby-parser.js,4,1,3,1.0,1,,,2014-12-27 21:02:09 UTC,2016-12-28 16:45:20 UTC,2015-01-07 18:04:42 UTC
4,GitHub,immense/knockout-pickatime,5,1,1,2.0,1,MIT,,2014-12-04 21:13:48 UTC,2018-12-21 11:30:13 UTC,2014-12-11 16:12:08 UTC
...,...,...,...,...,...,...,...,...,...,...,...,...
1209606,GitHub,sunnysingh/use-context-state,41257545,0,0,1.0,1,,,2020-01-12 23:53:51 UTC,2020-01-13 07:22:59 UTC,2020-01-12 23:54:05 UTC
1209607,GitHub,daywiss/utils,41257575,0,1,2.0,1,,,2019-12-22 18:31:00 UTC,2020-01-13 01:15:36 UTC,2020-01-13 01:01:47 UTC
1209608,GitHub,Toxblh/node-red-contrib-samsung-tv-control,41257604,0,0,1.0,1,MIT,,2020-01-13 00:56:01 UTC,2020-01-13 08:18:31 UTC,2020-01-13 01:54:36 UTC
1209609,GitHub,shjyy1983/ss_react_ts_ui,41257629,0,0,1.0,1,Apache-2.0,,2020-01-12 05:50:09 UTC,2020-01-13 01:31:08 UTC,2020-01-13 01:20:42 UTC


In [12]:
# Raw repository-level dependencies (repository -> target package).
df_repodeps = pandas.read_csv('../data-raw/repo_deps.csv.gz')

In [13]:
# Preview the raw repository dependency table (pandas elides the middle rows).
df_repodeps

Unnamed: 0,host,repository,repoid,kind,target,constraint
0,GitHub,brianmhunt/knockout-modal,1,development,gulp,^3.8.8
1,GitHub,brianmhunt/knockout-modal,1,development,gulp-autoprefixer,^1.0.0
2,GitHub,brianmhunt/knockout-modal,1,development,gulp-bump,^0.1.11
3,GitHub,brianmhunt/knockout-modal,1,development,gulp-connect,^2.0.6
4,GitHub,brianmhunt/knockout-modal,1,development,gulp-filter,^1.0.2
...,...,...,...,...,...,...
13048464,GitHub,protzi/dm,20842311,development,eslint,^3.16.0
13048465,GitHub,protzi/dm,20842311,development,eslint-config-airbnb,^14.1.0
13048466,GitHub,elsewares/sidr,20842312,development,grunt,~0.4.0
13048467,GitHub,luisvilches/luisvilches.cl,20842321,runtime,vue,^2.1.0


### Identify packages (mainlines & variants) and their dependents

In [14]:
# Names of every package that is either a mainline or one of its variants.
# `pandas.concat` is used because `Series.append` was deprecated in
# pandas 1.4 and removed in pandas 2.0.
variants = pandas.concat(
    [df_variants['mainline'], df_variants['variant']]
).drop_duplicates()

# Repository ids of those same mainlines and variants.
variant_repoid = pandas.concat(
    [df_variants['mainline_repoid'], df_variants['variant_repoid']]
).drop_duplicates()

In [15]:
# Number of distinct package names and of distinct repository ids.
variants.size, variant_repoid.size

(23219, 23219)

In [16]:
# Distinct packages having at least one dependency whose target is a
# mainline or a variant.
dependents = (
    df_dependencies
    .loc[df_dependencies['target'].isin(variants), 'source']
    .drop_duplicates()
)

In [17]:
# Number of distinct dependent packages.
dependents.size

345265

In [18]:
# Distinct repository ids having at least one dependency whose target is a
# mainline or a variant.
dependent_repo = (
    df_repodeps
    .loc[df_repodeps['target'].isin(variants), 'repoid']
    .drop_duplicates()
)

In [19]:
# Number of distinct dependent repositories.
dependent_repo.size

333889

### Extract data

In [20]:
# Keep only the packages that are a mainline, a variant, or a dependent of
# one of them, and write that subset to `data/`.
# `pandas.concat` replaces `Series.append`, which was removed in pandas 2.0.
(
    df_packages
    [lambda d: d['package'].isin(pandas.concat([variants, dependents]))]
    .to_csv('../data/packages.csv.gz', compression='gzip', index=False)
)

In [21]:
# Drop the full package table once its subset has been written, then ask the
# garbage collector to run; this notebook is memory-hungry.
del df_packages
gc.collect()

0

In [22]:
# Keep only the releases of packages that are a mainline, a variant, or a
# dependent of one of them, and write that subset to `data/`.
# `pandas.concat` replaces `Series.append`, which was removed in pandas 2.0.
(
    df_releases
    [lambda d: d['package'].isin(pandas.concat([variants, dependents]))]
    .to_csv('../data/releases.csv.gz', compression='gzip', index=False)
)

In [23]:
# Drop the full release table once its subset has been written, then ask the
# garbage collector to run.
del df_releases
gc.collect()

0

In [24]:
# Keep every dependency that starts from or points to a mainline/variant,
# and write that subset to `data/`.
(
    df_dependencies
    .loc[
        lambda deps: (
            deps['source'].isin(variants)
            | deps['target'].isin(variants)
        )
    ]
    .to_csv('../data/dependencies.csv.gz', compression='gzip', index=False)
)

In [25]:
# Drop the full dependency table once its subset has been written, then ask
# the garbage collector to run.
del df_dependencies
gc.collect()

0

In [26]:
# Keep only the repositories of mainlines/variants and the repositories that
# depend on them, and write that subset to `data/`.
# `pandas.concat` replaces `Series.append`, which was removed in pandas 2.0.
(
    df_repositories
    [lambda d: d['repoid'].isin(pandas.concat([variant_repoid, dependent_repo]))]
    .to_csv('../data/repositories.csv.gz', compression='gzip', index=False)
)

In [27]:
# Drop the full repository table once its subset has been written, then ask
# the garbage collector to run.
del df_repositories
gc.collect()

0

In [28]:
# Keep all dependencies of the repositories that depend on a
# mainline/variant, and write that subset to `data/`.
(
    df_repodeps
    .loc[lambda deps: deps['repoid'].isin(dependent_repo)]
    .to_csv('../data/repo_deps.csv.gz', compression='gzip', index=False)
)

In [29]:
# Drop the full repository dependency table once its subset has been written,
# then ask the garbage collector to run.
del df_repodeps
gc.collect()

0