# Licenses at fork time

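This notebook computes, for each mainline/variant pair, the license of the mainline just before and just after the fork, and the license of the variant after the fork. It produces `data/licenses.csv.gz`.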

In [1]:
import pandas

In [2]:
# Mainline/variant pairs; later cells read the `mainline`, `variant` and
# `variant_repoid` columns from this table.
df_variants = pandas.read_csv(
    filepath_or_buffer='../data/variants.csv.gz',
)

In [3]:
# Repository table; `created_at` is parsed as a datetime because a later cell
# uses it (renamed to `fork_time`) as the key of the as-of merges.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where consistent format inference is the default behaviour.
df_repo = pandas.read_csv(
    '../data/repositories.csv.gz',
    parse_dates=['created_at'],
)

In [4]:
# Give every mainline/variant pair a `fork_time`: the creation timestamp of
# the variant's repository. The left join keeps pairs whose repository is
# not listed in `df_repo` (their `fork_time` is then missing).
df_data = (
    pandas.merge(
        df_variants,
        df_repo.loc[:, ['repoid', 'created_at']],
        how='left',
        left_on='variant_repoid',
        right_on='repoid',
    )
    .rename(columns={'created_at': 'fork_time'})
    .loc[:, ['mainline', 'variant', 'fork_time']]
)

### License at fork time

In [5]:
# One row per release with its date and license. The two release tables are
# joined on (package, version); the inner join keeps only releases that
# appear in both files.
df_licenses = (
    pandas.read_csv('../data/releases.csv.gz')
    .merge(
        pandas.read_csv('../data/releases_npm.csv.gz'),
        how='inner',
        on=['package', 'version'],
    )
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0, where consistent format inference is the default.
    .assign(date=lambda d: pandas.to_datetime(d['date']))
    [['package', 'version', 'date', 'license']]
)

In [6]:
def _add_license_near_fork(df, licenses, who, direction):
    """Attach the release of ``who`` that is closest to ``fork_time``.

    Parameters
    ----------
    df : pandas.DataFrame
        Pairs sorted by ``fork_time``; must contain the column named ``who``.
    licenses : pandas.DataFrame
        Releases sorted by ``date`` with columns ``package``, ``version``,
        ``date`` and ``license``.
    who : str
        ``'mainline'`` or ``'variant'``: the column of ``df`` matched against
        ``licenses['package']``, also used as prefix of the new columns.
    direction : str
        ``'backward'`` selects the latest release at or before the fork,
        ``'forward'`` the earliest release at or after it.

    Returns
    -------
    pandas.DataFrame
        ``df`` with two extra columns, ``{who}_date_{when}_fork`` and
        ``{who}_license_{when}_fork``, where ``when`` is ``before`` for
        ``'backward'`` and ``after`` for ``'forward'``. Rows without a
        matching release get missing values.
    """
    when = {'backward': 'before', 'forward': 'after'}[direction]
    return (
        pandas.merge_asof(
            df,
            licenses,
            left_on='fork_time',
            right_on='date',
            left_by=who,
            right_by='package',
            direction=direction,
        )
        # Only the release date and license are kept from the release table.
        .drop(columns=['package', 'version'])
        .rename(columns={
            'date': f'{who}_date_{when}_fork',
            'license': f'{who}_license_{when}_fork',
        })
    )


# merge_asof needs both sides sorted on their key; sort the releases once
# instead of once per lookup.
_licenses_by_date = df_licenses.sort_values('date')

df_data = (
    # Start again from the three base columns so that re-running this cell
    # does not stack duplicate license columns on an already-enriched frame.
    df_data[['mainline', 'variant', 'fork_time']]
    .sort_values('fork_time')
    # Mainline license before fork
    .pipe(_add_license_near_fork, _licenses_by_date, 'mainline', 'backward')
    # Mainline license after fork
    .pipe(_add_license_near_fork, _licenses_by_date, 'mainline', 'forward')
    # Variant license after fork
    .pipe(_add_license_near_fork, _licenses_by_date, 'variant', 'forward')
)

In [8]:
# Preview the enriched table; pandas truncates the rendering to the first and
# last rows of the frame.
df_data

Unnamed: 0,mainline,variant,fork_time,mainline_date_before_fork,mainline_license_before_fork,mainline_date_after_fork,mainline_license_after_fork,variant_date_after_fork,variant_license_after_fork
0,nodelint,lint,2010-10-20 21:55:15,NaT,,2011-09-15 23:16:08,type,2011-01-03 15:26:10,MIT
1,scraper,wsscraper,2011-01-09 21:35:17,2011-01-02 11:55:54,,2011-02-05 13:24:14,,2011-01-12 13:58:28,
2,step,stepup,2011-01-17 20:10:14,NaT,,2011-02-21 20:52:21,,2011-10-01 00:34:40,
3,ain,ain-tcp,2011-03-03 20:23:20,NaT,,2015-01-30 08:07:20,MIT,2011-03-09 19:03:31,MIT
4,juggernaut,juggernaut-yoomee,2011-03-10 16:19:22,NaT,,2011-03-23 21:34:41,MIT,2011-04-13 10:25:15,MIT
...,...,...,...,...,...,...,...,...,...
12808,homebridge-camera-ffmpeg,homebridge-camera-ffmpeg-rest-motion,2020-01-12 10:59:24,2019-11-01 01:39:30,ISC,NaT,,2020-01-12 11:44:31,ISC
12809,dot-values,dot-values2,2020-01-12 11:29:33,2019-07-23 09:27:05,MIT,NaT,,2020-01-12 12:50:57,MIT
12810,kompression,@nivinjoseph/kompression,2020-01-12 13:40:55,2019-06-28 04:14:47,MIT,NaT,,2020-01-12 14:00:04,MIT
12811,homebridge-http-garage,homebridge-garage-remote-http,2020-01-12 16:34:47,2019-10-22 20:14:54,MIT,NaT,,2020-01-12 16:58:24,MIT


In [7]:
# Persist the enriched table as a gzip-compressed CSV, without the row index.
df_data.to_csv(
    path_or_buf='../data/licenses.csv.gz',
    index=False,
    compression='gzip',
)