# Licenses at fork time

This notebook computes licenses at fork time. It produces `data/licenses.csv.gz`.

In [1]:
import pandas

In [2]:
# Load the variants table from the shared data directory.
variants_path = '../data/variants.csv.gz'
df_variants = pandas.read_csv(variants_path)

In [3]:
# Load the repositories table, parsing `created_at` as a datetime column.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is already the default behaviour, so
# passing it only triggers a deprecation warning.
df_repo = pandas.read_csv(
    '../data/repositories.csv.gz',
    parse_dates=['created_at'],
)

In [4]:
# Look up, for every variant, the `created_at` value of the repository whose
# `repoid` equals the variant's `variant_repoid`; that value is exposed as
# `fork_time`. The left join keeps variants that have no matching repository.
repo_creation = df_repo[['repoid', 'created_at']]
variants_with_creation = df_variants.merge(
    repo_creation,
    how='left',
    left_on=['variant_repoid'],
    right_on=['repoid'],
)
variants_with_fork_time = variants_with_creation.rename(columns={'created_at': 'fork_time'})
df_data = variants_with_fork_time[['mainline', 'variant', 'fork_time']]

### License at fork time

In [5]:
# Join the two release tables on (package, version) and keep, for each release
# present in both, its date and its license.
df_licenses = (
    pandas.read_csv('../data/releases.csv.gz')
    .merge(
        pandas.read_csv('../data/releases_npm.csv.gz'),
        how='inner',
        on=['package', 'version'],
    )
    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default there), so it is no longer passed to `to_datetime`.
    .assign(date=lambda d: pandas.to_datetime(d['date']))
    [['package', 'version', 'date', 'license']]
)

In [8]:
def _license_near_fork(forks, releases, by, direction):
    """Attach the release closest to `fork_time` for the package named in `by`.

    Parameters
    ----------
    forks : DataFrame
        Must be sorted by `fork_time` and contain the column named by `by`.
    releases : DataFrame
        Must be sorted by `date` and contain `package`, `version`, `date`
        and `license`.
    by : str
        Column of `forks` holding the package name ('mainline' or 'variant').
    direction : {'backward', 'forward'}
        'backward' selects the last release at or before `fork_time`,
        'forward' the first release at or after it.

    Returns
    -------
    DataFrame
        `forks` plus two columns, `<by>_date_<when>_fork` and
        `<by>_license_<when>_fork`, where `<when>` is 'before' for
        'backward' and 'after' for 'forward'. Both are missing when no
        release matches.
    """
    when = {'backward': 'before', 'forward': 'after'}[direction]
    return (
        pandas.merge_asof(
            forks,
            releases,
            left_on='fork_time',
            right_on='date',
            left_by=by,
            right_by='package',
            direction=direction,
        )
        # The package name duplicates the `by` column and the version is not
        # part of the output.
        .drop(columns=['package', 'version'])
        .rename(columns={
            'date': f'{by}_date_{when}_fork',
            'license': f'{by}_license_{when}_fork',
        })
    )


# merge_asof requires both sides to be sorted on their time key; the releases
# are sorted once and reused for the three lookups.
releases_by_date = df_licenses.sort_values('date')

df_data = (
    df_data
    .sort_values('fork_time')
    # Mainline license before fork
    .pipe(_license_near_fork, releases_by_date, by='mainline', direction='backward')
    # Mainline license after fork
    .pipe(_license_near_fork, releases_by_date, by='mainline', direction='forward')
    # Variant license after fork
    .pipe(_license_near_fork, releases_by_date, by='variant', direction='forward')
)

In [10]:
# Persist the result as a gzip-compressed CSV, without the index column.
licenses_path = '../data/licenses.csv.gz'
df_data.to_csv(licenses_path, index=False, compression='gzip')