In [1]:
import pandas as pd

# Raise pandas' 'display.max_columns' option to 100 so that wide DataFrames are rendered
# with up to 100 columns before being truncated.
pd.set_option('display.max_columns', 100)

In [2]:
# Read the pypicache.json file into a DataFrame. Later cells rely on its 'info' column,
# which holds one dict per row.
pypi_data = pd.read_json('../../data/repology/pypicache.json')
# A bare name on the last line of the cell renders the frame as the cell output.
pypi_data

Unnamed: 0,info,last_serial,releases,urls,vulnerabilities
0,"{'author': 'Dmitry Berezovsky', 'author_email'...",8973353,"{'0.0.8': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '360c...",[]
1,"{'author': 'Abraham', 'author_email': 'abraham...",9481777,"{'1.0.0': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': 'dfef...",[]
2,"{'author': 'Sergei Minaev', 'author_email': 'i...",9764135,"{'0.5': [{'comment_text': '', 'digests': {'md5...","[{'comment_text': '', 'digests': {'md5': 'b612...",[]
3,{'author': 'Wolfgang Schnerring <wosc@wosc.de>...,11184293,"{'1.0.4': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '55ed...",[]
4,"{'author': 'Thea Barnes', 'author_email': 'the...",11644911,"{'0.0.1': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '3613...",[]
...,...,...,...,...,...
169616,"{'author': 'Amazon Web Services', 'author_emai...",13647017,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': 'ae15...",[]
169617,"{'author': 'Amazon Web Services', 'author_emai...",13647009,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': '9bc9...",[]
169618,"{'author': '', 'author_email': '', 'bugtrack_u...",13625952,"{'0.0.23': [{'comment_text': '', 'digests': {'...","[{'comment_text': '', 'digests': {'md5': '0d0a...",[]
169619,"{'author': 'Amazon Web Services', 'author_emai...",13647157,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': 'dde6...",[]


### Converting Info JSON to a DataFrame

In [3]:
# Approaches tried for expanding the 'info' dicts into columns, with rough timings:
#   pypi_data['info'].apply(pd.Series)                 ~30 s  (slow in general)
#   pd.json_normalize(pypi_data['info'])               ~30 s  (attempts to unnest the entire object)
#   pd.json_normalize(pypi_data['info'], max_level=0)  ~5 s
#   pd.DataFrame(pypi_data['info'].values.tolist())    ~0.5 s (fastest; used below)
info_df = (
    pd.DataFrame(pypi_data['info'].values.tolist())[['name', 'version', 'requires_dist', 'author']]
    # Rename the header to make it more readable
    .rename(columns={'requires_dist': 'dependency'})
    # Spread each list of requirements over one row per requirement
    .explode('dependency')
    .reset_index(drop=True)
)
info_df

Unnamed: 0,name,version,dependency,author
0,ws-sizzle,0.0.8,json-rpc (>=1.13.0),Dmitry Berezovsky
1,ws-sizzle,0.0.8,pydantic (>=1.6),Dmitry Berezovsky
2,ws-sizzle,0.0.8,aiohttp ; extra == 'aiohttp',Dmitry Berezovsky
3,ws-sizzle,0.0.8,requests ; extra == 'requests-client',Dmitry Berezovsky
4,ws-sizzle,0.0.8,tornado ; extra == 'tornado',Dmitry Berezovsky
...,...,...,...,...
660143,aws-cdk.aws-s3-notifications,1.154.0,publication (>=0.0.3),Amazon Web Services
660144,peapy,1.1.0,pygame (==2.1.3.dev4),Silvan Schmidt
660145,peapy,1.1.0,keyboard,Silvan Schmidt
660146,peapy,1.1.0,mouse,Silvan Schmidt


### Methods for extracting information from the dependency string

In [4]:
def version_extractor(string: str) -> str:
    """Extract the version specifier from a requirement string.

    Handles both the parenthesised form (``'json-rpc (>=1.13.0)'``) and the bare
    PEP 508 form (``'json-rpc>=1.13.0'``). Any environment marker after ``;`` is
    discarded first, so parentheses inside a marker are never mistaken for a version.

    Args:
        string: A single requirement string. Missing values (``None``, NaN, or any
            other non-string) are accepted and treated as "no constraint".

    Returns:
        The version specifier without surrounding parentheses or whitespace, or
        ``'*'`` when there is no version constraint (including direct ``name @ url``
        references and empty/missing input).
    """
    # explode() can yield None/NaN for packages without dependencies; neither has str methods.
    if not isinstance(string, str):
        return '*'

    # Everything after the first ';' is an environment marker, not part of the version.
    requirement = string.split(';', 1)[0].strip()

    # Empty requirement, or a direct URL reference ("name @ https://...") which has no specifier.
    if not requirement or '@' in requirement:
        return '*'

    if '(' in requirement:
        # Parenthesised form: keep the text between the first '(' and the next ')'.
        inside = requirement.split('(', 1)[1].split(')', 1)[0].strip()
        return inside or '*'

    # Bare form: the specifier starts at the first comparison-operator character.
    # Plain string scanning is used on purpose (no regex) to keep this fast on large frames.
    for index, char in enumerate(requirement):
        if char in '<>=!~':
            return requirement[index:].strip()

    return '*'

def name_extractor(string: str) -> str:
    """Extract the dependency name from a requirement string.

    Everything from the first environment marker (``;``), parenthesised version
    (``(``), direct URL reference (``@``) or bare version operator (``<``, ``>``,
    ``=``, ``!``, ``~``) onwards is dropped. Extras such as ``requests[security]``
    are kept as part of the name.

    Args:
        string: A single requirement string. Missing values (``None``, NaN, or any
            other non-string) are accepted.

    Returns:
        The dependency name with surrounding whitespace removed, or the literal
        string ``'None'`` when the input is missing, empty or contains no name.
    """
    # explode() can yield None/NaN for packages without dependencies; neither has str methods.
    if not isinstance(string, str) or not string:
        return 'None'

    # Cut at the first structural separator; split() returns the whole string when it is absent.
    name = string
    for separator in (';', '(', '@'):
        name = name.split(separator, 1)[0]

    # Bare PEP 508 form ("numpy>=1.20"): the name ends where the version operator starts.
    # Plain string scanning is used on purpose (no regex) to keep this fast on large frames.
    for index, char in enumerate(name):
        if char in '<>=!~':
            name = name[:index]
            break

    return name.strip() or 'None'

In [5]:
# Split every raw requirement string into its name and its version specifier.
dependency_name_series = info_df['dependency'].apply(name_extractor)
dependency_version_series = info_df['dependency'].apply(version_extractor)

# Side-by-side preview of the two parsed series; used just for visual purposes.
dependencies_df = pd.DataFrame({
    'dependency_name': dependency_name_series,
    'dependency_version': dependency_version_series,
})
dependencies_df
# Attempt to use multiprocessing. Ended up not using it, since plain built-in string methods
# turned out to be extremely fast compared to regex.
# cores = multiprocessing.cpu_count()
# chunks = np.array_split(info_df['dependency'], cores)
#
# with Pool(cores) as pool:
#     processed = pd.concat(pool.map(extract_semantic_version, chunks), ignore_index=True)

# processed
# info_df.dependency.str.extract(compiled_rx)

Unnamed: 0,dependency_name,dependency_version
0,json-rpc,>=1.13.0
1,pydantic,>=1.6
2,aiohttp,*
3,requests,*
4,tornado,*
...,...,...
660143,publication,>=0.0.3
660144,pygame,==2.1.3.dev4
660145,keyboard,*
660146,mouse,*


In [6]:
# Replace the raw requirement strings with the parsed names and place the parsed version
# specifiers right after them. assign() plus an explicit column order is used instead of
# DataFrame.insert() so the cell can be re-run: insert() raises ValueError once the
# 'dependency_version' column already exists.
info_df = info_df.assign(
    dependency=dependency_name_series,
    dependency_version=dependency_version_series,
)[['name', 'version', 'dependency', 'dependency_version', 'author']]
info_df

Unnamed: 0,name,version,dependency,dependency_version,author
0,ws-sizzle,0.0.8,json-rpc,>=1.13.0,Dmitry Berezovsky
1,ws-sizzle,0.0.8,pydantic,>=1.6,Dmitry Berezovsky
2,ws-sizzle,0.0.8,aiohttp,*,Dmitry Berezovsky
3,ws-sizzle,0.0.8,requests,*,Dmitry Berezovsky
4,ws-sizzle,0.0.8,tornado,*,Dmitry Berezovsky
...,...,...,...,...,...
660143,aws-cdk.aws-s3-notifications,1.154.0,publication,>=0.0.3,Amazon Web Services
660144,peapy,1.1.0,pygame,==2.1.3.dev4,Silvan Schmidt
660145,peapy,1.1.0,keyboard,*,Silvan Schmidt
660146,peapy,1.1.0,mouse,*,Silvan Schmidt


### Saving the processed data to a file

In [9]:
# Write the processed dependency table to disk in two formats.
# JSON: orient='records' emits a list of row objects, which carries no index.
info_df.to_json('../../data/output/dependencies.json', orient='records')
# NOTE(review): to_csv is called with its default index=True, so the CSV gets an extra
# unnamed index column that the JSON output lacks — confirm this is intended.
info_df.to_csv('../../data/output/dependencies.csv')

## Converting Releases JSON to a DataFrame
Decided against using the releases data, as it does not contain meaningful additional information: most of the time it only contains the most recent version, which can already be recovered from the info JSON.

In [None]:
# releases_df = pd.DataFrame(pypi_data['releases'].values.tolist()) --- Runs out of memory
# releases_df = pypi_data['releases'].map(lambda x: x.keys())

### Bits and bobs that were tinkered with but were scrapped

In [None]:
# test = pypi_data['info'].apply(lambda el: json.loads(json.dumps(el)))
# test
# pypi_data['info'][0]


In [None]:
# pypi_data['info'].to_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [None]:
# pypi_data_reduced = pd.read_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [None]:
# pypi_data_reduced.dropna(subset=['requires_dist'], inplace=True)
# pypi_data_reduced.reset_index(drop=True, inplace=True)
# # pypi_data_reduced['requires_dist'] = pypi_data_reduced['requires_dist'].apply(json.loads)
# pypi_data_reduced['requires_dist'] = [','.join(x) for x in pypi_data_reduced['requires_dist']]
# pypi_data_reduced[['name', 'requires_dist']]
# Select from pypi_data_reduced the requires_dist value of the row whose name is 'pandas'
# pypi_data_reduced[pypi_data_reduced['name'] == 'pandas']['requires_dist'].values[0]
# pypi_data_reduced['']

In [None]:
# with open('../../data/repology/pypicache.json', 'r') as file:
#     json_data = json.load(file)

In [None]:
# nested_json_data = pd.json_normalize(json_data, max_level=2)
# nested_json_data

In [None]:
# chunks = pd.read_json('../../data/repology/pypicache.json', lines=True, chunksize=100000)
#
# for chunk in chunks:
#     display(chunk)