In [1]:

import multiprocessing
import re
from multiprocessing import Pool
from src.lib import extract_semantic_version
import numpy as np
import pandas as pd
from IPython.core.display_functions import display

pd.set_option('display.max_columns', 100)

In [2]:
pypi_data = pd.read_json('../../data/repology/pypicache.json')
pypi_data

Unnamed: 0,info,last_serial,releases,urls,vulnerabilities
0,"{'author': 'Dmitry Berezovsky', 'author_email'...",8973353,"{'0.0.8': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '360c...",[]
1,"{'author': 'Abraham', 'author_email': 'abraham...",9481777,"{'1.0.0': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': 'dfef...",[]
2,"{'author': 'Sergei Minaev', 'author_email': 'i...",9764135,"{'0.5': [{'comment_text': '', 'digests': {'md5...","[{'comment_text': '', 'digests': {'md5': 'b612...",[]
3,{'author': 'Wolfgang Schnerring <wosc@wosc.de>...,11184293,"{'1.0.4': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '55ed...",[]
4,"{'author': 'Thea Barnes', 'author_email': 'the...",11644911,"{'0.0.1': [{'comment_text': '', 'digests': {'m...","[{'comment_text': '', 'digests': {'md5': '3613...",[]
...,...,...,...,...,...
169616,"{'author': 'Amazon Web Services', 'author_emai...",13647017,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': 'ae15...",[]
169617,"{'author': 'Amazon Web Services', 'author_emai...",13647009,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': '9bc9...",[]
169618,"{'author': '', 'author_email': '', 'bugtrack_u...",13625952,"{'0.0.23': [{'comment_text': '', 'digests': {'...","[{'comment_text': '', 'digests': {'md5': '0d0a...",[]
169619,"{'author': 'Amazon Web Services', 'author_emai...",13647157,"{'1.154.0': [{'comment_text': '', 'digests': {...","[{'comment_text': '', 'digests': {'md5': 'dde6...",[]


### Converting Info JSON to a DataFrame

In [3]:
# pypi_data['info'].map(lambda x: x['author'])
# df = pypi_data['info'].apply(pd.Series) --- Slow in general. For the given case, just as fast as json_normalize (without a set max level) as that attempts to unnest the entire json object. ~ 30 seconds
# df = pd.json_normalize(pypi_data['info']) --- Slow due to previously mentioned unnesting. ~ 30 seconds
# df = pd.json_normalize(pypi_data['info'], max_level=0) --- Faster. ~ 5 seconds
# df = pd.DataFrame(pypi_data['info'].values.tolist()) --- Fastest. ~ 0.5 seconds

info_df = pd.DataFrame(pypi_data['info'].values.tolist())
# display(info_df)
info_df = info_df[['name', 'version', 'requires_dist', 'author']]
# Rename headers to make it more readable
info_df.rename(columns={'requires_dist': 'dependency', }, inplace=True)
info_df = info_df.explode('dependency').reset_index(drop=True)
info_df

Unnamed: 0,name,version,dependency,author
0,ws-sizzle,0.0.8,json-rpc (>=1.13.0),Dmitry Berezovsky
1,ws-sizzle,0.0.8,pydantic (>=1.6),Dmitry Berezovsky
2,ws-sizzle,0.0.8,aiohttp ; extra == 'aiohttp',Dmitry Berezovsky
3,ws-sizzle,0.0.8,requests ; extra == 'requests-client',Dmitry Berezovsky
4,ws-sizzle,0.0.8,tornado ; extra == 'tornado',Dmitry Berezovsky
...,...,...,...,...
660143,aws-cdk.aws-s3-notifications,1.154.0,publication (>=0.0.3),Amazon Web Services
660144,peapy,1.1.0,pygame (==2.1.3.dev4),Silvan Schmidt
660145,peapy,1.1.0,keyboard,Silvan Schmidt
660146,peapy,1.1.0,mouse,Silvan Schmidt


In [None]:
# compiled_rx = re.compile(r'\((?P<version>(\w+|\.|=|!|>|<)*)\)')
# cores = multiprocessing.cpu_count()
#
# chunks = np.array_split(info_df['dependency'], cores)
# print("I made it here!!!")
#
# with Pool(cores) as pool:
#     processed = pd.concat(pool.map(extract_semantic_version, chunks), ignore_index=True)

# processed
# info_df.dependency.str.extract(compiled_rx)

I made it here!!!


## Converting Releases JSON to a DataFrame
Decided against using releases as they do not contain meaningful information. Most of the time they only contain the most recent version that can be recovered from the info JSON

In [46]:
# releases_df = pd.DataFrame(pypi_data['releases'].values.tolist()) --- Runs out of memory
# releases_df = pypi_data['releases'].map(lambda x: x.keys())

0           (0.0.8)
1           (1.0.0)
2             (0.5)
3           (1.0.4)
4           (0.0.1)
            ...    
169616    (1.154.0)
169617    (1.154.0)
169618     (0.0.23)
169619    (1.154.0)
169620      (1.1.0)
Name: releases, Length: 169621, dtype: object

In [11]:
# test = pypi_data['info'].apply(lambda el: json.loads(json.dumps(el)))
# test
pypi_data['info'][0]


{'author': 'Dmitry Berezovsky',
 'author_email': '',
 'bugtrack_url': None,
 'classifiers': ['Development Status :: 3 - Alpha',
  'Intended Audience :: Developers',
  'License :: OSI Approved :: MIT License',
  'Programming Language :: Python :: 3',
  'Programming Language :: Python :: 3.7',
  'Topic :: Internet',
  'Topic :: Internet :: WWW/HTTP',
  'Topic :: Software Development :: Libraries :: Python Modules'],
 'description_content_type': '',
 'docs_url': None,
 'download_url': '',
 'downloads': {'last_day': -1, 'last_month': -1, 'last_week': -1},
 'home_page': 'https://github.com/corvis/ws-sizzle',
 'keywords': 'json,json-rpc,rpc,jsonrpc,jsonrpc-websockets,bidirectional-jsonrpc',
 'license': 'MIT',
 'maintainer': '',
 'maintainer_email': '',
 'name': 'ws-sizzle',
 'package_url': 'https://pypi.org/project/ws-sizzle/',
 'platform': '',
 'project_url': 'https://pypi.org/project/ws-sizzle/',
 'project_urls': {'Homepage': 'https://github.com/corvis/ws-sizzle'},
 'release_url': 'https:/

In [12]:
pypi_data['info'].to_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [13]:
pypi_data_reduced = pd.read_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [21]:
# pypi_data_reduced.dropna(subset=['requires_dist'], inplace=True)
# pypi_data_reduced.reset_index(drop=True, inplace=True)
# # pypi_data_reduced['requires_dist'] = pypi_data_reduced['requires_dist'].apply(json.loads)
# pypi_data_reduced['requires_dist'] = [','.join(x) for x in pypi_data_reduced['requires_dist']]
# pypi_data_reduced[['name', 'requires_dist']]
# Select from pypi_data_reduced all the data that has name zzzzls-Spider
pypi_data_reduced[pypi_data_reduced['name'] == 'pandas']['requires_dist'].values[0]
# pypi_data_reduced['']

['python-dateutil (>=2.8.1)',
 'pytz (>=2020.1)',
 'numpy (>=1.18.5) ; platform_machine != "aarch64" and platform_machine != "arm64" and python_version < "3.10"',
 'numpy (>=1.19.2) ; platform_machine == "aarch64" and python_version < "3.10"',
 'numpy (>=1.20.0) ; platform_machine == "arm64" and python_version < "3.10"',
 'numpy (>=1.21.0) ; python_version >= "3.10"',
 "hypothesis (>=5.5.3) ; extra == 'test'",
 "pytest (>=6.0) ; extra == 'test'",
 "pytest-xdist (>=1.31) ; extra == 'test'"]

In [12]:
# with open('../../data/repology/pypicache.json', 'r') as file:
#     json_data = json.load(file)

In [13]:
# nested_json_data = pd.json_normalize(json_data, max_level=2)
# nested_json_data

In [14]:
chunks = pd.read_json('../../data/repology/pypicache.json', lines=True, chunksize=100000)

for chunk in chunks:
    display(chunk)

ValueError: Expected object or value