In [None]:
import pandas as pd
from pandas import DataFrame, Series

from src.lib import version_extractor, name_extractor

pd.set_option('display.max_columns', 100)

In [None]:
# Load the cached PyPI metadata dump into a DataFrame. Later cells read its
# `info` and `releases` columns.
pypi_data = pd.read_json(path_or_buf='../../data/repology/pypicache.json')
pypi_data

### Converting Info JSON to a DataFrame

In [None]:
# Ways of turning the `info` column (one JSON object per row) into a DataFrame, with rough timings:
#   pypi_data['info'].apply(pd.Series)                 --- ~30 seconds. Slow in general; for this data just as slow as
#                                                          json_normalize without max_level, which un-nests the entire JSON object.
#   pd.json_normalize(pypi_data['info'])               --- ~30 seconds. Slow because of the un-nesting mentioned above.
#   pd.json_normalize(pypi_data['info'], max_level=0)  --- ~5 seconds. Faster.
#   pd.DataFrame(pypi_data['info'].values.tolist())    --- ~0.5 seconds. Fastest, used below.
# A single field can also be pulled out directly, e.g. pypi_data['info'].map(lambda x: x['author'])

info_df = (
    pd.DataFrame(pypi_data['info'].values.tolist())
    # Keep only the fields used by the rest of the notebook
    .loc[:, ['name', 'version', 'requires_dist', 'author']]
    # Rename headers to make it more readable
    .rename(columns={'requires_dist': 'dependency'})
)

In [None]:
# Order rows by package name (ascending) and, within a package, by version (descending).
#
# The original row labels are deliberately kept (no `ignore_index=True`): the upload
# times are attached further down with `DataFrame.insert`, which aligns a Series to the
# frame by index label. Renumbering the rows here would pair every sorted row with the
# timestamp of whichever package originally sat at that position. The index is reset
# later, after the dependency lists are exploded.
#
# NOTE(review): `version` is compared as a plain value; if the versions are strings
# (as they appear to be), '10.0' sorts below '9.0' in this ordering.
sorted_df: DataFrame = info_df.sort_values(by=['name', 'version'], ascending=[True, False])
sorted_df

In [None]:
def extract_date_from_nested_releases_json(releases_json):
    """Return the upload time of the first file of the first release entry.

    Parameters
    ----------
    releases_json
        Expected to be a dict whose values are lists of file records, each
        record being a dict with an 'upload_time' key. Anything that is not a
        dict (e.g. None or NaN for a missing cell) is treated as "no data".

    Returns
    -------
    The 'upload_time' value of the first file record of the first entry in
    `releases_json`, or None when the input is not a dict, the dict is empty,
    or its first entry has no file records.

    Notes
    -----
    "First" means first in the dict's iteration order; no version comparison
    is done, so this is only the latest release if the dict is ordered that way.
    """
    if not isinstance(releases_json, dict):
        return None

    # Take the first value without building a list of all values; the default
    # of None also covers an empty dict (which used to raise IndexError).
    first_release_files = next(iter(releases_json.values()), None)
    if not first_release_files:
        return None

    return first_release_files[0]['upload_time']

In [None]:
# One upload timestamp (or None) per row of pypi_data, taken from its nested `releases` JSON.
# The resulting Series keeps pypi_data's index.
releases_column: Series = pypi_data['releases']
upload_time_series: Series = releases_column.map(extract_date_from_nested_releases_json)

In [None]:
# Place the upload time right after `version` (column position 2).
# NOTE(review): `insert` aligns the Series to sorted_df by index label, not by position,
# so the timestamps only match their packages if sorted_df's index still carries the
# same labels as pypi_data. Also note this cell mutates sorted_df and cannot be re-run
# without re-creating sorted_df first (the column would already exist).
sorted_df.insert(2, 'upload_time', upload_time_series)
sorted_df

In [None]:
# One row per (package, dependency): list-valued `dependency` cells are unpacked into
# separate rows and the index is renumbered from 0.
sorted_df = sorted_df.explode('dependency', ignore_index=True)
sorted_df

### Extracting information from the dependency string

In [None]:
# Split every raw dependency string into a version part and a name part using the
# project's extractor helpers.
# NOTE(review): rows without dependencies are passed to the extractors as missing
# values; presumably the helpers tolerate that — confirm in src.lib.
dependency_column = sorted_df['dependency']
dependency_version_series = dependency_column.apply(version_extractor)
dependency_name_series = dependency_column.apply(name_extractor)

# Used just for visual purposes
dependencies_df = pd.DataFrame({
    'dependency_name': dependency_name_series,
    'dependency_version': dependency_version_series,
})
dependencies_df

In [None]:
# Replace the raw dependency string with just the extracted name, and add the extracted
# version as a new column at position 4.
# This cell mutates sorted_df; re-running it raises because the column already exists.
sorted_df['dependency'] = dependency_name_series
sorted_df.insert(loc=4, column='dependency_version', value=dependency_version_series)

In [None]:
def convert_to_normalized_format(grouped_df: DataFrame):
    """Convert the rows of a single package into a nested, normalized dict.

    Parameters
    ----------
    grouped_df
        Rows belonging to one package (one row per dependency of one version).
        Must contain the columns 'name', 'version', 'upload_time', 'dependency'
        and 'dependency_version'.

    Returns
    -------
    dict
        ``{'name': <name>, 'versions': {<version>: {'timestamp': <upload_time>,
        'dependencies': {<dependency>: <dependency_version>, ...}}, ...}}``
        Versions appear in the order they first occur in `grouped_df`.
    """
    normalized_form = {
        # We know the name is the same for all rows
        'name': grouped_df['name'].iloc[0],
        'versions': {}
    }
    # Group by version so that each version only receives its own dependencies.
    # (Zipping over the whole package group would give every version the
    # dependencies of all versions.) sort=False keeps first-occurrence order;
    # dropna=False keeps a missing version as a key instead of dropping its rows.
    for version, version_rows in grouped_df.groupby('version', sort=False, dropna=False):
        normalized_form['versions'][version] = {
            # Last row wins, matching what a row-by-row overwrite would produce
            'timestamp': version_rows['upload_time'].iloc[-1],
            # If a dependency name repeats within a version, the last row wins
            'dependencies': dict(zip(version_rows['dependency'], version_rows['dependency_version'])),
        }

    return normalized_form


# Keep only fully populated rows; `dropna` returns a new frame, so sorted_df is untouched.
# NOTE(review): this drops a row if ANY column is missing — not only packages without
# dependencies, but also rows lacking an author, upload time or extracted version.
# Confirm that is intended.
normalized_df: DataFrame = sorted_df.dropna()

# One normalized dict per package name; the result is a Series indexed by name.
packages_by_name = normalized_df.groupby('name')
normalized_json_df = packages_by_name.apply(convert_to_normalized_format)
normalized_json_df

### Saving the processed data to file

In [None]:
# Flat table: one row per (package, version, dependency), written without the index column.
normalized_df.to_csv(path_or_buf='../../data/output/pypi-repology-dependencies.csv', index=False)
# Nested form: the per-package dicts written as a JSON array (orient='records').
normalized_json_df.to_json(path_or_buf='../../data/output/pypi-repology-dependencies.json', orient='records')

## Converting Releases JSON to a DataFrame
Decided against converting the full releases JSON into a DataFrame, as it adds little meaningful information: most of the time it only contains the most recent version, which can already be recovered from the info JSON. Only the upload time is extracted from it (see above).

In [None]:
# releases_df = pd.DataFrame(pypi_data['releases'].values.tolist()) --- Runs out of memory
# releases_df = pypi_data['releases'].map(lambda x: x.keys())

### Bits and bobs that were tinkered with but were scrapped

In [None]:
# test = pypi_data['info'].apply(lambda el: json.loads(json.dumps(el)))
# test
# pypi_data['info'][0]


In [None]:
# pypi_data['info'].to_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [None]:
# pypi_data_reduced = pd.read_json('../../data/repology/pypi_info.json', orient='records', lines=True)

In [None]:
# pypi_data_reduced.dropna(subset=['requires_dist'], inplace=True)
# pypi_data_reduced.reset_index(drop=True, inplace=True)
# # pypi_data_reduced['requires_dist'] = pypi_data_reduced['requires_dist'].apply(json.loads)
# pypi_data_reduced['requires_dist'] = [','.join(x) for x in pypi_data_reduced['requires_dist']]
# pypi_data_reduced[['name', 'requires_dist']]
# Select from pypi_data_reduced all the rows that have a given package name (here: pandas)
# pypi_data_reduced[pypi_data_reduced['name'] == 'pandas']['requires_dist'].values[0]
# pypi_data_reduced['']

In [None]:
# with open('../../data/repology/pypicache.json', 'r') as file:
#     json_data = json.load(file)

In [None]:
# nested_json_data = pd.json_normalize(json_data, max_level=2)
# nested_json_data

In [None]:
# chunks = pd.read_json('../../data/repology/pypicache.json', lines=True, chunksize=100000)
#
# for chunk in chunks:
#     display(chunk)

In [None]:
# Initial method for converting to a normalized format. Was hard to read, and it contained quite a few bugs. Decided it was better to rewrite it
# def convert_to_normalized_format(grouped_df: DataFrameGroupBy):
#     return_list = []
#     for _, rows in grouped_df:
#         inner_dict = {
#             'name': rows['name'].values[0],
#             'versions': {
#                 rows['version'].values[0]: {
#                     'timestamp': rows['upload_time'].values[0],
#                     'dependencies': {}
#                 }
#             }
#         }
#         for dep, v in zip(rows['dependency'].values, rows['dependency_version'].values):
#             inner_dict['versions'][rows['version'].values[0]]['dependencies'] |= {dep: v}
#         return_list.append(inner_dict)
#     return pd.DataFrame(return_list)

In [None]:
# normalized_df: DataFrame = info_df.copy().dropna()
# display(normalized_df.loc[normalized_df['name'] == '024travis-test024'])
# normalized_json_df = normalized_df.groupby('name').pipe(convert_to_normalized_format)
# normalized_json_df

In [None]:
# Attempt to use multiprocessing. Ended up not using it since I discovered that using built-in string methods is extremely fast compared to regex.
# cores = multiprocessing.cpu_count()
# chunks = np.array_split(info_df['dependency'], cores)
#
# with Pool(cores) as pool:
#     processed = pd.concat(pool.map(extract_semantic_version, chunks), ignore_index=True)

# processed
# info_df.dependency.str.extract(compiled_rx)