In [196]:
import json
import codecs
import csv
import pandas as pd

NEO_VER_CSV = 'data/out/neo_ver_nonpm.csv'
NEO_OUT_JSON = 'data/input/records-full.json'
df = pd.read_csv(NEO_VER_CSV)

def get_csv_data(file_path):
    with open(file_path) as f:
        data = csv.reader(f)
    return data

jdata = json.load(codecs.open(NEO_OUT_JSON, 'r', 'utf-8-sig'))

In [157]:
# Single example of getting a dataframe with packages that have same repo url

dfs = []
df2 = df[df['pkg_name'] == 'glib']
df2 = df2[~df2['repo'].isnull()]
pkgmans = df2['pkgman'].unique()
for pkgman in pkgmans:
    dfs.append(df2[df2['pkgman'] == pkgman]) # Only drops repo duplicated because duplicate e.g. NaN in homepage would drop two different repos

master_df = pd.concat(dfs)
df_repo = master_df[master_df.duplicated(subset='repo', keep=False)]
df_repo

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,homepage,repo
192520,version17029cargo,glib,cargo,0.0.5,Version,https://github.com/gtk-rs/gtk-rs-core,https://developer.gnome.org/glib
192521,version177941cargo,glib,cargo,0.8.2,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/
192522,version135106cargo,glib,cargo,0.7.0,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/
192523,version63643cargo,glib,cargo,0.3.1,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/
192524,version18630cargo,glib,cargo,0.0.7,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/
...,...,...,...,...,...,...,...
17316106,version16588286npm,glib,npm,1.0.2,Version,git://github.com/bleupen/hapi-logger.git,https://github.com/bleupen/hapi-logger
17316107,version16588287npm,glib,npm,1.1.0,Version,git://github.com/bleupen/hapi-logger.git,https://github.com/bleupen/hapi-logger
17316108,version16588288npm,glib,npm,1.2.0,Version,git://github.com/bleupen/hapi-logger.git,https://github.com/bleupen/hapi-logger
17316109,version16588289npm,glib,npm,1.3.0,Version,git://github.com/bleupen/hapi-logger.git,https://github.com/bleupen/hapi-logger


In [143]:

rhdf = master_df[master_df.duplicated(subset='homepage', keep=False)]
rhdf = rhdf[~rhdf['homepage'].isnull()]
rhdf

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,homepage,repo
192520,version17029cargo,glib,cargo,0.0.5,Version,https://github.com/gtk-rs/gtk-rs-core,https://developer.gnome.org/glib
192521,version177941cargo,glib,cargo,0.8.2,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/


In [145]:
hrdf = master_df[master_df["homepage"].isin(master_df["repo"]) + master_df["repo"].isin(master_df["homepage"])]
hrdf

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,homepage,repo
534170,version205chromebrew,glib,chromebrew,2.72.1,Version,https://gitlab.gnome.org/GNOME/glib,https://developer.gnome.org/glib
704803,version1140conan,glib,conan,2.72.1,Version,,https://gitlab.gnome.org/GNOME/glib


In [147]:
pd.concat([df_repo, rhdf, hrdf]).drop_duplicates()

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,homepage,repo
192520,version17029cargo,glib,cargo,0.0.5,Version,https://github.com/gtk-rs/gtk-rs-core,https://developer.gnome.org/glib
534170,version205chromebrew,glib,chromebrew,2.72.1,Version,https://gitlab.gnome.org/GNOME/glib,https://developer.gnome.org/glib
709199,version1875homebrew,glib,homebrew,2.72.1,Version,https://download.gnome.org/sources/glib/2.72/g...,https://developer.gnome.org/glib/
25850731,version625vcpkg,glib,vcpkg,2.70.1,Version,,https://developer.gnome.org/glib/
192521,version177941cargo,glib,cargo,0.8.2,Version,https://github.com/gtk-rs/gtk-rs-core,https://gtk-rs.org/
704803,version1140conan,glib,conan,2.72.1,Version,,https://gitlab.gnome.org/GNOME/glib


In [195]:
import math

# Example that iterates the records.json
result = []
TOTAL = len(jdata)
CURRENT = 0
PROGRESS = 0

for r in jdata:

    # Get name pkg and create initial list for dataframes
    name = r["name"]
    frames = []

    CURRENT += 1

    if PROGRESS < math.ceil((CURRENT/TOTAL)*100):
        PROGRESS = math.ceil((CURRENT/TOTAL)*100)
        print(f"Processing {PROGRESS}%  | {CURRENT}/{TOTAL}")

    # For each pkg(name), iterate the pkg managers 
    for pkgman in r["pkgmans"]:

        # Get df from NEO_VER for that pkg and remove rows with NaN in 'repo'
        ndf = df[df['pkg_name'] == name]
        ndf = ndf[~ndf['repo'].isnull()]

        # Get df without duplicates and append to dataframes list
        frames.append(ndf[ndf['pkgman'] == pkgman].drop_duplicates('repo').drop_duplicates('homepage'))

    # Concat all frames and keep only duplicated on repo field. Duplicated would be those who appear < 1
    mdf = pd.concat(frames)
    rdf = mdf[mdf.duplicated(subset='repo', keep=False)]
    hdf = mdf[mdf.duplicated(subset='homepage', keep=False)]
    hdf = hdf[~hdf['homepage'].isnull()]
    hrdf = mdf[mdf["homepage"].isin(mdf["repo"]) + mdf["repo"].isin(mdf["homepage"])]
    mdf = pd.concat([rdf,hdf,hrdf]).drop_duplicates()

    # Only add if merged dataframe is not empty
    if not mdf.empty:
        result.append(mdf)
        
fin = pd.concat(result)
fin.to_csv("ne4out_nonpm.csv", encoding='utf-8', index=False)

Processing 50%  | 1/2
Processing 100%  | 2/2
