In [None]:
import json
import codecs
import csv
import pandas as pd

NEO_VER_CSV = 'data/out/neo_ver.csv'
NEO_OUT_JSON = 'data/input/records.json'
CROSS_CSV = 'ne4out.csv'
df = pd.read_csv(NEO_VER_CSV)

def get_csv_data(file_path):
    with open(file_path) as f:
        data = csv.reader(f)
    return data

jdata = json.load(codecs.open(NEO_OUT_JSON, 'r', 'utf-8-sig'))

In [34]:
# Single example of getting a dataframe with packages that have same repo url

dfs = []
df2 = df[df['pkg_name'] == 'glib']
df2 = df2[~df2['url'].isnull()]
pkgmans = df2['pkgman'].unique()
for pkgman in pkgmans:
    dfs.append(df2[df2['pkgman'] == pkgman]) # Only drops repo duplicated because duplicate e.g. NaN in homepage would drop two different repos

master_df = pd.concat(dfs)
df_repo = master_df[master_df.duplicated(subset='url', keep=False)]
df_repo

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,url
102530,version62888cargo,glib,cargo,0.3.0,Version,https://github.com/gtk-rs/gtk-rs-core
102531,version157792cargo,glib,cargo,0.8.0,Version,https://github.com/gtk-rs/gtk-rs-core
102532,version203315cargo,glib,cargo,0.9.1,Version,https://github.com/gtk-rs/gtk-rs-core
102533,version17029cargo,glib,cargo,0.0.5,Version,https://github.com/gtk-rs/gtk-rs-core
102534,version416599cargo,glib,cargo,0.14.4,Version,https://github.com/gtk-rs/gtk-rs-core
...,...,...,...,...,...,...
17428076,version16696003npm,glib,npm,1.0.1,Version,git://github.com/bleupen/hapi-logger.git
17428077,version16696004npm,glib,npm,1.0.2,Version,git://github.com/bleupen/hapi-logger.git
17428078,version16696005npm,glib,npm,1.1.0,Version,git://github.com/bleupen/hapi-logger.git
17428079,version16696006npm,glib,npm,1.2.0,Version,git://github.com/bleupen/hapi-logger.git


In [31]:

rhdf = master_df[master_df.duplicated(subset='url', keep=False)]
rhdf = rhdf[~rhdf['url'].isnull()]
rhdf

Unnamed: 0,:ID,pkg_name,pkgman,version,:LABEL,url
102530,version62888cargo,glib,cargo,0.3.0,Version,https://github.com/gtk-rs/gtk-rs-core
102531,version157792cargo,glib,cargo,0.8.0,Version,https://github.com/gtk-rs/gtk-rs-core
102532,version203315cargo,glib,cargo,0.9.1,Version,https://github.com/gtk-rs/gtk-rs-core
102533,version17029cargo,glib,cargo,0.0.5,Version,https://github.com/gtk-rs/gtk-rs-core
102534,version416599cargo,glib,cargo,0.14.4,Version,https://github.com/gtk-rs/gtk-rs-core
...,...,...,...,...,...,...
17428076,version16696003npm,glib,npm,1.0.1,Version,git://github.com/bleupen/hapi-logger.git
17428077,version16696004npm,glib,npm,1.0.2,Version,git://github.com/bleupen/hapi-logger.git
17428078,version16696005npm,glib,npm,1.1.0,Version,git://github.com/bleupen/hapi-logger.git
17428079,version16696006npm,glib,npm,1.2.0,Version,git://github.com/bleupen/hapi-logger.git


In [None]:
hrdf = master_df[master_df["homepage"].isin(master_df["repo"]) + master_df["repo"].isin(master_df["homepage"])]
hrdf

In [None]:
pd.concat([df_repo, rhdf, hrdf]).drop_duplicates()

In [38]:
import math

# Example that iterates the records.json
result = []
TOTAL = len(jdata)
CURRENT = 0
PROGRESS = 0

for r in jdata:

    # Get name pkg and create initial list for dataframes
    name = r["name"]
    frames = []

    CURRENT += 1

    if PROGRESS < math.ceil((CURRENT/TOTAL)*100):
        PROGRESS = math.ceil((CURRENT/TOTAL)*100)
        print(f"Processing {PROGRESS}%  | {CURRENT}/{TOTAL}")

    # For each pkg(name), iterate the pkg managers 
    for pkgman in r["pkgmans"]:

        # Get df from NEO_VER for that pkg and remove rows with NaN in 'repo'
        ndf = df[df['pkg_name'] == name]
        ndf = ndf[~ndf['url'].isnull()]

        # Get df without duplicates and append to dataframes list
        frames.append(ndf[ndf['pkgman'] == pkgman].drop_duplicates('url'))

    # Concat all frames and keep only duplicated on repo field. Duplicated would be those who appear < 1
    mdf = pd.concat(frames)
    udf = mdf[mdf.duplicated(subset='url', keep=False)]
    mdf = udf.drop_duplicates()

    # Only add if merged dataframe is not empty
    if not mdf.empty:
        result.append(mdf)
        
fin = pd.concat(result)
fin.to_csv("out/neo4jout3.csv", encoding='utf-8', index=False)

Processing 13%  | 1/8
Processing 25%  | 2/8
Processing 38%  | 3/8
Processing 50%  | 4/8
Processing 63%  | 5/8
Processing 75%  | 6/8
Processing 88%  | 7/8
Processing 100%  | 8/8


In [2]:
CROSS_CSV = 'ne4out.csv'
df = pd.read_csv(CROSS_CSV)
df = df[df.duplicated(subset=['pkg_name'], keep=False)] # remove packages that occur once, but is a hit because repo and homepage are the same

df = df[df.duplicated(subset=['repo'], keep=False)]



#df 
df.to_csv('out/non_unique_repo.csv', encoding='utf-8', index=False)


In [15]:
nu_df = pd.read_csv('non_unique_repo.csv')
un_df = nu_df['pkg_name'].unique()
len(un_df) # 993 unique pkgs


993

In [16]:
gp_df = nu_df.groupby(['pkgman'])
gp_df.size()
# alire           1
# cargo         153
# chromebrew    354
# conan         453
# homebrew      602
# luarocks        7
# nimble          3
# npm            98
# vcpkg         449


pkgman
alire           1
nimble          3
luarocks        7
npm            98
cargo         153
chromebrew    354
vcpkg         449
conan         453
homebrew      602
dtype: int64

In [51]:
def get_pkgmans(pkgman):
    nimble_pkgs = nu_df[nu_df['pkgman'] == pkgman]['pkg_name'].to_list()
    unique_pkgmans = nu_df[nu_df['pkg_name'].isin(nimble_pkgs)]['pkgman'].unique()
    unique_pkgmans_list = unique_pkgmans.tolist()
    unique_pkgmans_list.remove(pkgman)
    unique_pkgmans_list.sort()
    return unique_pkgmans_list

print(get_pkgmans('alire'))
print(get_pkgmans('nimble'))
print(get_pkgmans('luarocks'))
print(get_pkgmans('npm'))
print(get_pkgmans('cargo'))
print(get_pkgmans('chromebrew'))
print(get_pkgmans('vcpkg'))
print(get_pkgmans('conan'))
print(get_pkgmans('homebrew'))


['chromebrew', 'vcpkg']
['cargo', 'conan', 'luarocks', 'vcpkg']
['cargo', 'conan', 'homebrew', 'nimble', 'npm', 'vcpkg']
['cargo', 'conan', 'homebrew', 'luarocks', 'vcpkg']
['chromebrew', 'conan', 'homebrew', 'luarocks', 'nimble', 'npm', 'vcpkg']
['alire', 'cargo', 'conan', 'homebrew', 'vcpkg']
['alire', 'cargo', 'chromebrew', 'conan', 'homebrew', 'luarocks', 'nimble', 'npm']
['cargo', 'chromebrew', 'homebrew', 'luarocks', 'nimble', 'npm', 'vcpkg']
['cargo', 'chromebrew', 'conan', 'luarocks', 'npm', 'vcpkg']


ValueError: list.remove(x): x not in list