### Load Initial Libraries and Datasets

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import altair as alt
alt.renderers.enable('mimetype')
alt.data_transformers.disable_max_rows()
import networkx as nx
import os

In [2]:
# user_df = pd.read_csv("../../datasets/entity_files/users_dataset.csv")
# repo_df = pd.read_csv("../../datasets/large_files/entity_files/repos_dataset.csv", low_memory=False)
# org_df = pd.read_csv("../../datasets/entity_files/orgs_dataset.csv", low_memory=False)

# Core users / repos / orgs from each collection pass. Every table gets an
# ``origin`` column naming the pass it was loaded from.
initial_core_users = pd.read_csv(
    "../../datasets/derived_files/initial_core_users.csv"
).assign(origin='initial_core')
initial_core_repos = pd.read_csv(
    "../../datasets/derived_files/initial_core_repos.csv"
).assign(origin='initial_core')
initial_core_orgs = pd.read_csv(
    "../../datasets/derived_files/initial_core_orgs.csv"
).assign(origin='initial_core')

firstpass_core_users = pd.read_csv(
    "../../datasets/derived_files/firstpass_core_users.csv"
).assign(origin='firstpass_core')
firstpass_core_repos = pd.read_csv(
    "../../datasets/derived_files/firstpass_core_repos.csv"
).assign(origin='firstpass_core')
firstpass_core_orgs = pd.read_csv(
    "../../datasets/derived_files/firstpass_core_orgs.csv"
).assign(origin='firstpass_core')

finalpass_core_users = pd.read_csv(
    "../../datasets/derived_files/finalpass_core_users.csv"
).assign(origin='finalpass_core')
# The final-pass repo table lives under large_files and is read leniently:
# malformed lines are skipped instead of aborting the load.
finalpass_core_repos = pd.read_csv(
    "../../datasets/large_files/derived_files/finalpass_core_repos.csv",
    low_memory=False,
    on_bad_lines='skip',
).assign(origin='finalpass_core')
finalpass_core_orgs = pd.read_csv(
    "../../datasets/derived_files/finalpass_core_orgs.csv"
).assign(origin='finalpass_core')

  firstpass_core_repos = pd.read_csv("../../datasets/derived_files/firstpass_core_repos.csv")


In [3]:
# Stack the three passes of each entity kind into one table; the ``origin``
# column keeps track of which pass a row came from.
core_users, core_repos, core_orgs = (
    pd.concat(per_pass_frames)
    for per_pass_frames in (
        [initial_core_users, firstpass_core_users, finalpass_core_users],
        [initial_core_repos, firstpass_core_repos, finalpass_core_repos],
        [initial_core_orgs, firstpass_core_orgs, finalpass_core_orgs],
    )
)

### Process Network Connections

- user-user interactions:
  - org_members
  - user_orgs
  - user_followers
  - user_following

- user-repo interactions:
  - user_repos
  - contributors_repos
  - starrers_repos
  - subscribers_repos
  - forkers_repos
  - committers_repos
  - 

In [4]:
def _describe_join_file(directory, file_name):
    """Summarise one join CSV: entity type, source/target columns, path and row count."""
    print(file_name)
    path = os.path.join(directory, file_name)
    df = pd.read_csv(path)
    # Candidate identifier columns: anything naming a login or a (repo) full name.
    cols = [col for col in df.columns if any(x in col for x in ['full_name', 'login'])]
    # Drop dotted (nested) columns, except the ones containing 'user.login'.
    cols = [col for col in cols if ('.' not in col) or ('user.login' in col)]
    if len(cols) > 2:
        cols = ['user.login', 'repo_full_name']
    if len(cols) == 1:
        cols = cols + ['author.login']
    if not cols:
        # Same exception type the bare ``cols[1]`` lookup would raise, but naming the file.
        raise IndexError(f"no login/full_name identifier columns found in {path}")
    return {
        'entity_type': file_name.split("_")[0],
        # Files with 'forks' in their name always use the owner's login as source.
        'source': cols[0] if 'forks' not in file_name else "owner.login",
        'target': cols[1],
        'file_name': path,
        'file_length': len(df),
    }


def process_network_connections(join_dirs=None):
    """Describe every join CSV found under the join-file directories.

    Parameters
    ----------
    join_dirs : iterable of (str, iterable of str), optional
        ``(root_directory, excluded_substrings)`` pairs. Each root is walked
        recursively; a ``.csv`` file is skipped when its name contains any of
        the excluded substrings. Defaults to the two dataset directories used
        by this notebook: "search" files are excluded everywhere and
        "commits" files are additionally excluded under ``large_files``.

    Returns
    -------
    list of dict
        One dict per file with the keys ``entity_type``, ``source``,
        ``target``, ``file_name`` and ``file_length``, in walk order.

    Raises
    ------
    IndexError
        If a file has no column that qualifies as a source/target identifier.
    """
    if join_dirs is None:
        join_dirs = (
            ("../../datasets/join_files/", ("search",)),
            ("../../datasets/large_files/join_files/", ("search", "commits")),
        )

    dfs = []
    for root, excluded in join_dirs:
        # ``directory`` rather than ``dir`` so the builtin is not shadowed.
        for directory, _subdirs, files in os.walk(root):
            for f in files:
                if f.endswith(".csv") and not any(token in f for token in excluded):
                    dfs.append(_describe_join_file(directory, f))
    return dfs

In [5]:
# Human-readable description for each join file, keyed by the file's path as a
# string. get_bipartite_connections maps the ``file_name`` column through this
# dict, so paths missing here end up with a missing description.
name_mapping = {
    '../../datasets/join_files/org_members_join_dataset.csv': 'Members that are part of orgs',
    '../../datasets/join_files/org_followers_join_dataset.csv': 'Followers of orgs',
    '../../datasets/join_files/repo_subscribers_join_dataset.csv': 'Subscribers of repos',
    '../../datasets/join_files/repo_orgs_join_dataset.csv': 'Orgs of repos',
    '../../datasets/large_files/join_files/org_repos_join_dataset.csv': 'Repos that belong to orgs',
    '../../datasets/large_files/join_files/user_repos_join_dataset.csv': 'Repos that belong to users',
    '../../datasets/large_files/join_files/user_following_join_dataset.csv': 'Users that are following other users',
    '../../datasets/large_files/join_files/repo_comments_join_dataset.csv': 'Comments from repos',
    '../../datasets/large_files/join_files/user_subscriptions_join_dataset.csv': 'Subscriptions of users',
    '../../datasets/large_files/join_files/issues_comments_join_dataset.csv': 'Comments from repo issues',
    '../../datasets/large_files/join_files/repo_subscribers_join_dataset.csv': 'Subscribers of repos',
    '../../datasets/large_files/join_files/user_followers_join_dataset.csv': 'Followers of users',
    '../../datasets/large_files/join_files/user_starred_join_dataset.csv': 'Starred repos of users',
    '../../datasets/large_files/join_files/repo_stargazers_join_dataset.csv': 'Stargazers of repos',
    '../../datasets/large_files/join_files/repo_contributors_join_dataset.csv': 'Contributors to repos',
    '../../datasets/large_files/join_files/repo_issues_join_dataset.csv': 'Issues from repos',
    '../../datasets/large_files/join_files/repo_pulls_join_dataset.csv': 'Pull requests from repos',
    '../../datasets/large_files/join_files/user_orgs_join_dataset.csv': 'Orgs of users',
    '../../datasets/large_files/join_files/pulls_comments_join_dataset.csv': 'Comments from repo pull requests',
    '../../datasets/large_files/join_files/repo_forks_join_dataset.csv': 'Forks of repos',
}
def get_bipartite_connections(cols_df):
    """Label every join file with the pair of entity kinds it connects.

    Adds two columns to ``cols_df`` in place and returns the same frame:

    * ``network_type`` -- ``"<first>_<second>"`` derived from the row's
      ``target``, ``source`` and ``file_name``; rows matching no rule keep
      ``None``.
    * ``descriptive_file_name`` -- ``file_name`` looked up in the module-level
      ``name_mapping``.
    """
    def _counterpart(record):
        # A path containing "orgs" links to organisations; otherwise a
        # login-like source column means users and anything else repos.
        if 'orgs' in record.file_name:
            return 'org'
        return 'user' if 'login' in record.source else 'repo'

    cols_df['network_type'] = None
    for idx, record in cols_df.iterrows():
        label = None
        if record.target == 'org_login':
            # Org targets test the source for an exact 'login' match and
            # ignore the file name.
            label = 'org' + "_" + ('user' if record.source == 'login' else 'repo')
        elif record.target == 'repo_full_name':
            label = 'repo' + "_" + _counterpart(record)
        elif record.target == 'user_login':
            label = 'user' + "_" + _counterpart(record)
        # A repo-named source column takes precedence over any target rule.
        if record.source == 'repo_full_name':
            label = "repo_user"
        if label is not None:
            cols_df.loc[idx, 'network_type'] = label

    cols_df['descriptive_file_name'] = cols_df['file_name'].map(name_mapping)
    return cols_df


In [6]:
def get_descriptive_name(file_name):
    """Turn a join-file path into a short title built from its first two name parts.

    Takes the text after the last '/', keeps the first two '_'-separated
    pieces and title-cases them, e.g. ``".../org_members_join_dataset.csv"``
    becomes ``"Org Members"``.
    """
    base_name = file_name.rsplit('/', 1)[-1]
    leading_parts = base_name.split('_', 2)[:2]
    return ' '.join(leading_parts).title()

In [7]:
def get_network_connections():
    """Return the per-file summary table, building and caching it on first use.

    If ``file_totals.csv`` already exists in the derived-files directory it is
    read back as-is. Otherwise the join files are scanned, labelled with their
    network type and a short title, the number of files per network type is
    printed, and the table is written to that CSV before being returned.
    """
    if os.path.exists("../../datasets/derived_files/file_totals.csv"):
        return pd.read_csv("../../datasets/derived_files/file_totals.csv")

    file_records = process_network_connections()
    # The networks start out bipartite; they are projected later to look for communities.
    cols_df = get_bipartite_connections(pd.DataFrame(file_records))
    # Short title derived from the first two parts of each file name.
    cols_df['short_file_name'] = cols_df['file_name'].apply(get_descriptive_name)
    files_per_network = cols_df.groupby('network_type').size()
    print(files_per_network)
    cols_df.to_csv("../../datasets/derived_files/file_totals.csv", index=False)
    return cols_df

In [8]:
# Summary table with one row per join file (read from the cached CSV when it exists).
cols_df = get_network_connections()

In [16]:
# Point paths recorded under the old "../data/" prefix at the current dataset
# directory. regex=False makes this a literal substring replacement: treated
# as a regular expression, the dots in "../data/" would match any character.
cols_df.file_name = cols_df.file_name.str.replace("../data/", "../../datasets/", regex=False)

  cols_df.file_name = cols_df.file_name.str.replace("../data/", "../../datasets/")


In [18]:
# Seed interaction_type with the machine-readable network_type
# ("<kind>_<kind>"); update_interaction_type rewrites it into a display label.
cols_df['interaction_type'] = cols_df.network_type
# Display name for each entity kind that can appear in a network_type.
user_mapping = {
    'user': 'User',
    'repo': 'Repository',
    'org': 'Organization',

}
def update_interaction_type(row):
    """Rewrite a row's ``interaction_type`` from ``"a_b"`` into a display label.

    The first two '_'-separated parts are translated through the module-level
    ``user_mapping`` and combined as ``"<A> - <B> Interaction"``. The row is
    modified and returned, so the function can be used with
    ``DataFrame.apply(..., axis=1)``.
    """
    kinds = row.interaction_type.split("_")
    first_key, second_key = kinds[0], kinds[1]
    display_names = [user_mapping[first_key], user_mapping[second_key]]
    row['interaction_type'] = " - ".join(display_names) + " Interaction"
    return row

# Row-wise pass that replaces each interaction_type code with its display label.
cols_df = cols_df.apply(update_interaction_type, axis=1)

In [22]:
# Export the per-file description, interaction label and row count, without the index.
cols_df[['descriptive_file_name', 'interaction_type', 'file_length']].to_csv("../../datasets/derived_files/interaction_totals.csv", index=False)

In [38]:
def _origin_subsets(core_df, id_column):
    """Return ``[(origin, ids)]``: the ``id_column`` values of ``core_df`` for each origin."""
    return [
        (origin, core_df[core_df.origin == origin][id_column])
        for origin in core_df.origin.unique().tolist()
    ]


def _count_edges(df, filter_column, allowed_ids, group_columns, row, origin):
    """Count the pairs in ``group_columns`` among rows whose ``filter_column`` is allowed.

    The resulting edge list (pair columns plus ``counts``) is tagged with the
    metadata of the join file described by ``row`` and with ``origin``.
    """
    kept = df[df[filter_column].isin(allowed_ids)]
    edges = kept.groupby(group_columns).size().reset_index(name='counts')
    edges['entity_type'] = row['entity_type']
    edges['file_path'] = row['file_name']
    edges['file_length'] = row['file_length']
    edges['origin'] = origin
    edges['descriptive_file_name'] = row['descriptive_file_name']
    edges['short_file_name'] = row['short_file_name']
    edges['network_type'] = row['network_type']
    # The original column names are recorded even when a column was renamed to 'login'.
    edges['target_type'] = row['target']
    edges['source_type'] = row['source']
    return edges


def load_network_data(cols_df):
    """Build per-network edge lists restricted to the core entities.

    For every join file listed in ``cols_df`` the CSV is read, filtered to the
    rows that reference a core org / repo / user (once per ``origin`` value of
    the matching core table), and aggregated into pair counts. Each aggregated
    frame is appended to the list for the row's ``network_type``.

    Reads the module-level ``core_orgs``, ``core_repos`` and ``core_users``
    frames, and prints one line per processed file.

    Parameters
    ----------
    cols_df : pandas.DataFrame
        Needs the columns ``file_name``, ``target``, ``source``,
        ``network_type``, ``entity_type``, ``file_length``,
        ``descriptive_file_name`` and ``short_file_name``.

    Returns
    -------
    tuple of list
        Seven lists of DataFrames, in the order ``org_user``, ``repo_user``,
        ``repo_org``, ``org_repo``, ``user_repo``, ``user_user``, ``user_org``.
    """
    network_names = [
        'org_user', 'repo_user', 'repo_org', 'org_repo',
        'user_repo', 'user_user', 'user_org',
    ]
    mapping_networks = {name: [] for name in network_names}

    # The core subsets do not depend on the join file, so compute them once.
    org_subsets = _origin_subsets(core_orgs, 'login')
    repo_subsets = _origin_subsets(core_repos, 'full_name')
    user_subsets = _origin_subsets(core_users, 'login')

    for _, row in cols_df.iterrows():
        df = pd.read_csv(row['file_name'], low_memory=False)
        target_type = row.target
        source_type = row.source
        print(target_type, source_type, row.file_name, row.network_type)

        if 'org' in target_type:
            for origin, org_logins in org_subsets:
                mapping_networks[row.network_type].append(
                    _count_edges(df, target_type, org_logins, [target_type, source_type], row, origin)
                )

        if ('repo_full_name' in target_type) or ('repo_full_name' in source_type):
            # Stargazer files carry the repo name in the source column; every
            # other repo file is filtered on its target column.
            if row.short_file_name == "Repo Stargazers":
                repo_column, partner_column = source_type, target_type
            else:
                repo_column, partner_column = target_type, source_type
            repo_df = df
            if partner_column in ('user.login', 'owner.login'):
                # Normalise nested login columns so all repo edge lists share 'login'.
                repo_df = df.rename(columns={partner_column: 'login'})
                partner_column = 'login'
            # Fix: the stargazer fallback used to group on the repo column
            # twice; it now groups on the partner column like the other files.
            for origin, repo_names in repo_subsets:
                mapping_networks[row.network_type].append(
                    _count_edges(repo_df, repo_column, repo_names, [repo_column, partner_column], row, origin)
                )

        if 'user_login' in target_type:
            for origin, user_logins in user_subsets:
                mapping_networks[row.network_type].append(
                    _count_edges(df, target_type, user_logins, [target_type, source_type], row, origin)
                )

    return tuple(mapping_networks[name] for name in network_names)


In [42]:
# Build the seven per-network lists of edge frames; one line is printed per join file.
org_user_dfs, repo_user_dfs, repo_org_dfs, org_repo_dfs, user_repo_dfs, user_user_dfs, user_org_dfs   = load_network_data(cols_df)
# combined_repos_dfs = pd.concat(repos_dfs)
# combined_orgs_dfs = pd.concat(orgs_dfs)
# combined_users_dfs = pd.concat(users_dfs)

org_login login ../../datasets/join_files/org_members_join_dataset.csv org_user
org_login login ../../datasets/join_files/org_followers_join_dataset.csv org_user
repo_full_name login ../../datasets/join_files/repo_subscribers_join_dataset.csv repo_user
repo_full_name login ../../datasets/join_files/repo_orgs_join_dataset.csv repo_org
org_login full_name ../../datasets/large_files/join_files/org_repos_join_dataset.csv org_repo
user_login full_name ../../datasets/large_files/join_files/user_repos_join_dataset.csv user_repo
user_login login ../../datasets/large_files/join_files/user_following_join_dataset.csv user_user
repo_full_name user.login ../../datasets/large_files/join_files/repo_comments_join_dataset.csv repo_user
user_login full_name ../../datasets/large_files/join_files/user_subscriptions_join_dataset.csv user_repo
repo_full_name user.login ../../datasets/large_files/join_files/issues_comments_join_dataset.csv repo_user
repo_full_name login ../../datasets/large_files/join_files/

In [44]:
test = pd.concat(org_user_dfs)
test

Unnamed: 0,org_login,login,counts,entity_type,file_path,file_length,origin,descriptive_file_name,short_file_name,network_type,target_type,source_type
0,ABC-DH,digitalkoine,1,org,../../datasets/join_files/org_members_join_dat...,22480,initial_core,Members that are part of orgs,Org Members,org_user,org_login,login
1,ADHO,ColeDCrawford,1,org,../../datasets/join_files/org_members_join_dat...,22480,initial_core,Members that are part of orgs,Org Members,org_user,org_login,login
2,ADHO,briancroxall,1,org,../../datasets/join_files/org_members_join_dat...,22480,initial_core,Members that are part of orgs,Org Members,org_user,org_login,login
3,ADHO,simonwiles,1,org,../../datasets/join_files/org_members_join_dat...,22480,initial_core,Members that are part of orgs,Org Members,org_user,org_login,login
4,BCDH,ttasovac,1,org,../../datasets/join_files/org_members_join_dat...,22480,initial_core,Members that are part of orgs,Org Members,org_user,org_login,login
...,...,...,...,...,...,...,...,...,...,...,...,...
469,sna-unipi,gatto,2,org,../../datasets/join_files/org_followers_join_d...,5108,finalpass_core,Followers of orgs,Org Followers,org_user,org_login,login
470,sna-unipi,lyereth,2,org,../../datasets/join_files/org_followers_join_d...,5108,finalpass_core,Followers of orgs,Org Followers,org_user,org_login,login
471,sna-unipi,shenjiaxing,2,org,../../datasets/join_files/org_followers_join_d...,5108,finalpass_core,Followers of orgs,Org Followers,org_user,org_login,login
472,sna-unipi,zaffo1,2,org,../../datasets/join_files/org_followers_join_d...,5108,finalpass_core,Followers of orgs,Org Followers,org_user,org_login,login


In [49]:
dict1 = test[test.short_file_name == "Org Followers"][0:1].to_dict()
dict2 = test[test.short_file_name == "Org Members"][0:1].to_dict()

combined_dicts = [dict1, dict2]
combined_dicts

[{'org_login': {0: 'ABC-DH'},
  'login': {0: 'digitalkoine'},
  'counts': {0: 2},
  'entity_type': {0: 'org'},
  'file_path': {0: '../../datasets/join_files/org_followers_join_dataset.csv'},
  'file_length': {0: 5108},
  'origin': {0: 'initial_core'},
  'descriptive_file_name': {0: 'Followers of orgs'},
  'short_file_name': {0: 'Org Followers'},
  'network_type': {0: 'org_user'},
  'target_type': {0: 'org_login'},
  'source_type': {0: 'login'}},
 {'org_login': {0: 'ABC-DH'},
  'login': {0: 'digitalkoine'},
  'counts': {0: 1},
  'entity_type': {0: 'org'},
  'file_path': {0: '../../datasets/join_files/org_members_join_dataset.csv'},
  'file_length': {0: 22480},
  'origin': {0: 'initial_core'},
  'descriptive_file_name': {0: 'Members that are part of orgs'},
  'short_file_name': {0: 'Org Members'},
  'network_type': {0: 'org_user'},
  'target_type': {0: 'org_login'},
  'source_type': {0: 'login'}}]

In [None]:
def build_networks(cols_df):
    """Unfinished draft of the network builder.

    NOTE(review): this notebook defines ``build_networks`` again in a later
    cell, and that definition replaces this one once it runs. This draft has
    no ``return`` statement, so it always returns ``None``.
    """
    # Cache check uses the "../data/" prefix, unlike the "../../datasets/" paths
    # used by the loading cells -- NOTE(review): confirm which location is current.
    if os.path.exists("../data/derived_files/repo_user_network.csv") and os.path.exists("../data/derived_files/org_user_network.csv") and os.path.exists("../data/derived_files/org_repo_network.csv"):
        # Only the repo-user cache is read back, and the result is not used further.
        grouped_combined_repos = pd.read_csv("../data/derived_files/repo_user_network.csv")
    else:
        org_user_dfs, repo_user_dfs, repo_org_dfs, org_repo_dfs, user_repo_dfs, user_user_dfs, user_org_dfs = load_network_data(cols_df)
        combined_org_user_dfs = pd.concat(org_user_dfs)
        # Number of stacked rows per (org_login, login) pair; the existing 'counts' values are not summed.
        grouped_combined_org_users = combined_org_user_dfs.groupby(['org_login', 'login']).size().reset_index(name='counts')
        
        



In [41]:
cols_df

Unnamed: 0,entity_type,source,target,file_name,file_length,network_type,descriptive_file_name,short_file_name
0,org,login,org_login,../../datasets/join_files/org_members_join_dat...,22480,org_user,Members that are part of orgs,Org Members
1,org,login,org_login,../../datasets/join_files/org_followers_join_d...,5108,org_user,Followers of orgs,Org Followers
2,repo,login,repo_full_name,../../datasets/join_files/repo_subscribers_joi...,21898,repo_user,Subscribers of repos,Repo Subscribers
3,repo,login,repo_full_name,../../datasets/join_files/repo_orgs_join_datas...,3713,repo_org,Orgs of repos,Repo Orgs
4,org,full_name,org_login,../../datasets/large_files/join_files/org_repo...,25542,org_repo,Repos that belong to orgs,Org Repos
5,user,full_name,user_login,../../datasets/large_files/join_files/user_rep...,1162246,user_repo,Repos that belong to users,User Repos
6,user,login,user_login,../../datasets/large_files/join_files/user_fol...,389114,user_user,Users that are following other users,User Following
7,repo,user.login,repo_full_name,../../datasets/large_files/join_files/repo_com...,1224,repo_user,Comments from repos,Repo Comments
8,user,full_name,user_login,../../datasets/large_files/join_files/user_sub...,145345,user_repo,Subscriptions of users,User Subscriptions
9,issues,user.login,repo_full_name,../../datasets/large_files/join_files/issues_c...,141566,repo_user,Comments from repo issues,Issues Comments


In [5]:
def build_networks(cols_df):
    """Yield one aggregated edge table per network type, using CSV caches.

    Networks are yielded in the order ``org_user``, ``repo_user``,
    ``repo_org``, ``org_repo``, ``user_repo``, ``user_user``, ``user_org``.
    A network whose cache file exists is read from it. Otherwise it is
    rebuilt from ``load_network_data(cols_df)``, written to its cache file
    and then yielded.

    Parameters
    ----------
    cols_df : pandas.DataFrame
        Join-file summary table, passed through to ``load_network_data``.
        Only used when at least one cache file is missing.

    Yields
    ------
    pandas.DataFrame
        The two pair columns of the network plus ``counts``. A network with
        no edge frames yields an empty frame holding only ``counts``.
    """
    networks_paths = {
        'org_user': "../../datasets/derived_files/org_user_network.csv",
        'repo_user': "../../datasets/derived_files/repo_user_network.csv",
        'repo_org': "../../datasets/derived_files/repo_org_network.csv",
        'org_repo': "../../datasets/derived_files/org_repo_network.csv",
        'user_repo': "../../datasets/derived_files/user_repo_network.csv",
        'user_user': "../../datasets/derived_files/user_user_network.csv",
        'user_org': "../../datasets/derived_files/user_org_network.csv"
    }

    dfs_mapping = None  # filled on the first cache miss, then reused
    for network, file_path in networks_paths.items():
        if os.path.exists(file_path):
            yield pd.read_csv(file_path)
            continue

        if dfs_mapping is None:
            # Loading reads every join file, so do it at most once per call
            # instead of once per missing network.
            org_user_dfs, repo_user_dfs, repo_org_dfs, org_repo_dfs, user_repo_dfs, user_user_dfs, user_org_dfs = load_network_data(cols_df)
            dfs_mapping = {
                'org_user': org_user_dfs,
                'repo_user': repo_user_dfs,
                'repo_org': repo_org_dfs,
                'org_repo': org_repo_dfs,
                'user_repo': user_repo_dfs,
                'user_user': user_user_dfs,
                'user_org': user_org_dfs
            }

        frames = dfs_mapping[network]
        if frames:
            combined_df = pd.concat(frames)
            # load_network_data puts the two pair columns first in every
            # frame, so group on those rather than on the hard-coded
            # ['org_login', 'login'], which only exists for org networks.
            key_columns = list(combined_df.columns[:2])
            # NOTE(review): as before, this counts stacked rows per pair and
            # does not sum the per-file 'counts' -- confirm that is intended.
            grouped_df = combined_df.groupby(key_columns).size().reset_index(name='counts')
        else:
            # pd.concat raises on an empty list; yield an empty table instead.
            grouped_df = pd.DataFrame(columns=['counts'])
        grouped_df.to_csv(file_path, index=False)
        yield grouped_df


repo_full_name login ../data/join_files/repo_subscribers_join_dataset.csv
repo_full_name user.login ../data/large_files/join_files/repo_comments_join_dataset.csv
repo_full_name user.login ../data/large_files/join_files/issues_comments_join_dataset.csv
repo_full_name login ../data/large_files/join_files/repo_subscribers_join_dataset.csv
user.login repo_full_name ../data/large_files/join_files/repo_stargazers_join_dataset.csv
repo_full_name login ../data/large_files/join_files/repo_contributors_join_dataset.csv
repo_full_name user.login ../data/large_files/join_files/repo_issues_join_dataset.csv
repo_full_name user.login ../data/large_files/join_files/repo_pulls_join_dataset.csv
repo_full_name user.login ../data/large_files/join_files/pulls_comments_join_dataset.csv
repo_full_name owner.login ../data/large_files/join_files/repo_forks_join_dataset.csv


In [7]:
# Total interaction count per (repo, login) pair across all stacked repo edge lists.
# NOTE(review): combined_repos_dfs is only assigned in a commented-out line in the
# visible cells, so this cell relies on leftover kernel state -- confirm its origin.
grouped_combined_repos = combined_repos_dfs.groupby(['repo_full_name', 'login'])['counts'].sum().reset_index()

In [8]:
# Persist the repo-user edge list (written under "../data/", not "../../datasets/").
grouped_combined_repos.to_csv("../data/derived_files/repo_user_network.csv", index=False)

In [7]:
# Reload the repo-user edge list from the file written by the export cell.
grouped_combined_repos = pd.read_csv("../data/derived_files/repo_user_network.csv")

In [8]:
import numpy as np

# Wide repo x login table of interaction counts. reset_index() turns
# repo_full_name back into an ordinary column, and pairs that never
# interacted are filled with 0.
pivoted_df = (
    grouped_combined_repos
    .pivot(index='repo_full_name', columns='login', values='counts')
    .reset_index()
    .fillna(0)
)


In [9]:
# Replace the row index with a fresh 0..n-1 range; the old index is discarded.
pivoted_df = pivoted_df.reset_index(drop=True)

In [5]:
# pivoted_df.to_csv("../data/derived_files/repo_user_network_pivoted.csv", index=False)

In [15]:
# import umap
# import matplotlib.pyplot as plt
# from scipy.sparse import csr_matrix

# # Set up the UMAP parameters
# umap_params = {
#     "n_neighbors": 30,
#     "min_dist": 0.3,
#     "n_components": 2,
#     "metric": 'euclidean',
#     "random_state": 42
# }

# # Assume df_umap is your dataframe
# sparse_matrix = csr_matrix(pivoted_df.values)

# # Initialize UMAP
# reducer = umap.UMAP(**umap_params)

# # Fit the model and transform the data
# embedding = reducer.fit_transform(sparse_matrix)

# # Plot the UMAP projection
# plt.figure(figsize=(10, 10))
# plt.scatter(embedding[:, 0], embedding[:, 1], s=0.1)
# plt.gca().set_aspect('equal', 'datalim')
# plt.title('UMAP projection of repo_full_name and login interactions', fontsize=12)
# plt.show()


In [5]:
import umap
# import hdbscan
# import matplotlib.pyplot as plt
import numpy as np
# import seaborn as sns

# Set up the UMAP parameters
umap_params = {
    "n_neighbors": 20,
    "min_dist": 0.5,
    # n_components is intentionally not passed here, so umap's own default applies.
    # "n_components": 2,
    "metric": 'euclidean',
    # Fixed seed so repeated runs use the same random state.
    "random_state": 42
}

# Work on a copy so pivoted_df itself is left untouched.
# NOTE(review): as built in the pivot cell, pivoted_df still carries the string
# column repo_full_name -- confirm it is dropped before fitting on .values.
df_umap = pivoted_df.copy()
# Initialize UMAP
reducer = umap.UMAP(**umap_params)

# Fit the model and transform the data
embedding = reducer.fit_transform(df_umap.values)

# Create a HDBSCAN clusterer
# clusterer = hdbscan.HDBSCAN(min_cluster_size=15, gen_min_span_tree=True)

# # Fit the clusterer to the UMAP embeddings
# clusterer.fit(embedding)

# # Generate the cluster labels
# color_palette = sns.color_palette('deep', max(clusterer.labels_) + 1)
# cluster_colors = [color_palette[x] if x >= 0 else (0.5, 0.5, 0.5) for x in clusterer.labels_]
# cluster_member_colors = [sns.desaturate(x, p) for x, p in zip(cluster_colors, clusterer.probabilities_)]

# # Plot the UMAP projection with cluster colors
# plt.figure(figsize=(10, 10))
# plt.scatter(embedding[:, 0], embedding[:, 1], s=0.1, c=cluster_member_colors)
# plt.gca().set_aspect('equal', 'datalim')
# plt.title('UMAP projection of repo_full_name and login interactions, colored by cluster', fontsize=12)
# plt.show()


  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


: 

: 

In [12]:
# Write the embedding vectors as tab-separated values, one row per line.
vectors_path = os.path.join('../outputs', 'vecs.tsv')
np.savetxt(vectors_path, embedding, delimiter='\t')

# Write one label per line, taken from the index of the embedded frame.
labels_path = os.path.join('../outputs', 'metadata.tsv')
with open(labels_path, 'w') as f:
    f.writelines('{}\n'.format(label) for label in df_umap.index)

In [1]:
import pandas as pd

In [2]:
# Read the saved vectors back; the file has no header row, so columns are numbered.
embedding = pd.read_csv("../outputs/vecs.tsv", sep="\t", header=None)


In [3]:
import tensorflow
print(tensorflow.__version__)

2.13.0-rc2


In [5]:
import tensorflow as tf
from tensorboard.plugins import projector
import os
# assuming `vecs` is a 2D array or matrix containing your embeddings and `metadata` is a list of labels

# save the weights you're probing as a variable
# Here the variable is built from `embedding`, the frame read back from vecs.tsv.
emb = tf.Variable(embedding, name='embedding')
log_dir = "../outputs/log_dir"
# The variable is tracked under the attribute name "embedding" in the checkpoint.
checkpoint = tf.train.Checkpoint(embedding=emb)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))
# checkpoint 
# sess = tf.Session()
# saver = tf.train.Saver()
# sess.run(tf.global_variables_initializer())
# saver.save(sess, os.path.join(log_dir, 'model.ckpt'))

# config
# config = projector.ProjectorConfig()
# embedding = config.embeddings.add()
# embedding.tensor_name = 'embedding:0'  # Name of the tensor in the TensorFlow graph
# embedding.metadata_path = os.path.join(log_dir, 'metadata.tsv')  # Path to your labels

# # write labels
# # with open(embedding.metadata_path, 'w') as metadata_file:
# #     for row in df_umap.index:
# #         metadata_file.write('{}\n'.format(row))

# # saves a config file that TensorBoard will read during startup.
# projector.visualize_embeddings(log_dir, config)


'../outputs/log_dir/embedding.ckpt-1'

In [15]:
# Labels: every unique repo name followed by every unique login.
# NOTE(review): the embedding was fitted on the rows of df_umap only; the projector
# presumably needs exactly one label per embedding row -- confirm the lengths match.
metadata_list = grouped_combined_repos.repo_full_name.unique().tolist() + grouped_combined_repos.login.unique().tolist()

In [16]:
# Describe the saved embedding to TensorBoard's projector plugin.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# A variable saved with tf.train.Checkpoint(embedding=...) is stored under
# "<attribute>/.ATTRIBUTES/VARIABLE_VALUE", not under the TF1-style "embedding:0".
embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
# The projector resolves a relative metadata path against log_dir, so the
# config must hold the bare file name; joining log_dir into it would point at
# a nested path that does not exist.
embedding.metadata_path = 'metadata.tsv'

# write labels (one per line) into log_dir, next to the checkpoint
with open(os.path.join(log_dir, embedding.metadata_path), 'w') as metadata_file:
    for row in metadata_list:
        metadata_file.write('{}\n'.format(row))

# saves a config file that TensorBoard will read during startup.
projector.visualize_embeddings(log_dir, config)


In [17]:
!tensorboard --logdir "../outputs/log_dir"

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [13]:
print(df_umap.isna().sum())

login
0--key          0
0-Eclipse-0     0
0-kaladin       0
0-wiz-0         0
00-Evan         0
               ..
zzuduoduo       0
zzuieliyaoli    0
zzy14           0
zzzyy           0
zzzzBov         0
Length: 90501, dtype: int64


In [14]:
print(df_umap.values.shape)

(14769, 90501)


In [55]:
combined_orgs_dfs = pd.concat(orgs_dfs)

In [56]:
combined_orgs_dfs[0:1].to_dict()

{'org_login': {0: 'ABC-DH'},
 'login': {0: 'digitalkoine'},
 'counts': {0: 1},
 'entity_type': {0: 'org'},
 'file_path': {0: '../data/join_files/org_members_join_dataset.csv'},
 'join_type': {0: 'org_members'}}

In [10]:
# Empty undirected graph; later cells add the nodes and edges.
bipartite_graph = nx.Graph()

In [62]:
# Keep only repo-login pairs with more than two recorded interactions.
subset_grouped_combined_repos = grouped_combined_repos[grouped_combined_repos.counts > 2]
# Number of remaining pairs and their total interaction count.
len(subset_grouped_combined_repos), subset_grouped_combined_repos.counts.sum()

(18823, 408878)

In [63]:
# Add nodes to graph from the 'repo_full_name' and 'login' columns
repo_nodes = list(subset_grouped_combined_repos['repo_full_name'].unique())
login_nodes = list(subset_grouped_combined_repos['login'].unique())

bipartite_graph.add_nodes_from(repo_nodes, bipartite=0)  # Add the nodes to the 'repo_full_name' partition
bipartite_graph.add_nodes_from(login_nodes, bipartite=1)  # Add the nodes to the 'login' partition

# Prepare one weighted (repo, login) edge per row of subset_grouped_combined_repos
edges = []
for _, row in subset_grouped_combined_repos.iterrows():
    edge = (row['repo_full_name'], row['login'], {
        # The edge weight is the pair's interaction count.
        'weight': row['counts'],
    })
    edges.append(edge)

# Add edges to graph
bipartite_graph.add_edges_from(edges)

In [64]:
from networkx.algorithms import bipartite

# G = bipartite.projected_graph(bipartite_graph, repo_nodes)

In [65]:
# Project the bipartite graph onto the login nodes with networkx's weighted projection.
user_graph = bipartite.weighted_projected_graph(bipartite_graph, login_nodes)

In [66]:
# Node sets of every connected component of the user graph, and the biggest one.
components = list(nx.connected_components(user_graph))
largest = max(components, key=len)

In [67]:
type(largest)

set

In [68]:
# Subgraph of user_graph induced by the nodes of the largest component.
subset_graph = user_graph.subgraph(largest)

In [69]:
len(subset_graph.nodes())

83311

In [71]:
import matplotlib.pyplot as plt
import networkx as nx
import community as community_louvain

# Louvain partition of the projected user graph: maps each node to a community id.
partition = community_louvain.best_partition(user_graph)

# Node positions from a spring layout.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt;
# laying out and drawing the full graph may be impractical at this size.
pos = nx.spring_layout(user_graph)

# One colour per community. plt.get_cmap replaces plt.cm.get_cmap, which has
# been removed from recent Matplotlib releases.
cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
nx.draw_networkx_nodes(user_graph, pos, partition.keys(), node_size=10,
                       cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(user_graph, pos, alpha=0.5)
plt.show()


KeyboardInterrupt: 

In [34]:
# Export the projected user graph in GEXF format.
nx.write_gexf(user_graph, "../outputs/user_graph.gexf")

In [20]:
# Integer adjacency matrix of the user graph, rows and columns ordered as login_nodes.
user_matrix = nx.to_pandas_adjacency(user_graph, nodelist=login_nodes, dtype=int)

In [27]:
user_matrix.values

array([[0, 1, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 1],
       [0, 0, 0, ..., 1, 0, 1],
       [0, 0, 0, ..., 1, 1, 0]])

In [25]:
import umap
import matplotlib.pyplot as plt

# Set up the UMAP parameters
umap_params = {
    "n_neighbors": 30,
    "min_dist": 0.3,
    "n_components": 2,
    "metric": 'euclidean',
    # Fixed seed so repeated runs use the same random state.
    "random_state": 42
}

# Embed the user adjacency matrix (one row per login) rather than the repo pivot table.
df_umap = user_matrix.copy()
# Initialize UMAP
reducer = umap.UMAP(**umap_params)

# Fit the model and transform the data
# NOTE(review): the recorded run of this cell ended with
# "ValueError: cannot assign slice from input of different size".
embedding = reducer.fit_transform(df_umap.values)

# Plot the UMAP projection
plt.figure(figsize=(10, 10))
plt.scatter(embedding[:, 0], embedding[:, 1], s=0.1)
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of repo_full_name and login interactions', fontsize=12)
plt.show()


ValueError: cannot assign slice from input of different size

In [42]:
# Node lists taken from the core tables (duplicates are not removed).
user_nodes = core_users['login'].tolist()
repo_nodes = core_repos['full_name'].tolist()
org_nodes = core_orgs['login'].tolist()
# Organisation logins are appended to the user node list.
user_nodes = user_nodes + org_nodes

edges 

In [32]:
len(user_nodes), len(repo_nodes), len(edges)

(793, 2264, 4983)

In [43]:
# NOTE(review): the earlier graph-building cell tagged repos with bipartite=0 and
# logins with bipartite=1; here the labels are the other way round on the same
# graph object -- confirm which convention downstream code expects.
bipartite_graph.add_nodes_from(user_nodes, bipartite=0)
bipartite_graph.add_nodes_from(repo_nodes, bipartite=1)
bipartite_graph.add_edges_from(edges)

In [44]:
print('connected?', nx.is_connected(bipartite_graph))
print('bipartite?', nx.is_bipartite(bipartite_graph))

connected? False
bipartite? True


In [45]:
# Export the bipartite graph in GEXF format (written under "../data/", not "../../datasets/").
nx.write_gexf(bipartite_graph,"../data/derived_files/bipartite_graph.gexf")

In [2]:
# Load the "Appendix A" workbook; this reuses the scratch name `test` from earlier cells.
test = pd.read_excel("../data/private_data/Appendix A publication_raw_chun_llc_dhq.xlsx")

In [9]:
# Load the "Appendix E" co-retweet network workbook.
test2 = pd.read_excel("../data/private_data/Appendix E co-retweet network.xlsx")