# Data loading

In [1]:
import pandas as pd
from pathlib import Path
import networkx as nx
import numpy as np

In [2]:
# Read every CSV file in the "edgelists" folder into a single DataFrame.
# Only *.csv entries are picked up, so other files or sub-folders that happen to
# live in the directory are not passed to read_csv; sorting makes the row order
# independent of the order in which the filesystem lists the files.
edgelist_files = sorted(Path("edgelists").glob("*.csv"))
if not edgelist_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError("no CSV files found in the 'edgelists' folder")

# Concatenate all files and drop fully duplicated rows. The result is assigned
# instead of mutating the frame in place (inplace=True brings no benefit).
edgelists_df = pd.concat(
    [pd.read_csv(file) for file in edgelist_files], ignore_index=True
).drop_duplicates()

## Sampling

In [7]:
# Sample a fixed number of authors and keep only the edges that point to them.
# NOTE(review): the user-sampling cell further down reassigns edgelists_df_sample
# and writes to the same CSV path, so running both cells keeps only the later result.
AUTHOR_SAMPLE_SIZE = 50
AUTHOR_SAMPLE_SEED = 42  # fixed seed so the sample is reproducible across runs

# get unique author_ids
author_ids = edgelists_df['author_id'].unique()

# sample AUTHOR_SAMPLE_SIZE author_ids without replacement
# (clamped to the number of available authors so small inputs do not raise)
author_rng = np.random.default_rng(AUTHOR_SAMPLE_SEED)
author_ids_sample = author_rng.choice(
    author_ids, min(AUTHOR_SAMPLE_SIZE, len(author_ids)), replace=False
)

# filter edgelists_df to contain only author_ids in author_ids_sample
edgelists_df_sample = edgelists_df[edgelists_df['author_id'].isin(author_ids_sample)]

# number of unique author_ids in edgelists_df_sample
# (dropna=False counts a missing id as its own value, like len(unique()) does)
print(edgelists_df_sample['author_id'].nunique(dropna=False))

# number of unique user_ids in edgelists_df_sample
print(edgelists_df_sample['user_id'].nunique(dropna=False))

# number of rows (edges) in edgelists_df_sample
print(len(edgelists_df_sample))

# save edgelists_df_sample to csv
edgelists_df_sample.to_csv(Path("edgelists_sample.csv"), index=False)

50
604742
624818


In [11]:
# Sample a fixed number of users and keep only the edges that originate from them.
# NOTE(review): this filters the full edgelists_df (not the author-level sample),
# and it overwrites both edgelists_df_sample and edgelists_sample.csv - confirm
# that replacing the author-level sample is intended.
USER_SAMPLE_SIZE = 1000
USER_SAMPLE_SEED = 42  # fixed seed so the sample is reproducible across runs

# get unique user_ids
user_ids = edgelists_df['user_id'].unique()

# sample USER_SAMPLE_SIZE user_ids without replacement
# (clamped to the number of available users so small inputs do not raise)
user_rng = np.random.default_rng(USER_SAMPLE_SEED)
user_ids_sample = user_rng.choice(
    user_ids, min(USER_SAMPLE_SIZE, len(user_ids)), replace=False
)

# filter edgelists_df to contain only user_ids in user_ids_sample
edgelists_df_sample = edgelists_df[edgelists_df['user_id'].isin(user_ids_sample)]

# number of unique user_ids in edgelists_df_sample
# (dropna=False counts a missing id as its own value, like len(unique()) does)
print(edgelists_df_sample['user_id'].nunique(dropna=False))

# number of unique author_ids in edgelists_df_sample
print(edgelists_df_sample['author_id'].nunique(dropna=False))

# number of rows (edges) in edgelists_df_sample
print(len(edgelists_df_sample))

# save edgelists_df_sample to csv
edgelists_df_sample.to_csv(Path("edgelists_sample.csv"), index=False)

1000
118
1696


## Networkx

In [None]:
# Build an undirected NetworkX graph from the sampled edge list:
# one edge per row, connecting the row's user_id to its author_id.
edgelists_graph_sample = nx.from_pandas_edgelist(
    df=edgelists_df_sample,
    source='user_id',
    target='author_id',
)

In [None]:
# Report the size of the sampled graph: node count first, then edge count.
# The labels reproduce what the f-string "=" specifier would print.
node_count = len(edgelists_graph_sample.nodes)
edge_count = len(edgelists_graph_sample.edges)

print(f"len(edgelists_graph_sample.nodes)={node_count}")
print(f"len(edgelists_graph_sample.edges)={edge_count}")