# Data loading

In [1]:
import pandas as pd
from pathlib import Path
import networkx as nx
import numpy as np

## Read the edgelists, merge them and store them

In [2]:
def read_edgelist(path, type_of_edges):
    """Load the edgelist CSV files found in ``path`` and merge them into one frame.

    Parameters
    ----------
    path : str or pathlib.Path
        Directory holding the edgelist CSV files.
    type_of_edges : {'any', 'liking', 'retweeters'} or None
        Which files to read. ``'liking'`` / ``'retweeters'`` keep only the files
        whose name contains that word; ``'any'`` reads every file. ``None`` is
        treated like ``'any'`` (it used to pass validation and then crash).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the selected files with exact duplicate rows removed.
        The frame carries a ``name`` attribute holding the edge type.

    Raises
    ------
    ValueError
        If ``type_of_edges`` is not one of the accepted values, or if no file
        in ``path`` matches the requested type.
    """
    kind = 'any' if type_of_edges is None else type_of_edges
    if kind not in ('any', 'liking', 'retweeters'):
        raise ValueError("type_of_edges must be 'liking' or 'retweeters' or 'any'")

    # Keep regular, non-hidden files only: iterdir() also yields sub-directories
    # (e.g. .ipynb_checkpoints) that read_csv cannot open. Sorting makes the row
    # order of the result independent of the filesystem's listing order.
    files = sorted(
        entry for entry in Path(path).iterdir()
        if entry.is_file()
        and not entry.name.startswith('.')
        and (kind == 'any' or kind in entry.name)
    )
    if not files:
        raise ValueError(f"no '{kind}' edgelist files found in {str(path)!r}")

    df = pd.concat([pd.read_csv(file) for file in files], ignore_index=True).drop_duplicates()
    # NOTE: `name` is a plain Python attribute on this object; frames derived
    # from it (rename, filtering, ...) do not inherit it.
    df.name = kind
    return df

In [3]:
# Load the three views of the raw data from the 'edgelists' directory:
# every edge, liking edges only, and retweet edges only.
edgelists_any_df, edgelists_liking_df, edgelists_retweeters_df = (
    read_edgelist('edgelists', edge_type)
    for edge_type in ('any', 'liking', 'retweeters')
)

# Same three frames collected in a list for the per-network stats below.
edgelists = [edgelists_any_df, edgelists_liking_df, edgelists_retweeters_df]

## Some stats about the networks

In [4]:
def filter_df_by_authors(df, author_ids_sample):
    """Return the rows of ``df`` whose ``author_id`` is in ``author_ids_sample``."""
    keep_row = df['author_id'].isin(author_ids_sample)
    return df.loc[keep_row]

def print_basic_stats(df):
    """Print author, user, node and edge counts for an edgelist frame.

    ``df`` must have ``author_id`` and ``user_id`` columns; each row is one edge.
    The node count is the number of distinct ids over both columns, so an id
    that appears both as an author and as a user is counted once (summing the
    two unique counts would count it twice).
    """
    n_authors = df['author_id'].nunique()
    n_users = df['user_id'].nunique()
    # Union of the two id columns: both refer to the same id namespace.
    n_nodes = pd.concat([df['author_id'], df['user_id']], ignore_index=True).nunique()

    print('Number of unique authors:', n_authors)
    print('Number of unique users:', n_users)
    print('Number of nodes:', n_nodes)
    print('Number of edges:', df.shape[0])

def common_authors(df1, df2):
    """Print and return the set of ``author_id`` values present in both frames."""
    # One set of distinct author ids per input frame.
    authors_per_frame = [set(frame['author_id'].unique()) for frame in (df1, df2)]
    authors_in_common = authors_per_frame[0].intersection(authors_per_frame[1])

    print('Number of authors in common:', len(authors_in_common))
    return authors_in_common

def common_rows(df1, df2):
    """Print the count of, and return, the (author_id, user_id) pairs found in both frames."""
    key_columns = ['author_id', 'user_id']
    # Inner join on both id columns keeps only the pairs present on each side.
    shared_rows = df1.merge(df2, how='inner', on=key_columns)
    print('Number of rows in common:', len(shared_rows))
    return shared_rows

In [5]:
# Per-network summary; each frame's `name` attribute is used as the banner.
for df in edgelists:
    print(f'-----------{df.name}-----------')
    print_basic_stats(df)

print()

# Overlap between the liking and the retweeters networks.
common_authors(edgelists_liking_df, edgelists_retweeters_df)
# Last expression of the cell, so the merged frame is displayed as the output.
common_rows(edgelists_liking_df, edgelists_retweeters_df)

-----------any-----------
Number of unique authors: 786
Number of unique users: 2790105
Number of nodes: 2790891
Number of edges: 4466776
-----------liking-----------
Number of unique authors: 755
Number of unique users: 2668663
Number of nodes: 2669418
Number of edges: 4245301
-----------retweeters-----------
Number of unique authors: 682
Number of unique users: 520160
Number of nodes: 520842
Number of edges: 787529

Number of authors in common: 651
Number of rows in common: 566054


Unnamed: 0,user_id,author_id
0,1012080986115444742,755067744333271040
1,1291717706249535488,755067744333271040
2,1472803907369906180,755067744333271040
3,1191410774901698560,755067744333271040
4,550162497,755067744333271040
...,...,...
566049,351061791,820353014527623168
566050,2715983699,820353014527623168
566051,719222253746053120,820353014527623168
566052,884140512931917824,820353014527623168


## Saving graphs

In [8]:
# Export the full edgelists with "user_id" renamed to "Source" and "author_id"
# renamed to "Target".
# The rename is applied to the exported copy only: the in-memory frames keep
# their 'user_id' / 'author_id' columns, which the helper functions used by
# later cells (common_authors, filter_df_by_authors, print_basic_stats) read.
# Rebinding the frames here made those cells fail with KeyError on a fresh
# Restart & Run All.
full_dir = Path("edgelistsFull")
full_dir.mkdir(exist_ok=True)

for edge_type, edgelist_df in (
    ('any', edgelists_any_df),
    ('liking', edgelists_liking_df),
    ('retweeters', edgelists_retweeters_df),
):
    edgelist_df.rename(columns={'user_id': 'Source', 'author_id': 'Target'}).to_csv(
        full_dir / f'edgelists_{edge_type}.csv', index=False, header=True
    )

## Sampling (if needed)

In [9]:
# Sampling parameters. A fixed seed makes the drawn authors identical on every
# run; change SAMPLE_SIZE to trade graph size against coverage.
SAMPLE_SEED = 42
SAMPLE_SIZE = 100

# NOTE: this cell expects the three frames to still carry their
# 'user_id' / 'author_id' columns (the helpers below read them).
common_authors_list = common_authors(edgelists_liking_df, edgelists_retweeters_df)

# get a sample of common_authors_list, to reduce the graph size.
# The candidates are sorted so the seeded draw does not depend on set iteration
# order, and the size is capped so fewer than SAMPLE_SIZE candidates do not
# raise with replace=False.
rng = np.random.default_rng(SAMPLE_SEED)
author_ids_sample = rng.choice(
    sorted(common_authors_list),
    size=min(SAMPLE_SIZE, len(common_authors_list)),
    replace=False,
)

# filter df by author_ids
edgelists_any_df_sample = filter_df_by_authors(edgelists_any_df, author_ids_sample)
edgelists_liking_df_sample = filter_df_by_authors(edgelists_liking_df, author_ids_sample)
edgelists_retweeters_df_sample = filter_df_by_authors(edgelists_retweeters_df, author_ids_sample)

# print basic stats
print('----------- sample any -----------')
print_basic_stats(edgelists_any_df_sample)
print('----------- sample liking -----------')
print_basic_stats(edgelists_liking_df_sample)
print('----------- sample retweeters -----------')
print_basic_stats(edgelists_retweeters_df_sample)

print("")
common_authors(edgelists_liking_df_sample, edgelists_retweeters_df_sample)
common_rows(edgelists_liking_df_sample, edgelists_retweeters_df_sample)

# rename "user_id" to "Source" and "author_id" to "Target"
edgelists_any_df_sample = edgelists_any_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})
edgelists_liking_df_sample = edgelists_liking_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})
edgelists_retweeters_df_sample = edgelists_retweeters_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})

sample_dir = Path("edgelistsSample")
sample_dir.mkdir(exist_ok=True)
edgelists_any_df_sample.to_csv(sample_dir / 'edgelists_any_sample.csv', index=False, header=True)
edgelists_liking_df_sample.to_csv(sample_dir / 'edgelists_liking_sample.csv', index=False, header=True)
edgelists_retweeters_df_sample.to_csv(sample_dir / 'edgelists_retweeters_sample.csv', index=False, header=True)

Number of authors in common: 651
----------- sample any -----------
Number of unique authors: 100
Number of unique users: 375211
Number of nodes: 375311
Number of edges: 402555
----------- sample liking -----------
Number of unique authors: 100
Number of unique users: 350818
Number of nodes: 350918
Number of edges: 375702
----------- sample retweeters -----------
Number of unique authors: 100
Number of unique users: 83264
Number of nodes: 83364
Number of edges: 88234

Number of authors in common: 100
Number of rows in common: 61381
