# Data loading

In [1]:
import pandas as pd
from pathlib import Path
import networkx as nx
import numpy as np

## Read the edgelists, merge them and store them

In [2]:
def read_edgelist(path, type_of_edges):
    if type_of_edges is not None:
        assert (type_of_edges) == 'liking' or (type_of_edges) == 'retweeters' or (type_of_edges) == 'any', "type_of_edges must be 'liking' or 'retweeters' or 'any'"
    
    if type_of_edges == 'any':
        df =  pd.concat([pd.read_csv(file) for file in Path(path).iterdir()], ignore_index=True)
        df.name = 'any'
    elif type_of_edges == 'liking':
        df =  pd.concat([pd.read_csv(file) for file in Path(path).iterdir() if 'liking' in file.name], ignore_index=True)
        df.name = 'liking'
    elif type_of_edges == 'retweeters':
        df =  pd.concat([pd.read_csv(file) for file in Path(path).iterdir() if 'retweeters' in file.name], ignore_index=True)
        df.name = 'retweeters'
        
    df.drop_duplicates(inplace=True)
    return df


In [3]:
edgelists_any_df = read_edgelist('edgelists', 'any')
edgelists_liking_df = read_edgelist('edgelists', 'liking')
edgelists_retweeters_df = read_edgelist('edgelists', 'retweeters')

edgelists = [edgelists_any_df, edgelists_liking_df, edgelists_retweeters_df]

## Some stats

In [4]:
def filter_df_by_authors(df, author_ids_sample):
    return df[df['author_id'].isin(author_ids_sample)]

def print_basic_stats(df):
    print('Number of unique authors:', df['author_id'].nunique())
    print('Number of unique users:', df['user_id'].nunique())
    print('Number of nodes:', df['author_id'].nunique() + df['user_id'].nunique())
    print('Number of edges:', df.shape[0])

def common_authors(df1, df2):
    # commun authors between liking and retweeters
    df1_authors = df1['author_id'].unique()
    df2_authors = df2['author_id'].unique()

    authors_in_common = set(df1_authors).intersection(set(df2_authors))

    print('Number of authors in common:', len(authors_in_common))
    return authors_in_common

def common_rows(df1, df2):
    count_common_authors = df1.merge(df2, on=['author_id', 'user_id'], how='inner')
    print('Number of rows in common:', count_common_authors.shape[0])
    return count_common_authors

In [5]:
for df in edgelists:
    print('-----------' + df.name + '-----------')
    print_basic_stats(df)

print("")

common_authors(edgelists_liking_df, edgelists_retweeters_df)
common_rows(edgelists_liking_df, edgelists_retweeters_df)

-----------any-----------
Number of unique authors: 786
Number of unique users: 2790105
Number of nodes: 2790891
Number of edges: 4466776
-----------liking-----------
Number of unique authors: 755
Number of unique users: 2668663
Number of nodes: 2669418
Number of edges: 4245301
-----------retweeters-----------
Number of unique authors: 682
Number of unique users: 520160
Number of nodes: 520842
Number of edges: 787529

Number of authors in common: 651
Number of rows in common: 566054


Unnamed: 0,user_id,author_id
0,2909381990,716152735
1,3101577272,2222395582
2,1378730257184919559,2222395582
3,755116981649453056,2222395582
4,1252569115098636290,2222395582
...,...,...
566049,1314894413814996992,554497503
566050,788945381480464386,554497503
566051,1486117700657164297,554497503
566052,39666268,554497503


## Sampling

In [19]:
# get unique common author_ids and convert to df
common_authors_list = common_authors(edgelists_liking_df, edgelists_retweeters_df)

# get a sample of common_authors_list 
author_ids_sample = np.random.choice(list(common_authors_list), size=43, replace=False)

# # filter df by author_ids
edgelists_any_df_sample = filter_df_by_authors(edgelists_any_df, author_ids_sample)
edgelists_liking_df_sample = filter_df_by_authors(edgelists_liking_df, author_ids_sample)
edgelists_retweeters_df_sample = filter_df_by_authors(edgelists_retweeters_df, author_ids_sample)

# print basic stats
print('----------- sample any -----------')
print_basic_stats(edgelists_any_df_sample)
print('----------- sample liking -----------')
print_basic_stats(edgelists_liking_df_sample)
print('----------- sample retweeters -----------')
print_basic_stats(edgelists_retweeters_df_sample)

print("")
common_authors(edgelists_liking_df_sample, edgelists_retweeters_df_sample)
common_rows(edgelists_liking_df_sample, edgelists_retweeters_df_sample)

# rename user_id to "Source" and author_id to "Target"
edgelists_any_df_sample = edgelists_any_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})
edgelists_liking_df_sample = edgelists_liking_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})
edgelists_retweeters_df_sample = edgelists_retweeters_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})

# save edgelists to csv
Path.mkdir(Path("edgelistsSample"), exist_ok=True)
edgelists_any_df_sample.to_csv('edgelistsSample/edgelists_any_sample.csv', index=False, header=True)
edgelists_liking_df_sample.to_csv('edgelistsSample/edgelists_liking_sample.csv', index=False, header=True)
edgelists_retweeters_df_sample.to_csv('edgelistsSample/edgelists_retweeters_sample.csv', index=False, header=True)

Number of authors in common: 651
----------- sample any -----------
Number of unique authors: 43
Number of unique users: 267456
Number of nodes: 267499
Number of edges: 303884
----------- sample liking -----------
Number of unique authors: 43
Number of unique users: 256413
Number of nodes: 256456
Number of edges: 290925
----------- sample retweeters -----------
Number of unique authors: 43
Number of unique users: 44381
Number of nodes: 44424
Number of edges: 49417

Number of authors in common: 43
Number of rows in common: 36458


## Parse for gephi TO DELETE

In [9]:
# edgelists_any_df_sample = edgelists_any_df_sample.rename(columns={'user_id': 'Source', 'author_id': 'Target'})
# add column "like" with value False 
# edgelists_any_df_sample = edgelists_any_df_sample.insert(0, 'like', False)
edgelists_any_df_sample["like"] = False
# # add column "retweet" with value False
# edgelists_any_df_sample = edgelists_any_df_sample.insert(0, 'retweet', False)
edgelists_any_df_sample["retweet"] = False

edgelists_any_df_sample.loc[edgelists_any_df_sample['Source'].isin(edgelists_liking_df_sample['Source'].unique()), "like"] = True
edgelists_any_df_sample.loc[edgelists_any_df_sample['Source'].isin(edgelists_retweeters_df_sample['Source'].unique()), "retweet"] = True

# add column "both" with value True if both like and retweet
edgelists_any_df_sample.loc[:,'both'] = edgelists_any_df_sample['like'] & edgelists_any_df_sample['retweet']

edgelists_any_df_sample

# save edgelists_any_df_sample to csv
edgelists_any_df_sample.to_csv('edgelistsSample/edgelists_any_sample_w_data.csv', index=False, header=True)

In [17]:
# # get unique user_ids
# user_ids = edgelists_df['user_id'].unique()

# # sample 200 000 user_ids
# user_ids_sample = np.random.choice(user_ids, 1000, replace=False)

# # filter edgelists_df_sample to contain only user_ids in user_ids_sample
# edgelists_df_sample = edgelists_df[edgelists_df['user_id'].isin(user_ids_sample)]

# # get number of unique user_ids in edgelists_df_sample
# print(len(edgelists_df_sample['user_id'].unique()))

# # get number of unique author_ids in edgelists_df_sample
# print(len(edgelists_df_sample['author_id'].unique()))

# # print len of edgelists_df_sample
# print(len(edgelists_df_sample))

# # save edgelists_df_sample to csv
# edgelists_df_sample.to_csv(Path(f"edgelists_sample.csv"), index=False)