In [37]:
import functions as functions
import csv
import pandas as pd
import networkx as nx
import random
import numpy as np
import re
import os
from collections import deque
import operator

In [38]:
#load data

nodes_df = pd.read_csv('clean/nodes.csv')
nodes_df = nodes_df.drop(columns=['Unnamed: 0']) #remove index col
edges_df = pd.read_csv('clean/edges.csv')
edges_df = edges_df.drop(columns=['Unnamed: 0']) #remove index col

print(f"{len(nodes_df)} nodes, {len(edges_df)} edges")

20804 nodes, 170686 edges


In [39]:
duplicates = edges_df.duplicated(subset=['source', 'target'], keep=False)
print(f"{duplicates.sum()} duplicate edges")

105845 duplicate edges


In [40]:
nodes_df.columns

Index(['id', 'user_name', 'follower_num', 'tweet_num', 'user_bio',
       'user_href'],
      dtype='object')

In [41]:
#initialize graph

#start with multidirected graph to log parallel edges
graph = nx.MultiDiGraph()

#add nodes to graph
for idx, row in nodes_df.iterrows():
    node_id = row['id']
    graph.add_node(node_id, 
                   **row.to_dict())

#add edges to graph
for idx, row in edges_df.iterrows():
    graph.add_edge(row['source'], row['target'], **row.to_dict())

print(f"{graph.number_of_edges()} edges")

#confirm edge attributes saved correctly
for u, v, k, data in list(graph.edges(keys=True, data=True))[:5]:
    print(u, v, k, data)

#confirm node attributes saved correctly
node_id = 101
attrs = graph.nodes[node_id]
print(attrs)

170686 edges
1 3016 0 {'source': 1, 'target': 3016, 'type': 'follow', 'color_code': 2}
3 19209 0 {'source': 3, 'target': 19209, 'type': 'like', 'color_code': 1}
3 19209 1 {'source': 3, 'target': 19209, 'type': 'follow', 'color_code': 2}
3 15765 0 {'source': 3, 'target': 15765, 'type': 'follow', 'color_code': 2}
3 15765 1 {'source': 3, 'target': 15765, 'type': 'like', 'color_code': 1}
{'id': 101, 'user_name': nan, 'follower_num': 5, 'tweet_num': 3, 'user_bio': 'Unemployed tech developer with a passion for books and new experiences', 'user_href': 'https://chirper.ai/kaguya'}


In [42]:
#convert to directed graph

#dict to store edge instances + weights
edge_weights = {} 

for u, v, key, data in graph.edges(keys=True, data=True):
    edge_type = data.get('type') #sum parallel edges by type
    edge_key = (u, v, edge_type) #search for edge u->v type 'edge_type'
    if edge_key not in edge_weights: 
        edge_weights[edge_key] = 0 #add count 0 for new u->v type 'edge_type' instance
    edge_weights[edge_key] += 1

#initialize digraph
weighted_graph = nx.DiGraph()

#add nodes
weighted_graph.add_nodes_from(graph.nodes(data=True))

#add edges
for key, weight in edge_weights.items():
    u, v, edge_type = key
    weighted_graph.add_edge(u, v, weight=weight, type=edge_type)

#confirm edge attributes saved correctly
for u, v, data in list(weighted_graph.edges(data=True))[:5]:
    print(u, v, data)

#confirm node attributes saved correctly
node_id = 101
attrs = weighted_graph.nodes[node_id]
print(attrs)


1 3016 {'weight': 1, 'type': 'follow'}
3 19209 {'weight': 1, 'type': 'follow'}
3 15765 {'weight': 1, 'type': 'like'}
3 1621 {'weight': 1, 'type': 'follow'}
3 15486 {'weight': 1, 'type': 'follow'}
{'id': 101, 'user_name': nan, 'follower_num': 5, 'tweet_num': 3, 'user_bio': 'Unemployed tech developer with a passion for books and new experiences', 'user_href': 'https://chirper.ai/kaguya'}


In [43]:
#2-hop ego network random sampling (n=20)

random.seed(45) #reproducibility

#only nodes with >=2 connections
filtered_nodes = [node for node, degree in weighted_graph.degree() if degree >= 2]

#randomly select 20 nodes for ego networks
sample_nodes = filtered_nodes if len(filtered_nodes) < 20 else random.sample(filtered_nodes, 20)
print(f"sampled nodes: \n{sample_nodes[:10]}\n{sample_nodes[10:]}")

#ego network sampling
ego_networks = {}
for node in sample_nodes:
    ego_networks[node] = nx.ego_graph(weighted_graph, node, radius=2) #2 hops

#aggregating nodes from individual ego networks
all_nodes = set() 
for ego_net in ego_networks.values():
    all_nodes.update(ego_net.nodes())

sampled_nodes_set = set(all_nodes)

#create subgraph of weighted digraph from selected ego networks
weighted_sampled_graph = weighted_graph.subgraph(sampled_nodes_set).copy()

# #verify that edge attributes initialized corrected
# for u, v, data in list(weighted_sampled_graph.edges(data=True))[:5]:
#     print(u, v, data)

# #confirm node attributes initiaized correctly
# node_id = 8193
# attrs = weighted_sampled_graph.nodes[node_id]
# print(attrs)

print(f"\nweighted sampled graph has {weighted_sampled_graph.number_of_nodes()} nodes and {weighted_sampled_graph.number_of_edges()} edges")


sampled nodes: 
[9734, 12239, 15146, 9481, 2646, 10282, 10882, 772, 2399, 15040]
[395, 6581, 9944, 6691, 10418, 1123, 2181, 7113, 9729, 7488]

weighted sampled graph has 2058 nodes and 41226 edges


In [44]:
#to csv

sampled_nodes = pd.DataFrame([
    {'id': node, **{k: v for k, v in data.items()}}
    for node, data in weighted_sampled_graph.nodes(data=True)
])

sampled_edges = pd.DataFrame([
    {'source': source, 'target': target, **data}
    for source, target, data in weighted_sampled_graph.edges(data=True)
])

sampled_nodes.to_csv('sampled_chirper_networks/sample_nodes.csv', index=False)
sampled_edges.to_csv('sampled_chirper_networks/sample_edges.csv', index=False)
print("sampled nodes and edges saved to sampled_chirper_networks")


sampled nodes and edges saved to sampled_chirper_networks


In [45]:
nodes = pd.DataFrame([
    {'id': node, **{k: v for k, v in data.items()}}
    for node, data in weighted_graph.nodes(data=True)
])

edges = pd.DataFrame([
    {'source': source, 'target': target, **data}
    for source, target, data in weighted_graph.edges(data=True)
])

sampled_nodes.to_csv('clean/all_nodes.csv', index=False)
sampled_edges.to_csv('clean/all_edges.csv', index=False)
print("all nodes and edges saved to sampled_chirper_networks")

all nodes and edges saved to sampled_chirper_networks


In [46]:
#EGO NETWORK SAMPLE subgraphs by action type (like, dislike, follow, unfollow)

#filter edges by type
follows_edges = [(u, v, d) for u, v, d in weighted_sampled_graph.edges(data=True) if d.get('type') == 'follow']
likes_edges = [(u, v, d) for u, v, d in weighted_sampled_graph.edges(data=True) if d.get('type') == 'like']
dislikes_edges = [(u, v, d) for u, v, d in weighted_sampled_graph.edges(data=True) if d.get('type') == 'dislike']

follows_graph = nx.MultiDiGraph()
likes_graph = nx.MultiDiGraph()
dislikes_graph = nx.MultiDiGraph()

#add nodes
for graph in [follows_graph, likes_graph, dislikes_graph]:
    graph.add_nodes_from(weighted_sampled_graph.nodes(data=True))

#add edges from filtered subgraphs
follows_graph.add_edges_from(follows_edges);
likes_graph.add_edges_from(likes_edges);
dislikes_graph.add_edges_from(dislikes_edges);

print(f"before removing isolates: {len(follows_graph), len(likes_graph), len(dislikes_graph)}")
#print(f"follow edges: {follows_graph.number_of_edges()}")

#clean
follows_graph = functions.remove_isolated_nodes(follows_graph)
likes_graph = functions.remove_isolated_nodes(likes_graph)
dislikes_graph = functions.remove_isolated_nodes(dislikes_graph)

print(f"\nafter removing isolates: {len(follows_graph), len(likes_graph), len(dislikes_graph)}")

print(f"\nexporting sample with {weighted_sampled_graph.number_of_nodes()} nodes, {weighted_sampled_graph.number_of_edges()} edges")

#create and export all csvs
save_folder = 'sampled_chirper_networks'
functions.export_subgraphs(weighted_sampled_graph, save_folder)

before removing isolates: (2058, 2058, 2058)

after removing isolates: (2017, 2018, 1322)

exporting sample with 2058 nodes, 41226 edges

likes subgraph: 2018 nodes, 18671 edges

dislikes subgraph: 1322 nodes, 4100 edges

follows subgraph: 2017 nodes, 17889 edges

unfollows subgraph: 602 nodes, 566 edges

subgraphs created successfully


In [47]:
#WHOLE SAMPLE subgraphs by action type (like, dislike, follow, unfollow)

#filter edges by type
follows_edges = [(u, v, d) for u, v, d in weighted_graph.edges(data=True) if d.get('type') == 'follow']
likes_edges = [(u, v, d) for u, v, d in weighted_graph.edges(data=True) if d.get('type') == 'like']
dislikes_edges = [(u, v, d) for u, v, d in weighted_graph.edges(data=True) if d.get('type') == 'dislike']

follows_graph = nx.MultiDiGraph()
likes_graph = nx.MultiDiGraph()
dislikes_graph = nx.MultiDiGraph()

#add nodes
for graph in [follows_graph, likes_graph, dislikes_graph]:
    graph.add_nodes_from(weighted_graph.nodes(data=True))

#add edges from filtered subgraphs
follows_graph.add_edges_from(follows_edges);
likes_graph.add_edges_from(likes_edges);
dislikes_graph.add_edges_from(dislikes_edges);

print(f"before removing isolates: {len(follows_graph), len(likes_graph), len(dislikes_graph)}")
#print(f"follow edges: {follows_graph.number_of_edges()}")

#clean
follows_graph = functions.remove_isolated_nodes(follows_graph)
likes_graph = functions.remove_isolated_nodes(likes_graph)
dislikes_graph = functions.remove_isolated_nodes(dislikes_graph)

print(f"\nafter removing isolates: {len(follows_graph), len(likes_graph), len(dislikes_graph)}")

print(f"\nexporting whole sample with {weighted_graph.number_of_nodes()} nodes, {weighted_graph.number_of_edges()} edges")

#create and export all csvs
save_folder = 'outputs'
functions.export_subgraphs(weighted_graph, save_folder)

before removing isolates: (20804, 20804, 20804)

after removing isolates: (9560, 9038, 5248)

exporting whole sample with 20804 nodes, 99974 edges

likes subgraph: 9038 nodes, 44424 edges

dislikes subgraph: 5248 nodes, 10273 edges

follows subgraph: 9560 nodes, 44293 edges

unfollows subgraph: 1083 nodes, 984 edges

subgraphs created successfully


In [48]:
#verify that edges in sample graph exist in original graph
functions.check_weighted_graph_validity(weighted_sampled_graph, weighted_graph)


all sampled nodes valid
all edges in sampled graph are valid
