# Fun with comics

An introduction to cuDF and cuGraph functionality, using a Marvel comics dataset.

nodes.csv: Contains two columns (node, type), indicating the name and the type (comic, hero) of the nodes.

edges.csv: Contains two columns (hero, comic), indicating in which comics the heroes appear.

hero-network.csv: Contains the network of heroes who appear together in the comics. This file was originally taken from http://syntagmatic.github.io/exposedata/marvel/

In [None]:
# RAPIDS
import cudf
import cugraph

In [None]:
# Standard Python stuff
import pandas
import time

In [None]:
# read all the data.  pass in the module to use
def read_data(m, data_dir='./marvel'):
    """Load the three Marvel CSV files with the given dataframe module.

    Parameters
    ----------
    m : module
        Any object exposing a ``read_csv(path)`` callable (the notebook
        passes ``cudf``).
    data_dir : str, optional
        Directory containing ``nodes.csv``, ``edges.csv`` and
        ``hero-network.csv``. Defaults to ``'./marvel'``, the location that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(nodes, edges, hero_network)``, each element being whatever
        ``m.read_csv`` returns for the corresponding file.
    """
    # Local import keeps this cell self-contained in the notebook.
    import os

    file_names = ('nodes.csv', 'edges.csv', 'hero-network.csv')
    return tuple(m.read_csv(os.path.join(data_dir, name)) for name in file_names)

In [None]:
%%time
# Load the three frames with cuDF, then show how many rows each one has.
nodes, edges, heros = read_data(cudf)
tuple(map(len, (nodes, edges, heros)))

-----
Some basic stats

In [None]:
%%time
# How many comic books and heros are in the set?
nodes.groupby('type').count()

In [None]:
%%time
# What are the top 10 comics with the most heros?
edges.groupby('comic').count().sort_values(by='hero', ascending=False).head(10)

In [None]:
%matplotlib inline
# Plot the 500 largest per-comic hero counts.
# Truncate with head(500) *before* to_pandas() so only those 500 rows are
# converted to a pandas frame for plotting, instead of the whole grouped result.
edges.groupby('comic').count().sort_values(by='hero', ascending=False).head(500).to_pandas().plot(legend=True, figsize=(10, 5))

In [None]:
%matplotlib inline
edges.groupby('comic').count().to_pandas().plot(legend=True, figsize=(15, 5))

In [None]:
# What heros appeared in the most comics?
edges.groupby('hero').count().sort_values(by='comic', ascending=False).head(10)

In [None]:
# What heros appeared in a single comic?
len(edges.groupby('hero').count().query('comic == 1'))

----
### Create a Graph

In [None]:
# first a quick peek at the edge file
edges.head(1)

It looks like both columns are strings; luckily, cuGraph supports string vertex IDs.
The edge list is directed (hero -> comic; really a bipartite graph), but we will create an undirected graph,
which will symmetrize the edges.

In [None]:
# Build a graph from the hero -> comic edge list, using 'hero' as the source
# column and 'comic' as the destination column.
# NOTE(review): no create_using is passed, so this relies on cuGraph's default
# graph type being undirected — confirm for the installed cuGraph version.
G =  cugraph.from_cudf_edgelist(edges, source=['hero'], destination=['comic'])

In [None]:
(G.number_of_nodes(),G.number_of_edges())

-----
__Who is key in the graph__

In [None]:
%%time
# Run betweenness centrality (BC) on the hero/comic graph.
# NOTE(review): k=1000 is presumably the number of sampled vertices, making the
# scores an estimate rather than exact values; no seed is passed, so results
# may differ between runs — confirm against the cuGraph documentation.
bc = cugraph.betweenness_centrality(G, k=1000)

In [None]:
%%time
bc.sort_values(by='betweenness_centrality', ascending=False).head(10)

In [None]:
%%time
# Run PageRank
pr = cugraph.pagerank(G)

In [None]:
%%time
pr.sort_values(by='pagerank', ascending=False).head(10)

---
The problem is that the graph is really bipartite. Drop the "comic" nodes and
convert the hero -> comic graph into a hero -> hero graph.

__Only do this on small graphs__ — the self-join used below can greatly increase the number of edges.

In [None]:
# Drop the references to the bipartite graph and its centrality/PageRank results.
del G, bc, pr

In [None]:
# make a copy
edges2 = edges.copy()

In [None]:
# Rename 'hero' to 'hero2' so that merging with `edges` on 'comic' keeps the
# two hero columns distinct (one from each side of the join).
edges2 = edges2.rename(columns={'hero':'hero2'})

In [None]:
edges2.head(2)

In [None]:
%%time
# Join (merge) the two dataframes on 'comic': every pair of heroes that share
# a comic becomes one (hero, comic, hero2) row.
# NOTE(review): because `edges2` is a renamed copy of `edges`, each hero is also
# paired with itself and every pair appears in both orders; those rows are not
# filtered out here — confirm that self-loops are acceptable downstream.
df = edges.merge(edges2, on="comic")

In [None]:
df.head(2)

In [None]:
# Report how much the self-join changed the row count.
n_before, n_after = len(edges), len(df)
print(f"edges went from {n_before:,} to {n_after:,}")

In [None]:
# drop the "comic" column
df = df.drop(columns=['comic'])

In [None]:
df.head(1)

In [None]:
# Keep a single row per (hero, hero2) pair; repeats come from heroes that
# appear together in more than one comic.
df = df.drop_duplicates()

In [None]:
# Report the row count again now that duplicate pairs have been removed.
n_before, n_after = len(edges), len(df)
print(f"edges went from {n_before:,} to {n_after:,}")

Looks like there are a lot of comics with the same characters.

In [None]:
g = cugraph.Graph()

In [None]:
# Populate the empty graph `g` with the de-duplicated hero -> hero2 pairs,
# using 'hero' as the source column and 'hero2' as the destination column.
g.from_cudf_edgelist(df, source=['hero'], destination=['hero2'])

In [None]:
(g.number_of_nodes(),g.number_of_edges(directed_edges=True))

In [None]:
%%time
# Run betweenness centrality (BC) on the hero -> hero graph.
# NOTE(review): k=1000 is presumably the number of sampled vertices, making the
# scores an estimate rather than exact values; no seed is passed, so results
# may differ between runs — confirm against the cuGraph documentation.
bc = cugraph.betweenness_centrality(g, k=1000)

In [None]:
%%time
bc.sort_values(by='betweenness_centrality', ascending=False).head(10)

In [None]:
%%time
# Run PageRank
pr = cugraph.pagerank(g)

In [None]:
%%time
pr.sort_values(by='pagerank', ascending=False).head(10)

----
some visualization

In [None]:
import graphistry

In [None]:
# Credentials are read from the environment so they are never stored in the
# notebook. Set GRAPHISTRY_USERNAME and GRAPHISTRY_PASSWORD before running this
# cell; a missing variable raises KeyError instead of attempting a login.
# NOTE: the password that was previously committed here should be rotated.
import os

graphistry.register(
    api=3,
    protocol="https",
    server="hub.graphistry.com",
    username=os.environ["GRAPHISTRY_USERNAME"],
    password=os.environ["GRAPHISTRY_PASSWORD"],
)

In [None]:
graphistry.edges(df, 'hero', 'hero2').plot()