In [1]:
import networkx as nx
import pandas as pd
import numpy as np

**Data Cleaning**

In [2]:
# Import data.
# Forward slashes are accepted by pandas on Windows, Linux and macOS, whereas
# the previous backslash-separated paths only resolved on Windows.
DATA_DIR = 'dataset/archive'
edges = pd.read_csv(f'{DATA_DIR}/edges.csv')
hero_network = pd.read_csv(f'{DATA_DIR}/hero-network.csv')
nodes = pd.read_csv(f'{DATA_DIR}/nodes.csv')

# Normalise a name: drop trailing whitespace, then any trailing "/".
def remove_extra(row):
    """Return ``row`` without trailing whitespace and trailing "/" characters.

    Despite its name, ``row`` is a single string value (the function is used
    with ``Series.apply``). Whitespace is stripped first and slashes second,
    so a slash followed by spaces is removed, while a space followed by a
    slash leaves the space in place.
    """
    without_trailing_space = row.rstrip()
    return without_trailing_space.rstrip('/')

hero_network['hero1'] = hero_network['hero1'].apply(remove_extra)
hero_network['hero2'] = hero_network['hero2'].apply(remove_extra)

# Cut names to max 20 characters, THEN strip the trailing space / "/".
# Stripping before truncating (the previous order) could leave a name that
# ends in a space or "/" again whenever the 20th character is one of those,
# which defeats the normalisation done by remove_extra.
# NOTE(review): the 20-character cut presumably aligns these names with the
# ones in hero_network -- confirm against the raw data.
edges['hero'] = edges['hero'].str[:20].apply(remove_extra)
nodes['node'] = nodes['node'].str[:20].apply(remove_extra)

# Remove rows where both columns hold the same hero (self-loops).
# A new frame is bound instead of mutating in place, so re-running the cell
# never depends on a half-mutated object.
self_loop_rows = hero_network.index[hero_network.hero1 == hero_network.hero2]
hero_network = hero_network.drop(index=self_loop_rows)
hero_net = hero_network.reset_index(drop=True)

**First Graph**

This function uses the `hero_network` dataset.

The number of *nodes* equals the number of unique heroes in the dataset, and each *edge* is weighted with $w_{AB} = \frac{1}{n_{AB}}$, where $n_{AB}$ is the number of rows in the dataset that link hero A and hero B (in either order) and $w_{AB}$ is the weight of the single edge between these two nodes.

The created graph is `undirected` and `weighted` and there are no self-loops or multiple edges.

In [3]:
def first_graph(dataset):
    """Build the undirected, weighted hero graph from a pair table.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``hero1`` and ``hero2``; each row is one
        link between two heroes. The frame is not modified.

    Returns
    -------
    networkx.Graph
        One edge per unordered hero pair, carrying a ``weight`` attribute
        equal to ``1 / n`` (rounded to 5 decimals), where ``n`` is the number
        of rows linking that pair in either order. Rows whose two heroes are
        identical are ignored, so the graph has no self-loops.
    """
    # Sort the two names inside every row so that (A, B) and (B, A) collapse
    # onto the same undirected pair.
    ordered_names = np.sort(dataset[['hero1', 'hero2']].values, axis=1)
    pairs = pd.DataFrame(ordered_names, columns=['hero1', 'hero2'])

    # Guarantee the "no self-loops" property regardless of upstream cleaning.
    pairs = pairs[pairs['hero1'] != pairs['hero2']]

    # Count on the (hero1, hero2) key itself. Counting on the concatenated
    # string hero1 + hero2 is unsafe: different pairs can concatenate to the
    # same text, and the alphabetical order of the concatenations differs
    # from the order of the pairs, which misassigns weights when they are
    # attached positionally.
    edge_table = (
        pairs.groupby(['hero1', 'hero2'], sort=True)
        .size()
        .reset_index(name='n_links')
    )
    edge_table['weight'] = (1 / edge_table['n_links']).round(5)

    # Generate the graph; only the 'weight' column becomes an edge attribute.
    return nx.from_pandas_edgelist(edge_table, 'hero1', 'hero2', 'weight')

**Second Graph**

For this function we need the dataset with nodes and the one with edges to generate the graph.

The graph is `undirected` and `unweighted`.

In [169]:
def second_graph(nodes, edges):
    """Build the undirected, unweighted hero-comic graph.

    Parameters
    ----------
    nodes : pandas.DataFrame
        Needs the columns ``node`` (node name) and ``type``.
    edges : pandas.DataFrame
        Needs the columns ``hero`` and ``comic``; each row becomes one edge.

    Returns
    -------
    networkx.Graph
        Graph built from the edge list, in which every name listed in
        ``nodes`` is present and carries its ``type`` as a node attribute.
    """
    # One edge per (hero, comic) row.
    hero_comic_graph = nx.from_pandas_edgelist(edges, 'hero', 'comic')

    # Attach the 'type' attribute; names absent from the edge list are added
    # as new nodes by add_nodes_from.
    typed_nodes = [
        (name, {'type': kind})
        for name, kind in zip(nodes['node'], nodes['type'])
    ]
    hero_comic_graph.add_nodes_from(typed_nodes)
    return hero_comic_graph

**Top N heroes**

The function returns the top N heroes that appear in the largest number of comics, based on the `edges` dataset.

In [278]:
def top_N_heroes(N=6439):
    """Return the names of the N heroes that appear in the most comics.

    Reads the module-level ``edges`` DataFrame (columns ``hero`` and
    ``comic``).

    Parameters
    ----------
    N : int, optional
        Number of heroes to return. The value is used as a slice bound, so
        ``None`` returns every hero.

    Returns
    -------
    list
        Hero names ordered from most to fewest distinct comics.
    """
    # Count DISTINCT comics per hero: a plain row count would let repeated
    # (hero, comic) rows inflate a hero's ranking.
    comics_per_hero = edges.groupby('hero')['comic'].nunique()

    # A stable sort keeps the order of tied heroes deterministic across runs.
    ranking = comics_per_hero.sort_values(ascending=False, kind='mergesort')
    return list(ranking.iloc[:N].index)

---

### Functionality 3

In [None]:
def functionality_3(graph_data, seq, initial_node, final_node, N=6439):
    """Prune ``graph_data`` to the top-N heroes plus the comic nodes.

    The visible body only builds the pruned working copy: it keeps the heroes
    returned by ``top_N_heroes(N)`` and every entry of the module-level
    ``nodes`` table whose type is 'comic', then removes the nodes left
    without any edge. ``graph_data`` itself is not modified.

    NOTE(review): ``seq``, ``initial_node`` and ``final_node`` are not used
    and nothing is returned, so the function looks unfinished -- confirm the
    intended behaviour before relying on it.
    """
    # Keep only the top N heroes and the comics.
    kept_heroes = top_N_heroes(N)
    is_comic = nodes.type == 'comic'
    kept_comics = nodes.loc[is_comic, 'node'].tolist()

    # Copy the induced subgraph into a standalone Graph so nodes can be removed.
    pruned = nx.Graph(graph_data.subgraph(kept_heroes + kept_comics))

    # Materialise the isolated nodes first: the graph must not change size
    # while it is being iterated.
    isolated = [node for node in nx.isolates(pruned)]
    pruned.remove_nodes_from(isolated)

In [170]:
# Build the hero-comic graph from the cleaned node and edge tables.
graph = second_graph(nodes=nodes, edges=edges)

In [283]:
# Sanity check of the pruning step with N = 20: print the node count before
# and after removing the nodes that are left without any edge.
to_keep = [*top_N_heroes(20), *nodes.loc[nodes.type == 'comic', 'node']]
subgraph = nx.Graph(graph.subgraph(to_keep))
print(len(subgraph))  # len() of a graph is its number of nodes
subgraph.remove_nodes_from([node for node in nx.isolates(subgraph)])
print(len(subgraph))

12670
7554
