Imports

In [15]:
import networkx as nx
import wikipediaapi
from operator import itemgetter

Setting variables and stop pages (pages with too many links that aren't relevant or useful)

In [16]:
# Titles that are never added to the graph. They are written in str.title()
# casing because link titles are title-cased before being compared with them.
STOPS = (
    "International Standard Serial Number",
    "International Standard Book Number",
    "National Diet Library",
    "International Standard Name Identifier",
    "International Standard Book Number (Identifier)",
    "Pubmed Identifier",
    "Pubmed Central",
    "Digital Object Identifier",
    "Arxiv",
    "Proc Natl Acad Sci Usa",
    "Bibcode",
    "Library Of Congress Control Number",
    "Jstor",
)


In [17]:
# Client used for every page lookup in the crawl; 'en' selects English Wikipedia.
# NOTE(review): newer Wikipedia-API releases appear to require a user_agent
# argument here - confirm against the installed version before re-running.
wiki = wikipediaapi.Wikipedia('en')

Setting SEED (starting site)

In [18]:
# Page the crawl starts from. Link titles are normalised with str.title()
# below, so the seed is written through the same normalisation.
SEED = "Computer Science".title()
# Pages at depth < max_layers have their links followed; deeper pages are only
# recorded as edge targets and are never fetched.
max_layers = 2

# Breadth-first-search bookkeeping.
todo_list = [(0, SEED)]  # FIFO queue of (depth, title) pairs
todo_set = {SEED}        # every title ever queued ({SEED}, not set(SEED): the
                         # latter would be a set of the seed's characters)
done_set = set()         # titles already taken off the queue

# Directed graph: an edge page -> link means "page links to link".
G = nx.DiGraph()

# The queue is ordered by depth, so once its head reaches max_layers every
# remaining entry is at least that deep and the crawl is finished. Checking
# that the queue is non-empty avoids an IndexError if it drains completely.
while todo_list and todo_list[0][0] < max_layers:
    layer, page = todo_list.pop(0)
    done_set.add(page)

    try:
        # The .links access is kept inside the try because the page object may
        # only contact the API when its attributes are read
        # (NOTE(review): confirm for the installed Wikipedia-API version).
        links = wiki.page(page).links
    except Exception as err:  # not a bare except: Ctrl-C must still stop the crawl
        print("Could not load {!r}: {}".format(page, err))
        continue

    for link in links:
        link = link.title()
        # Skip stop pages and anything containing ":" (e.g. "Category:...").
        if link in STOPS or ":" in link:
            continue
        # Skip index-style pages.
        if link.startswith(("List Of", "Glossary Of")):
            continue
        if link not in todo_set and link not in done_set:
            todo_list.append((layer + 1, link))
            todo_set.add(link)
        G.add_edge(page, link)

In [19]:
# Report the size of the crawled graph.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("{} nodes, {} edges".format(node_count, edge_count))

65845 nodes, 158077 edges


Deduplicate

In [20]:
def _merge_duplicates(graph, pairs):
    """Contract each (keep, drop) pair in ``graph`` and return the result.

    ``drop`` is merged into ``keep`` via ``nx.contracted_nodes`` without
    creating self-loops. A pair is skipped when either node is no longer in
    the graph, which happens when an earlier contraction already removed it
    (e.g. the chain "A" / "As" / "Ass").
    """
    for keep, drop in pairs:
        if keep != drop and keep in graph and drop in graph:
            graph = nx.contracted_nodes(graph, keep, drop, self_loops=False)
    return graph


# Drop edges from a page to itself. nx.selfloop_edges(G) replaces the removed
# G.selfloop_edges() method; the edges are collected into a list first so the
# graph is not modified while it is being iterated.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Merge naive plurals: "<title>s" is folded into "<title>".
duplicates = [(node, node + "s") for node in G if node + "s" in G]
G = _merge_duplicates(G, duplicates)

# Merge hyphen/space variants: "a b" is folded into "a-b".
duplicates = [(x, y) for x, y
              in [(node, node.replace("-", " ")) for node in G]
              if x != y and y in G]
G = _merge_duplicates(G, duplicates)

# Overwrite every node's "contraction" attribute with 0.
# NOTE(review): presumably this replaces the per-node data left behind by
# contracted_nodes so the graph can be written to GraphML - confirm.
nx.set_node_attributes(G, 0, "contraction")

Truncate

In [21]:
# Keep only nodes whose total degree (in + out) is at least 2, report the size
# of the remaining graph, and save it as GraphML.
core = [name for name, degree in G.degree() if degree >= 2]
G = G.subgraph(core)
print("{} nodes, {} edges".format(G.number_of_nodes(), G.number_of_edges()))
nx.write_graphml(G, "compsci.graphml")

22099 nodes, 114613 edges


Print top degrees

In [33]:
# The 100 (title, in-degree) pairs with the highest in-degree, largest first.
ranked_by_indegree = sorted(G.in_degree(), key=itemgetter(1), reverse=True)
top_indegree = ranked_by_indegree[:100]
# Uncomment to list them as "<in-degree> <title>" lines:
#print("\n".join(map(lambda t: "{} {}".format(*reversed(t)), top_indegree)))

In [126]:
# Write the whole graph (whatever G currently holds) to a GEXF file.
nx.write_gexf(G, "graphFile_ComputerScience.gexf")

In [8]:
# Reload the graph saved as GraphML and show its node and edge counts.
G = nx.read_graphml("compsci.graphml")
print(G.number_of_nodes(), G.number_of_edges())

23075 122164


In [32]:
# Minimum total degree (in + out) a node needs to survive the cull.
node_degree_cutoff = 100

# Select the well-connected nodes and take the subgraph they induce.
to_keep = [name for name, degree in G.degree() if degree >= node_degree_cutoff]
small = G.subgraph(to_keep)

# Show how many nodes remain, then export the culled graph as GEXF.
print(small.number_of_nodes())
nx.write_gexf(small, "graphFile_ComputerScience_culled2.gexf")

418


In [14]:
# Load a GraphML file that no cell in this notebook writes.
# NOTE(review): presumably exported from an external tool after layout and
# analysis - confirm where graphFile_SubGraph.graphml comes from.
mod = nx.read_graphml("graphFile_SubGraph.graphml")
# Display the attribute dict stored on the seed node (None if it is absent).
mod.nodes.get("Computer Science")

{'Clustering Coefficient': 0.1223583072423935,
 'Degree': 611,
 'Eigenvector Centrality': 0.9718249141879425,
 'In-Degree': 198,
 'Modularity Class': 6,
 'Out-Degree': 413,
 'b': 251,
 'contraction': 0,
 'g': 255,
 'label': 'Computer Science',
 'r': 209,
 'size': 10.0,
 'x': -701.28564,
 'y': 299.30154}