Imports

In [2]:
import networkx as nx
import wikipediaapi
from operator import itemgetter

Setting variables and stops (pages that have too many links that aren't relevant or useful)

In [3]:
# Title-cased page names that are excluded from the crawl. Kept as a tuple,
# one title per line, so entries are easy to add or remove.
STOPS = (
    "International Standard Serial Number",
    "International Standard Book Number",
    "National Diet Library",
    "International Standard Name Identifier",
    "International Standard Book Number (Identifier)",
    "Pubmed Identifier",
    "Pubmed Central",
    "Digital Object Identifier",
    "Arxiv",
    "Proc Natl Acad Sci Usa",
    "Bibcode",
    "Library Of Congress Control Number",
    "Jstor",
)


In [4]:
# Wikipedia API client used by the crawl below; 'en' is passed as the first
# positional argument.
# NOTE(review): newer Wikipedia-API releases reportedly expect a user-agent
# string as the first argument — confirm against the installed version.
wiki = wikipediaapi.Wikipedia('en')

Setting SEED (starting page)

In [5]:
# Breadth-first crawl of Wikipedia links starting from SEED.
# Produces G, a directed graph with an edge "page -> linked page" for every
# link that survives the STOPS / "List Of" / "Glossary Of" filter.

# Starting page (title-cased, like every link title handled below).
SEED = "Computer Science".title()
# Number of layers to expand: only pages at depth < max_layers are fetched.
max_layers = 2

# BFS bookkeeping: FIFO queue of (depth, title) plus the sets of titles that
# were already queued / already processed, so no page is enqueued twice.
todo_list = [(0, SEED)]
todo_set = {SEED}  # set(SEED) would build a set of the title's characters
done_set = set()

# Instantiating graph.
G = nx.DiGraph()

while todo_list:
    # Peek before popping: entries are appended in non-decreasing depth, so
    # the first entry at max_layers means every expandable page is done.
    layer, page = todo_list[0]
    if layer >= max_layers:
        break
    del todo_list[0]
    done_set.add(page)
    print(layer, page)

    try:
        wikiPage = wiki.page(page)
        # Read the links inside the try as well, so a failure raised while
        # the link list is being retrieved is handled here too
        # (presumably the library fetches lazily — TODO confirm).
        links = wikiPage.links
    except Exception as err:
        # Exception (not a bare except) so KeyboardInterrupt still stops the
        # crawl; report the page that actually failed and move on.
        print("Could not load", page, err)
        continue

    for link in links:
        link = link.title()
        if link in STOPS or link.startswith(("List Of", "Glossary Of")):
            continue
        if link not in todo_set and link not in done_set:
            todo_list.append((layer + 1, link))
            todo_set.add(link)
        # The edge is recorded even when the target was already queued/seen.
        G.add_edge(page, link)

0 Computer Science
1 Acm Computing Classification System
1 Abacus
1 Academic Freedom
1 Acid–Base Reaction
1 Ada Lovelace
1 Adele Goldstine
1 Aerospace Engineering
1 Agile Software Development
1 Agricultural Engineering
1 Alan Kay
1 Alan Turing
1 Algebra
1 Algebraic Data Types
1 Algorithm
1 Algorithm Design
1 Algorithmic Efficiency
1 Algorithmic Trading
1 Allen B. Tucker
1 Alonzo Church
1 Analysis Of Algorithms
1 Analytical Engine
1 Analytical Chemistry
1 Analytical Mechanics
1 Anatomy
1 Anthony Ralston
1 Anthropology
1 Application Security
1 Applied Mathematics
1 Applied Physics
1 Applied Science
1 Archaeology
1 Arithmometer
1 Artificial Intelligence
1 Aspect-Oriented Software Development
1 Association For Computing Machinery
1 Association For Information Systems
1 Astrobiology
1 Astrochemistry
1 Astronomy
1 Astrophysics
1 Atomic Physics
1 Automata Theory
1 Automated Planning And Scheduling
1 Barry Boehm
1 Basic Research
1 Bernoulli Number
1 Bertrand Meyer
1 Biblioteca Nacional De Espa

KeyboardInterrupt: 

In [None]:
# Report the size of the crawled graph.
print("{} nodes, {} edges".format(G.number_of_nodes(), G.number_of_edges()))

Deduplicate

In [119]:
def _contract_pairs(graph, pairs):
    """Merge each (kept, duplicate) node pair and return the resulting graph.

    A pair is skipped when either node is no longer in the graph, which
    happens when an earlier pair already merged it away (e.g. "A"/"As"
    followed by "As"/"Ass").
    """
    for kept, duplicate in pairs:
        if kept in graph and duplicate in graph:
            graph = nx.contracted_nodes(graph, kept, duplicate, self_loops=False)
    return graph


# Drop self-loops first. The edges are materialised into a list because the
# graph must not change while its self-loop iterator is being consumed.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Pass 1: merge a title with its naive plural ("X" and "Xs"), keeping "X".
duplicates = [(node, node + "s") for node in G if node + "s" in G]
print("1")
G = _contract_pairs(G, duplicates)
print("2")

# Pass 2: merge hyphenated titles with their space-separated twin,
# keeping the hyphenated form.
duplicates = [(node, node.replace("-", " ")) for node in G
              if "-" in node and node.replace("-", " ") in G]
print("3")
G = _contract_pairs(G, duplicates)
print("4")

# Overwrite every node's "contraction" attribute with 0.
# NOTE(review): presumably needed because contracted_nodes stores a dict
# there, which the GraphML export cannot serialise — confirm.
nx.set_node_attributes(G, 0, "contraction")

1
2
3
4


Truncate

In [122]:
# Keep only nodes whose total degree is at least 2.
core = [node for node, deg in dict(G.degree()).items() if deg >= 2]
# .copy() turns the read-only subgraph view into an independent graph, so G
# stays mutable (the de-duplication cell can be re-run) and no longer holds
# on to the untruncated graph.
G = nx.subgraph(G, core).copy()
print("{} nodes, {} edges".format(len(G), nx.number_of_edges(G)))
# NOTE(review): an earlier comment here read "2995 nodes, 11817 edges",
# presumably from a previous run; the counts depend on the crawl.
nx.write_graphml(G, "compsci.graphml")

23075 nodes, 122164 edges


Print top degrees

In [125]:
# The 100 nodes with the highest in-degree, as (title, in_degree) pairs in
# descending order; printed one per line as "<in_degree> <title>".
top_indegree = sorted(
    dict(G.in_degree()).items(),
    key=itemgetter(1),
    reverse=True,
)[:100]
print("\n".join("{} {}".format(count, title) for title, count in top_indegree))

309 Help:Authority Control
274 Category:Wikipedia Articles With Gnd Identifiers
273 Integrated Authority File
216 Computer Science
203 Wikipedia:Citation Needed
195 Category:Wikipedia Articles With Ndl Identifiers
168 Category:Wikipedia Articles With Lccn Identifiers
161 Wayback Machine
157 Help:Maintenance Template Removal
121 Software Design
121 Wikipedia:Verifiability
119 Programming Paradigm
119 Statistic
119 Formal Method
118 Modeling Language
118 Software Configuration Management
117 Artificial Intelligence
117 Computer Engineering
117 Software Maintenance
116 Software Deployment
116 Software Development Process
115 Software Quality
112 Software Engineering
110 Mathematic
108 Information Theory
107 Algorithm
105 Computational Physics
104 Software
104 Wikipedia:Citing Sources
103 Systems Engineering
102 Physics
100 Bibliothèque Nationale De France
100 Operations Research
99 Category:Wikipedia Articles With Bnf Identifiers
97 Biology
97 Control Theory
95 Help:Introduction To Refere

In [126]:
# Write the current graph G to a GEXF file.
# NOTE(review): if the cells ran in order, G is the truncated subgraph at this
# point rather than the full crawl — confirm which one is intended.
nx.write_gexf(G, "graphFile_ComputerScience.gexf")

In [8]:
# Reload the graph from the GraphML file and report its node and edge counts.
G = nx.read_graphml("compsci.graphml")
print(G.number_of_nodes(), G.number_of_edges())

23075 122164


In [32]:
# Cull nodes whose degree is below a set value and export what remains.
node_degree_cutoff = 100

# Nodes that meet the cutoff, in the graph's own iteration order.
to_keep = [name for name, degree in G.degree() if degree >= node_degree_cutoff]
small = G.subgraph(to_keep)

print(small.number_of_nodes())
nx.write_gexf(small, "graphFile_ComputerScience_culled.gexf")

418
