# Computer Science Knowledge Graph
A network-based approach to analyzing computer science as a field using Wikipedia data

![Computer Science Knowledge Graph](Stratified_Comp_Sci_Wiki_Connections_Labeled.png)

## Imports

In [1]:
from collections import deque
from operator import itemgetter

import networkx as nx
import wikipediaapi

#### Setting variables and stops (pages that have too many links that aren't relevant or useful)

In [2]:
# Page titles excluded from the crawl. Entries are written in title case
# because link names are passed through str.title() before being compared
# against this tuple.
STOPS = (
    # Serial / book / name identifiers
    "International Standard Serial Number",
    "International Standard Book Number",
    "National Diet Library",
    "International Standard Name Identifier",
    "International Standard Book Number (Identifier)",
    # Citation databases and archives
    "Pubmed Identifier",
    "Pubmed Central",
    "Digital Object Identifier",
    "Arxiv",
    "Proc Natl Acad Sci Usa",
    "Bibcode",
    "Library Of Congress Control Number",
    "Jstor",
)


In [3]:
# Wikipedia client used by the crawl cell; 'en' is passed as the first positional argument.
# NOTE(review): newer Wikipedia-API releases reportedly expect a user agent string
# here — confirm against the installed version before re-running.
wiki = wikipediaapi.Wikipedia('en')

#### Setting SEED (starting site)

In [5]:
# Breadth-first crawl of Wikipedia links starting from SEED, building the
# directed graph G (edge page -> linked page).

# Starting page. .title() applies the same capitalisation that is applied to
# every link name below, so the seed compares equal to links pointing at it.
SEED = "Neuroscience".title()
# Pages at depth < max_layers are expanded; pages first seen at depth
# max_layers are added to the graph as link targets but never expanded.
max_layers = 2

# BFS bookkeeping: FIFO queue of (depth, title), plus the titles ever queued
# and the titles already expanded.
todo_list = deque([(0, SEED)])
todo_set = {SEED}  # set(SEED) would build a set of single characters
done_set = set()

# Instantiate the graph
G = nx.DiGraph()

# The queue is ordered by depth, so the crawl is finished as soon as its head
# reaches max_layers (or the queue runs dry).
while todo_list and todo_list[0][0] < max_layers:
    layer, page = todo_list.popleft()
    done_set.add(page)
    print(layer, page)

    try:
        # The link lookup is kept inside the try block so that a failure while
        # retrieving the links skips this page instead of aborting the crawl.
        links = wiki.page(page).links
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C / kernel interrupt still works.
        print("Could not load", page, "-", err)
        continue

    for link in links:
        link = link.title()
        # Skip stop pages, list/glossary pages and namespaced titles ("Category:...").
        if (link in STOPS
                or link.startswith("List Of")
                or link.startswith("Glossary Of")
                or ":" in link):
            continue
        if link not in todo_set and link not in done_set:
            todo_list.append((layer + 1, link))
            todo_set.add(link)
        G.add_edge(page, link)

0 Neuroscience
1 1700 Bc
1 Abiogenesis
1 Abu Al-Qasim Al-Zahrawi
1 Academic Press
1 Action Potential
1 Action Potentials
1 Addiction Medicine
1 Adolf Beck (Physiologist)
1 Affect (Psychology)
1 Affective Neuroscience
1 Alan Lloyd Hodgkin
1 Allen Institute For Brain Science
1 Amyotrophic Lateral Sclerosis
1 Anatomy
1 Ancient Egypt
1 Ancient Greek Medicine
1 Andrew Huxley
1 Anesthesiology
1 António Egas Moniz
1 Aplysia
1 Applied Science
1 Aristotle
1 Artificial Neural Network
1 Arvid Carlsson
1 Astrobiology
1 Australia
1 Austria
1 Austria-Hungary
1 Autonomic Nervous System
1 Averroes
1 Avicenna
1 Avon (Publishers)
1 Axon
1 Axoplasm
1 Bbc
1 Brain Initiative
1 Basic Books
1 Basic Research
1 Bat
1 Behavioral Epigenetics
1 Behavioral Neurology
1 Behavioral Neuroscience
1 Behavioral Sciences
1 Behavioural Genetics
1 Bernard Katz
1 Biochemistry
1 Biogeography
1 Biohistory
1 Bioinformatics
1 Biological Sciences
1 Biological Classification
1 Biological Neural Network
1 Biological Neuron Model
1 

1 Synthetic Biology
1 Systematics
1 Systems Biology
1 Systems Neuroscience
1 Teratology
1 The Mit Press
1 The Man Who Mistook His Wife For A Hat
1 Theoretical Neuroscience
1 Thomas Willis
1 Timeline Of Biology And Organic Chemistry
1 Torsten Wiesel
1 Toxicology
1 Translational Neuroscience
1 Translational Research
1 Transportation Theory (Psychology)
1 Trepanation
1 Ulf Von Euler
1 Ultrasound
1 United Kingdom
1 United States
1 Vesalius
1 Virology
1 Virophysics
1 Visual Perception
1 Visual System
1 W. W. Norton
1 Walter Reed Army Institute Of Research
1 Walter Rudolf Hess
1 Wayback Machine
1 Xenobiology
1 Zoology


In [8]:
# Report how large the crawled graph is.
print("{} nodes, {} edges".format(G.number_of_nodes(), G.number_of_edges()))

61348 nodes, 140359 edges


#### De-duplicating the dataset

In [9]:
# Merge nodes that refer to the same page under slightly different titles.

# Drop self-references first. nx.selfloop_edges(G) replaces the old
# G.selfloop_edges() method, and the result is materialised with list()
# because removing edges while the generator is still running mutates the
# graph mid-iteration.
G.remove_edges_from(list(nx.selfloop_edges(G)))

# Pass 1 - naive plurals: "Action Potential" absorbs "Action Potentials".
duplicates = [(node, node + "s") for node in G if node + "s" in G]
for keep, drop in duplicates:
    # An earlier contraction may already have removed one side of the pair
    # (e.g. "X" / "Xs" / "Xss"), so re-check membership before merging.
    if keep in G and drop in G:
        G = nx.contracted_nodes(G, keep, drop, self_loops=False)

# Pass 2 - hyphenation: the hyphenated title absorbs its space-separated twin.
duplicates = [(x, y) for x, y
              in [(node, node.replace("-", " ")) for node in G]
              if x != y and y in G]
for keep, drop in duplicates:
    # Two hyphenated titles can map onto the same spaced title; skip the
    # second one once the target has been merged away.
    if keep in G and drop in G:
        G = nx.contracted_nodes(G, keep, drop, self_loops=False)

# Overwrite the "contraction" attribute on every node with a plain 0.
# NOTE(review): presumably done because contracted_nodes stores a dict under
# this key, which the GraphML writer cannot serialise — confirm.
nx.set_node_attributes(G, 0, "contraction")

### Truncating nodes based on degree

In [10]:
# Prune the periphery: only nodes whose degree is at least 2 are kept.
core = [name for name, degree in G.degree() if degree >= 2]
G = G.subgraph(core)
print("{} nodes, {} edges".format(G.number_of_nodes(), G.number_of_edges()))
# Persist the pruned graph; a later cell reads this same file back in.
nx.write_graphml(G, "../../graphFile_Neuroscience.graphml")

18891 nodes, 98233 edges


#### Print top degrees

In [None]:
# The 100 (node, in-degree) pairs with the highest in-degree, largest first.
top_indegree = sorted(G.in_degree(), key=itemgetter(1), reverse=True)[:100]
# Uncomment to list them as "<in-degree> <title>", one per line:
#print("\n".join(map(lambda t: "{} {}".format(*reversed(t)), top_indegree)))

### Saving and loading graphs

In [None]:
# Write the current graph G to a GEXF file (nothing is printed by this cell).
nx.write_gexf(G, "../../graphFile_Neuroscience.gexf")

In [12]:
# Reload the pruned graph from disk and show its node and edge counts.
G = nx.read_graphml("../../graphFile_Neuroscience.graphml")
print(G.number_of_nodes(), G.number_of_edges())

18891 98233


In [13]:
# Keep only well-connected pages: nodes whose degree reaches the cutoff below.
node_degree_cutoff = 50

to_keep = [name for name, degree in G.degree() if degree >= node_degree_cutoff]
small = G.subgraph(to_keep)

# Show how many nodes survive, then export the reduced graph as GEXF.
print(small.number_of_nodes())
nx.write_gexf(small, "../../graphFile_Neuroscience_culled2.gexf")

360


In [14]:
# Load a GraphML file that none of the cells visible here writes.
# NOTE(review): presumably exported from an external tool after layout and
# metric computation — confirm where "graphFile_SubGraph.graphml" comes from.
mod = nx.read_graphml("graphFile_SubGraph.graphml")
# Display the stored attribute dict for one node (None if the node is absent).
mod.nodes.get("Computer Science")

{'Clustering Coefficient': 0.1223583072423935,
 'Degree': 611,
 'Eigenvector Centrality': 0.9718249141879425,
 'In-Degree': 198,
 'Modularity Class': 6,
 'Out-Degree': 413,
 'b': 251,
 'contraction': 0,
 'g': 255,
 'label': 'Computer Science',
 'r': 209,
 'size': 10.0,
 'x': -701.28564,
 'y': 299.30154}