# Classify disciplines

The discipline hierarchy is modelled as a directed graph with NetworkX: https://networkx.org/documentation/stable/index.html

In [1]:
import json
import Levenshtein as lvs
import networkx as nx
import yaml

In [2]:
# Load the discipline list. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open until garbage collection.
with open("../etc/b2find_disciplines.json") as fp:
    doc = json.load(fp)
len(doc["disciplines"])

336

In [3]:
# Build the discipline hierarchy as a directed graph with edges parent -> child.
# Each entry is split on "#" into (label, parent, child); entries with fewer
# than three fields are skipped.
G = nx.DiGraph()
attributes = {}
for value in doc["disciplines"]:
    parts = [part.strip() for part in value.split("#")]
    if len(parts) < 3:
        continue
    label, start, end = parts[:3]
    attributes[end] = {"label": label}
    if start != end:
        G.add_edge(start, end)
    else:
        # A self-referencing entry adds no edge. Register the node explicitly
        # so that a root without any children still ends up in the graph
        # (set_node_attributes ignores labels of nodes that are not in G).
        G.add_node(end)
nx.set_node_attributes(G, attributes)
G.number_of_nodes()

336

In [4]:
def classify(text, threshold=0.9):
    """Print every discipline in ``G`` whose name closely matches ``text``.

    The comparison is case-insensitive and uses the Levenshtein ratio between
    ``text`` and each node name. For every node whose ratio reaches
    ``threshold`` one line is printed containing: the node name, the ratio,
    the list of the node's ancestors, its in-degree and its out-degree.

    Args:
        text: The discipline name to look up.
        threshold: Minimum (inclusive) similarity ratio for a node to be
            reported. Defaults to 0.9.
    """
    # Lower-case the query once instead of once per node.
    needle = text.lower()
    for node in G.nodes:
        similarity = lvs.ratio(needle, node.lower())
        if similarity >= threshold:
            print(node, similarity, list(nx.ancestors(G, node)),
                  G.in_degree(node), G.out_degree(node))

In [5]:
classify("astrophysics and astronomy")

Astrophysics and Astronomy 1.0 ['Natural Sciences', 'Physics'] 1 11


In [6]:
classify("physics")

Physics 1.0 ['Natural Sciences'] 1 6


In [7]:
classify("information science")

Information Science 1.0 ['Engineering Sciences', 'Computer Science, Electrical and System Engineering', 'Computer Science'] 1 0


In [8]:
classify("infrmation scince")

Information Science 0.9444444444444444 ['Engineering Sciences', 'Computer Science, Electrical and System Engineering', 'Computer Science'] 1 0


In [9]:
classify("Engineering")

Engineering 1.0 ['Engineering Sciences', 'Construction Engineering and Architecture'] 1 0


In [10]:
classify("Computer Science")

Computer Science 1.0 ['Engineering Sciences', 'Computer Science, Electrical and System Engineering'] 1 7


In [11]:
classify("Natural Sciences")

Natural Sciences 1.0 [] 0 4


In [12]:
list(nx.bfs_successors(G, "Natural Sciences", 1))

[('Natural Sciences', ['Chemistry', 'Physics', 'Mathematics', 'Geosciences'])]

In [13]:
list(nx.ancestors(G, "Mathematics"))

['Natural Sciences']

## Root nodes

In [14]:
# Print every root discipline (a node without incoming edges) with its label.
# The label mapping is built once up front rather than once per root node.
labels = nx.get_node_attributes(G, 'label')
for node in G.nodes:
    if G.in_degree(node) == 0:
        print(node, labels[node])

Humanities 1
Social and Behavioural Sciences 2
Life Sciences 3
Natural Sciences 4
Engineering Sciences 5


In [15]:
# Show each node that has no ancestors, followed by the list of its direct
# successors and a blank separator line.
for node in G.nodes:
    if nx.ancestors(G, node):
        # At least one ancestor: not a root, nothing to print.
        continue
    print(node)
    print(list(G.successors(node)))
    print()

Humanities
['Ancient Cultures', 'History', 'Fine Arts, Music, Theatre and Media Studies', 'Linguistics', 'Literary Studies', 'Social and Cultural Anthropology', 'Theology and Religion Studies', 'Philosophy']

Social and Behavioural Sciences
['Education Sciences', 'Psychology', 'Social Sciences', 'Economics', 'Jurisprudence']

Life Sciences
['Biology', 'Medicine', 'Agriculture, Forestry, Horticulture, Aquaculture and Veterinary Medicine']

Natural Sciences
['Chemistry', 'Physics', 'Mathematics', 'Geosciences']

Engineering Sciences
['Mechanical and industrial Engineering', 'Thermal Engineering/Process Engineering', 'Materials Science and Engineering', 'Computer Science, Electrical and System Engineering', 'Construction Engineering and Architecture']



## Leaf nodes

In [16]:
# Collect every leaf discipline, i.e. every node without outgoing edges,
# in graph iteration order, and show how many there are.
leafs = [node for node, out_degree in G.out_degree() if out_degree == 0]
len(leafs)

275

## Store as yaml

In [17]:
def node_name(node, with_label=False):
    """Return the display name of ``node``.

    Args:
        node: The node identifier.
        with_label: When true and the node has a non-empty label, the label
            is prefixed to the name as ``"<label>#<node>"``.

    Returns:
        ``node`` unchanged, or the labelled form described above.
    """
    if not with_label:
        # Skip the label lookup entirely when the label would not be used.
        return node
    label = node_label(node)
    return f"{label}#{node}" if label else node

def node_label(node):
    """Return the ``"label"`` attribute of ``node`` in ``G``.

    Returns:
        The label, or ``None`` when the node is not in the graph or carries
        no ``"label"`` attribute.
    """
    if node not in G:
        return None
    # Read this node's attribute dict directly instead of building a label
    # mapping for the whole graph on every call (this function is used as a
    # sort key, so it is called once per sorted node).
    return G.nodes[node].get("label")

def sort_nodes(nodes):
    """Return ``nodes`` as a new list ordered by their labels.

    Labels are compared as returned by ``node_label``. Nodes without a label
    (``None``) are placed after all labelled nodes instead of raising
    ``TypeError`` when ``None`` is compared with a string. The sort is
    stable, so nodes with equal labels keep their input order.

    Args:
        nodes: An iterable of node identifiers.
    """
    def sort_key(node):
        label = node_label(node)
        # (False, label) sorts before (True, ""), so labelled nodes come
        # first in label order and unlabelled ones last.
        return (label is None, "" if label is None else label)

    return sorted(nodes, key=sort_key)

def root_nodes():
    """Return the nodes of ``G`` that have no incoming edges.

    The result is a list in the graph's node iteration order.
    """
    return [node for node, in_degree in G.in_degree() if in_degree == 0]

def build_forest(nodes, with_labels=False):
    """Build one nested tree per node in ``nodes``.

    Args:
        nodes: The nodes to use as tree roots.
        with_labels: Forwarded to ``build_tree``; controls whether node names
            carry their label prefix.

    Returns:
        A dict with the single key ``'root'`` mapping to the list of trees,
        ordered as returned by ``sort_nodes``.
    """
    return {
        'root': [build_tree(node, with_labels) for node in sort_nodes(nodes)]
    }

def build_tree(node, with_labels=False):
    """Return the subtree of ``G`` rooted at ``node`` as nested containers.

    Args:
        node: The node to start from.
        with_labels: Passed to ``node_name`` for this node and all
            descendants.

    Returns:
        The node's name (as produced by ``node_name``) for a node without
        successors; otherwise a one-entry dict mapping that name to the list
        of its children's subtrees, ordered by ``sort_nodes``.
    """
    successors = list(G.successors(node))
    display_name = node_name(node, with_labels)
    if not successors:
        # Leaf: represented by its bare name.
        return display_name
    # Inner node: recurse into every child.
    subtrees = [build_tree(child, with_labels)
                for child in sort_nodes(successors)]
    return {display_name: subtrees}

In [28]:
# Render the hierarchy below all root nodes as YAML, using plain node names
# (no label prefix), and print the resulting document.
roots = root_nodes()
forest = build_forest(roots, with_labels=False)['root']
print(yaml.dump(forest))

- Humanities:
  - Ancient Cultures:
    - Prehistory
    - Classical Philology
    - Ancient History
    - Classical Archaeology
    - Archaeology
    - Egyptology and Ancient Near Eastern Studies
  - History:
    - Medieval History
    - Early Modern History
    - Modern and Current History
    - History of Science
    - History of Political Thought
  - Fine Arts, Music, Theatre and Media Studies:
    - Art History
    - Fine Arts
    - Dance
    - Theater
    - Film studies
    - Photography
    - Music
    - Design
    - Mixed Media and Media Studies
  - Linguistics:
    - General and Applied Linguistics
    - Individual Linguistics
    - Non-European Languages,Historical Linguistics
    - Typology, Etymology, Morphology, Phonetics
  - Literary Studies:
    - Medieval European Literature
    - Modern European Literature
    - North American Literature
    - South American Literature
    - African Literature
    - Asian Literature
    - Australia/Oceania Literature
    - Literary The

In [29]:
# nx.get_node_attributes(G, "label")

In [30]:
# Write the hierarchy to disk. The context manager flushes and closes the
# file deterministically instead of relying on garbage collection.
with open("../etc/b2find_disciplines.yaml", "w") as stream:
    yaml.dump(forest, stream)

## Read from yaml

In [21]:
# Read the stored hierarchy back; the context manager closes the file handle
# once parsing is done.
with open("../etc/b2find_disciplines.yaml") as stream:
    disciplines = yaml.safe_load(stream)
# disciplines

In [22]:
#G = nx.DiGraph()
#attributes = {}

In [23]:
def read_tree(node, others):
    """Recursively add the structure ``others`` below ``node`` to ``G``.

    ``others`` is handled according to its type:

    * ``str``  -- added as a child of ``node``.
    * ``list`` -- every item is processed as further content of ``node``.
    * ``dict`` -- every key is added as a child of ``node`` and its value is
      processed recursively as that child's content.

    Values of any other type are ignored. Each time a child is attached, the
    parent is recorded in the module-level ``attributes`` mapping with the
    placeholder label ``"1"``.

    Args:
        node: The parent node.
        others: The nested structure to attach below ``node``.
    """
    if isinstance(others, str):
        attributes[node] = {"label": "1"}
        G.add_edge(node, others)
    elif isinstance(others, list):
        for item in others:
            read_tree(node, item)
    elif isinstance(others, dict):
        for child, subtree in others.items():
            attributes[node] = {"label": "1"}
            G.add_edge(node, child)
            read_tree(child, subtree)

In [24]:
#read_tree('root', disciplines)
#nx.set_node_attributes(G, attributes)

In [25]:
#list(G.nodes())[0]

In [26]:
#G.number_of_nodes()

In [27]:
#forest = build_forest(["root"], True)['root']
#print(yaml.dump(forest))