# Classify disciplines

NetworkX documentation: https://networkx.org/documentation/stable/index.html

In [1]:
import json
import Levenshtein as lvs
import networkx as nx

In [2]:
# Load the discipline list from the JSON file. The context manager closes the
# file handle as soon as parsing is done instead of leaving it open until
# garbage collection; JSON text is read as UTF-8 regardless of the platform
# default encoding.
with open("../etc/b2find_disciplines.json", encoding="utf-8") as fp:
    doc = json.load(fp)
# Number of entries under the "disciplines" key.
len(doc["disciplines"])

336

In [3]:
# Build a directed graph from the discipline entries: each entry made of three
# "#"-separated fields contributes one edge from its second field to its third.
disc_graph = nx.DiGraph()
for entry in doc["disciplines"]:
    fields = [field.strip() for field in entry.split("#")]
    # Entries that do not have exactly three fields carry no edge and are skipped.
    if len(fields) == 3:
        _, parent, child = fields
        # Identical endpoints would only produce a self-loop, so leave them out.
        if parent != child:
            disc_graph.add_edge(parent, child)
disc_graph.number_of_nodes()

340

In [4]:
def classify(text, threshold=0.9):
    """Print every node of ``disc_graph`` whose name closely matches *text*.

    The comparison is case-insensitive: both *text* and each node name are
    lowercased before their Levenshtein similarity ratio is computed.

    For each match one line is printed containing the node name, the
    similarity ratio, the list of the node's ancestors in ``disc_graph``,
    its in-degree and its out-degree.

    Args:
        text: The discipline name to look up.
        threshold: Minimum similarity ratio a node must reach to be printed.
            Defaults to 0.9.

    Returns:
        None. Results are printed, not returned.
    """
    # Lowercase the query once; it does not change between iterations.
    needle = text.lower()
    for node in disc_graph.nodes:
        similarity = lvs.ratio(needle, node.lower())
        if similarity >= threshold:
            print(node, similarity, list(nx.ancestors(disc_graph, node)),
                  disc_graph.in_degree(node), disc_graph.out_degree(node))

In [5]:
# All-lowercase query: classify() lowercases both sides, so it still matches
# the capitalised node name with ratio 1.0.
classify("astrophysics and astronomy")

Astrophysics and Astronomy 1.0 ['Natural Sciences', 'Physics'] 1 11


In [6]:
# Single-word query matching a node name exactly (ignoring case).
classify("physics")

Physics 1.0 ['Natural Sciences'] 1 6


In [7]:
# Query for a node without outgoing edges (out-degree 0 in the recorded output).
classify("information science")

Information Science 1.0 ['Computer Science', 'Computer Science, Electrical and System Engineering', 'Engineering Sciences'] 1 0


In [8]:
# Misspelled query: the node is still found because the similarity ratio
# (about 0.94 in the recorded output) stays above the 0.9 threshold.
classify("infrmation scince")

Information Science 0.9444444444444444 ['Computer Science', 'Computer Science, Electrical and System Engineering', 'Engineering Sciences'] 1 0


In [9]:
# Per the recorded output only the node named "Engineering" is printed; no
# other node name reaches the similarity threshold for this query.
classify("Engineering")

Engineering 1.0 ['Engineering Sciences', 'Construction Engineering and Architecture'] 1 0


In [10]:
# Query for a node that has both ancestors and outgoing edges.
classify("Computer Science")

Computer Science 1.0 ['Computer Science, Electrical and System Engineering', 'Engineering Sciences'] 1 7


In [11]:
# Query for a root node: the recorded output shows no ancestors and in-degree 0.
classify("Natural Sciences")

Natural Sciences 1.0 [] 0 4


In [12]:
# Direct children of "Natural Sciences": breadth-first successors with the
# search depth limited to 1 (the third positional argument).
list(nx.bfs_successors(disc_graph, "Natural Sciences", 1))

[('Natural Sciences', ['Chemistry', 'Physics', 'Mathematics', 'Geosciences'])]

In [13]:
# Every node from which "Mathematics" can be reached by following edges.
list(nx.ancestors(disc_graph, "Mathematics"))

['Natural Sciences']

## Root nodes

In [14]:
# Print the root nodes, i.e. the nodes that no edge points to.
for node, degree in disc_graph.in_degree():
    if degree:
        # At least one incoming edge: not a root.
        continue
    print(node)

Humanities
Social and Behavioural Sciences
Life Sciences
Zoology
Agriculture, Forestry, Horticulture, Aquaculture and Veterinary Medicine
Agriculture, Forestry, Horticulture, Aquaculture
Natural Sciences
Geospheric Science
Engineering Sciences
Process Engineering, Technical


In [15]:
# Print each root node followed by the list of its direct successors and a
# blank separator line.
for node in disc_graph.nodes:
    # A node has no ancestors exactly when nothing points to it. The edges of
    # disc_graph never connect a node to itself, so the in-degree check is
    # equivalent to building the whole ancestor set and testing it for
    # emptiness, without traversing the graph once per node.
    degree = disc_graph.in_degree(node)
    if degree == 0:
        print(node)
        print(list(disc_graph.successors(node)))
        print()

Humanities
['Ancient Cultures', 'History', 'Fine Arts, Music, Theatre and Media Studies', 'Linguistics', 'Literary Studies', 'Social and Cultural Anthropology', 'Theology and Religion Studies', 'Philosophy']

Social and Behavioural Sciences
['Education Sciences', 'Psychology', 'Social Sciences', 'Economics', 'Jurisprudence']

Life Sciences
['Biology', 'Plant Sciences', 'Medicine', 'Agriculture, Forestry, Horticulture and Veterinary Medicine']

Zoology
['Systematics and Morphology', 'Evolution, Anthropology', 'Animal Ecology, Biodiversity and Ecosystem Research', 'Sensory and Behavioural Biology', 'Biochemistry and Animal Physiology', 'Animal Genetics, Cell and Developmental Biology']

Agriculture, Forestry, Horticulture, Aquaculture and Veterinary Medicine
['Agriculture, Forestry, Horticulture', 'Veterinary Medicine']

Agriculture, Forestry, Horticulture, Aquaculture
['Soil Sciences', 'Plant Cultivation', 'Plant Nutrition', 'Ecology of Agricultural Landscapes', 'Plant Breeding', 'Phyto

## Leaf nodes

In [16]:
# Collect the leaf nodes: every node without outgoing edges, in graph order.
leafs = [
    candidate
    for candidate in disc_graph.nodes
    if disc_graph.out_degree(candidate) == 0
]
len(leafs)

280