# NASA dataset keywords analysis

https://www.tidytextmining.com/nasa.html

In [72]:
import json
import pandas as pd
import networkx as nx

In [64]:
from kganalytics.network_generation import generate_cooccurrence_network

from kganalytics.metrics import (compute_degree_centrality,
                               compute_pagerank_centrality,
                               compute_betweenness_centrality,
                               detect_communities,
                               compute_all_metrics)
from kganalytics.export import (save_network,
                              save_to_gephi)
from kganalytics.paths import (minimum_spanning_tree,
                             top_n_paths,
                             top_n_tripaths,
                             single_shortest_path,
                             top_n_nested_paths,
                             graph_from_paths,
                             pretty_print_paths,
                             pretty_print_tripaths)
from kganalytics.utils import subgraph_by_types

In [46]:
# Load the NASA JSON dump. JSON interchange is UTF-8 by specification, so the
# encoding is pinned explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open("data/nasa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [48]:
# Flatten the catalogue into long format: one [dataset identifier, keyword]
# record per keyword. Entries without a "keyword" field add no records.
rows = []
for entry in data['dataset']:
    dataset_id = entry["identifier"]
    rows.extend([dataset_id, kw] for kw in entry.get("keyword", ()))

keyword_data = pd.DataFrame(rows, columns=["dataset", "keyword"])

In [49]:
# Preview the long-format table (columns: "dataset", "keyword").
keyword_data

Unnamed: 0,dataset,keyword
0,urn:nasa:pds:context_pds3:data_set:data_set.ro...,international rosetta mission
1,urn:nasa:pds:context_pds3:data_set:data_set.ro...,earth
2,urn:nasa:pds:context_pds3:data_set:data_set.ro...,unknown
3,C1973352326-GHRC_CLOUD,earth science
4,C1973352326-GHRC_CLOUD,atmosphere
...,...,...
110742,NASA-877__2,apollo
110743,NASA-877__2,catalog
110744,NASA-877__2,lunar
110745,TECHPORT_94299,active


In [50]:
# For every keyword, gather the set of dataset identifiers it is attached to.
keyword_groups = keyword_data.groupby("keyword")
keyword_occurrence = keyword_groups.agg(set)

In [51]:
# Frequency of a keyword = size of its dataset set.
keyword_occurrence["frequency"] = keyword_occurrence["dataset"].map(len)

In [57]:
# List the 20 keywords with the largest "frequency", most frequent first.
print("20 most frequent keywords:\n")
top_keywords = keyword_occurrence.nlargest(20, "frequency").index
for keyword in top_keywords:
    print("\t", keyword)

20 most frequent keywords:

	 completed
	 earth science
	 atmosphere
	 national geospatial data asset
	 ngda
	 active
	 land surface
	 oceans
	 goddard space flight center
	 glenn research center
	 langley research center
	 spectral/engineering
	 jet propulsion laboratory
	 ames research center
	 johnson space center
	 biosphere
	 atmospheric water vapor
	 atmospheric radiation
	 marshall space flight center
	 atmospheric temperature


In [58]:
# Number of distinct dataset identifiers that carry at least one keyword.
n_datasets = len(pd.unique(keyword_data["dataset"]))

In [62]:
# Build the keyword co-occurrence network from the per-keyword dataset sets
# stored in the "dataset" column of `keyword_occurrence`.
# `n_most_frequent=1000` restricts the graph to the 1000 most frequent
# keywords (see the "Selected 1000 most frequent terms" log line).
# NOTE(review): `n_datasets` is presumably the total used to normalise
# co-occurrence statistics such as NPMI -- confirm against kganalytics.
comention_network = generate_cooccurrence_network(
    keyword_occurrence, "dataset", n_datasets,
    n_most_frequent=1000,
    parallelize=True)

Fitering data.....
Selected 1000 most frequent terms
Examining 499500 pairs of terms for co-occurrence...
Generated 55442 edges                    
Created a co-occurrence graph:
	number of nodes:  1000
	number of edges:  55442
Saving the edges...
Creating a graph object...


In [74]:
# Share of possible keyword pairs that are actually connected by an edge.
network_density = nx.density(comention_network)
print("Density of the constructed network: ", network_density)

Density of the constructed network:  0.11099499499499499


In [76]:
# Degree centrality using the "frequency" weight; the call prints the 20
# top-ranked nodes. The return value is deliberately discarded.
# NOTE(review): presumably the scores are also stored on the nodes (a
# "degree_frequency" attribute is exported to Gephi later) -- confirm.
_ = compute_degree_centrality(comention_network, ["frequency"], 20)

Top n nodes by frequency:
	earth science (37219)
	atmosphere (18258)
	national geospatial data asset (16742)
	ngda (16742)
	land surface (12152)
	oceans (9404)
	completed (8786)
	atmospheric water vapor (8248)
	atmospheric temperature (7480)
	atmospheric radiation (7402)
	biosphere (7057)
	spectral/engineering (6245)
	precipitation (5923)
	clouds (5394)
	atmospheric chemistry (4962)
	ocean temperature (4914)
	vegetation (4684)
	atmospheric pressure (4471)
	terrestrial hydrosphere (4244)
	cryosphere (4222)



In [77]:
# PageRank centrality using the "frequency" weight; the call prints the 20
# top-ranked nodes. The return value is deliberately discarded.
# NOTE(review): presumably the scores are also stored on the nodes (a
# "pagerank_frequency" attribute is exported to Gephi later) -- confirm.
_ = compute_pagerank_centrality(comention_network, ["frequency"], 20)

Top n nodes by frequency:
	earth science (0.02)
	completed (0.02)
	atmosphere (0.01)
	active (0.01)
	national geospatial data asset (0.01)
	ngda (0.01)
	pds (0.01)
	land surface (0.01)
	spice (0.00)
	oceans (0.00)
	labeling (0.00)
	space science (0.00)
	jupiter (0.00)
	calibration (0.00)
	mars (0.00)
	earth (0.00)
	atmospheric water vapor (0.00)
	biosphere (0.00)
	saturn (0.00)
	spectral/engineering (0.00)



In [78]:
# Betweenness centrality using the "distance_npmi" weight; the call prints the
# 20 top-ranked nodes. The return value is deliberately discarded.
# NOTE(review): "distance_npmi" is presumably an NPMI-derived edge distance
# (smaller = more strongly associated) -- confirm against kganalytics.
_ = compute_betweenness_centrality(
    comention_network, ["distance_npmi"], 20)

Top n nodes by distance_npmi:
	10199 chariklo (0.12936249542795972)
	nix (0.0777490917771479)
	ida (0.07718941386276056)
	ceres (0.07389353281136848)
	planetary science (0.07308510915725344)
	project (0.07044118266563157)
	atlas (0.0632516283818889)
	safety (0.06299485858604095)
	management (0.059841404530783286)
	active (0.05961472494538627)
	star (0.054409118537375054)
	imagery (0.054409118537375054)
	iss (0.051807318340384476)
	operations (0.04893570925635054)
	space (0.048007927767446806)
	time (0.04567092142242443)
	working group for planetary system nomenclature (0.044734112870385416)
	astronomy (0.044442237829011376)
	2p/encke 1 (1818 w1) (0.044327895029297834)
	gsfc (0.044102218450915845)



In [79]:
# Run community detection twice with different edge weights. `set_attr` names
# the attribute each partition is written to ("community" for raw frequency,
# "community_npmi" for NPMI), so both results can coexist on the graph.
# NOTE(review): presumably `set_attr` is a node attribute -- confirm.
_ = detect_communities(comention_network, weight="frequency", set_attr="community")
_ = detect_communities(comention_network, weight="npmi", set_attr="community_npmi")

Detecting communities...
Best network partition:
	 Number of communities: 13
	 Modularity: 0.6383457625721062
Detecting communities...
Best network partition:
	 Number of communities: 7
	 Modularity: 0.16926481232934845


In [89]:
# Export the co-occurrence network for import into Gephi.
# The attribute mappings are named up front to keep the call itself short.
network_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community": "Community",
}
network_edge_columns = {"npmi": "Weight"}

save_to_gephi(
    comention_network,
    "data/gephi_nasa_comention",
    node_attr_mapping=network_node_columns,
    edge_attr_mapping=network_edge_columns,
)

The network representation saved above can be imported into Gephi to produce graph visualizations such as the following:

In the figures below, colors represent communities detected using the raw frequency of the co-occurrence edges, node sizes are proportional to the nodes' PageRank, and edge thickness is proportional to the NPMI values.

![alt text](./figures/nasa/full_network.png "NASA dataset keywords co-occurrence network")



Community | Zoom
- | - 
Celestial bodies <img src="./figures/nasa/celestial_body_cluster.png" alt="Drawing" style="width: 400px;"/>|<img src="./figures/nasa/celestial_body_cluster_zoom.png" alt="Drawing" style="width: 400px;"/>
Earth science <img src="./figures/nasa/earth_science.png" alt="Drawing" style="width: 400px;"/>|<img src="./figures/nasa/earch_science_zoom.png" alt="Drawing" style="width: 400px;"/>
Space programs and missions <img src="./figures/nasa/programs_missions.png" alt="Drawing" style="width: 400px;"/>|<img src="./figures/nasa/programs_missions_zoom.png" alt="Drawing" style="width: 400px;"/>

In [None]:

![alt text]( "Celestial bodies")![alt text](./figures/nasa/nasa_celestial_body_cluster_zoom.png "Celestial bodies")

In [82]:
# Minimum spanning tree of the co-occurrence network, with "distance_npmi"
# as the edge weight being minimised.
tree = minimum_spanning_tree(comention_network, weight="distance_npmi")

In [90]:
# Export the spanning tree for import into Gephi, using the same attribute
# mappings as for the full network.
tree_node_columns = {
    "degree_frequency": "Degree",
    "pagerank_frequency": "PageRank",
    "betweenness_distance_npmi": "Betweenness",
    "community": "Community",
}
tree_edge_columns = {"npmi": "Weight"}

save_to_gephi(
    tree,
    "data/gephi_nasa_spanning_tree",
    node_attr_mapping=tree_node_columns,
    edge_attr_mapping=tree_edge_columns,
)

![alt text](./figures/nasa/tree.png "Minimum spanning tree")

In [103]:
# Single shortest path between the "space shuttle" and "mars" keywords.
# NOTE(review): the saved output for this cell does not contain
# "space shuttle"; it is presumably stale -- re-run to confirm.
single_shortest_path(comention_network, "space shuttle", "mars")

['earth', 'mars']

In [87]:
# Request n=10 paths from "space shuttle" to "mars", measured by the
# "distance_npmi" edge attribute.
# NOTE(review): strategy="yen" presumably selects Yen's k-shortest-paths
# algorithm -- confirm against kganalytics.
paths = top_n_paths(
    comention_network, "space shuttle", "mars", n=10,
    distance="distance_npmi",
    strategy="yen")

In [88]:
# Print the paths found above, aligned between their common endpoints.
pretty_print_paths(paths)

space shuttle <->                                                  <-> mars
                  vehicles <-> mars global surveyor
                  vehicles <-> orbiter <-> mars global surveyor
                  shuttle <-> mars reconnaissance orbiter
                  3d model <-> mars global surveyor
                  3d model <-> orbiter <-> mars global surveyor
                  shuttle <-> vehicles <-> mars global surveyor
                  spacecraft <-> vehicles <-> mars global surveyor
                  3d model <-> vehicles <-> mars global surveyor
                  vehicles <-> 3d model <-> mars global surveyor
                  shuttle <-> 3d model <-> mars global surveyor


In [94]:
# Request n=10 paths from "jupiter" to "mars", measured by the
# "distance_npmi" edge attribute. This rebinds `paths`.
# NOTE(review): strategy="yen" presumably selects Yen's k-shortest-paths
# algorithm -- confirm against kganalytics.
paths = top_n_paths(
    comention_network, "jupiter", "mars", n=10,
    distance="distance_npmi",
    strategy="yen")

In [92]:
# Print the current value of `paths`.
# NOTE(review): the saved output starts at "earth", whereas the preceding cell
# queries "jupiter" -> "mars"; the output is presumably stale (execution
# counts are out of order) -- re-run to confirm.
pretty_print_paths(paths)

earth <->                      <-> mars
          c/ison (2012 s1)
          vega
          wasp-3
          xo-2
          xo-3
          vega <-> phobos
          2p/encke 1 (1818 w1)
          epoxi
          gj 436
          hat-p-4


In [99]:
# Nested path search from "shuttle" to "mars" over "distance_npmi".
# The saved result is a list of 3-tuples (source, intermediate, target).
# NOTE(review): the exact roles of `n`, `nested_n` and `depth` are not visible
# here -- check the kganalytics documentation before tuning them.
paths = top_n_nested_paths(
    comention_network, "shuttle", "mars",
    n=10, nested_n=2, depth=2, distance="distance_npmi",
    strategy="naive")

In [100]:
# Inspect the raw 3-tuples returned by the nested path search.
paths

[('shuttle', 'spacecraft', 'mars reconnaissance orbiter'),
 ('shuttle', 'launch', 'mars reconnaissance orbiter'),
 ('mars reconnaissance orbiter', 'mars express', 'mars'),
 ('mars reconnaissance orbiter', 'viking', 'mars'),
 ('shuttle', 'space shuttle', '3d model'),
 ('shuttle', 'vehicles', '3d model'),
 ('3d model', 'mars global surveyor', 'mars'),
 ('3d model', 'mars reconnaissance orbiter', 'mars'),
 ('shuttle', '3d model', 'satellite'),
 ('shuttle', 'spacecraft', 'satellite'),
 ('satellite', 'mars global surveyor', 'mars'),
 ('satellite', 'mars reconnaissance orbiter', 'mars')]

In [97]:
# NOTE(review): `paths` was last assigned by top_n_nested_paths, whose saved
# result is a list of 3-tuples. `pretty_print_tripaths` is imported at the top
# of the notebook but not called in the cells shown, so it is presumably the
# intended printer here. The saved output also appears to come from an earlier
# value of `paths` -- re-run on a fresh kernel and confirm.
pretty_print_paths(paths)

earth <->                      <-> c/ison (2012 s1)
          messenger
          moon
          2p/encke 1 (1818 w1)
          vega
          c/linear (2002 t7)
          m42
          phobos
          deimos
          xo-2
          xo-3
          c/ison (2012 s1)
          xo-2
          xo-3
          wasp-3
          c/ison (2012 s1)
          xo-3
          xo-2
          wasp-3
          c/ison (2012 s1)
          xo-2
          messenger
          mercury
          c/ison (2012 s1)
          vega
          wasp-3
          xo-2
          c/ison (2012 s1)
          wasp-3
          wasp-3
          xo-2
          c/ison (2012 s1)
          wasp-3
          wasp-3
          xo-2
          c/ison (2012 s1)
          wasp-3
          wasp-3
          xo-2
          c/ison (2012 s1)
          wasp-3
