# Granularity
This is a thought experirment for advanced discovery of related Earthdata events. The idea behind this project is that the metadata collected by NASA has extremely high value and the ability to search that data effiecently is pivotal to the discovery of relevant Earthdata information. With this project I sought out a bare bones example of how, given a few key features of a CMR Granule, one may be able to correlate data across collections, concepts, providers, and Natural events. 

Ideally this can be expanded in the furture to other features of a granule in the future.

In [184]:
# Imports
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import directed_hausdorff
import numpy as np

import requests

pd.options.display.max_rows = 4000

## Data collection pipeline

In [151]:
# Query to gather some collections to search through
response = requests.get("https://cmr.earthdata.nasa.gov/search/collections.json?has_granules=true&page_size=500")
json_result = response.json()

final_granules_list = []

# Obtain a list of concept ids
for item in json_result['feed']['entry']:
    # Obtain up to 25 granules from each collection id
    granules = requests.get(f"https://cmr.earthdata.nasa.gov/search/granules.json?collection_concept_id={item['id']}&page_size=25").json()

    for granule in granules['feed']['entry']:
        final_granules_list.append(granule)

len(final_granules_list)


9883

In [152]:
final_granules_list[0]

{'time_start': '1984-12-25T00:00:00.000Z',
 'updated': '2023-03-10T22:34:22.000Z',
 'dataset_id': '15 Minute Stream Flow Data: USGS (FIFE)',
 'points': ['39.1019 -96.595'],
 'data_center': 'ORNL_DAAC',
 'title': 'FIFE_STRM_15M.43601715.s15',
 'coordinate_system': 'CARTESIAN',
 'day_night_flag': 'BOTH',
 'time_end': '1985-01-01T00:00:00.000Z',
 'id': 'G2630004486-ORNL_DAAC',
 'original_format': 'ECHO10',
 'granule_size': '8.89E-4',
 'browse_flag': False,
 'collection_concept_id': 'C179003030-ORNL_DAAC',
 'online_access_flag': True,
 'links': [{'rel': 'http://esipfed.org/ns/fedsearch/1.1/data#',
   'title': 'Download 43601715.s15',
   'hreflang': 'en-US',
   'href': 'https://daac.ornl.gov/daacdata/fife/data/hydrolgy/strm_15m/y1984/43601715.s15'},
  {'rel': 'http://esipfed.org/ns/fedsearch/1.1/metadata#',
   'title': "ORNL DAAC Data Set Documentation (USER'S GUIDE)",
   'hreflang': 'en-US',
   'href': 'https://daac.ornl.gov/FIFE/guides/15_min_strm_flow.html'},
  {'rel': 'http://esipfed.or

## Data Analysis and Engineering

In [153]:
df = pd.json_normalize(final_granules_list)
df.head()

Unnamed: 0,time_start,updated,dataset_id,points,data_center,title,coordinate_system,day_night_flag,time_end,id,...,boxes,producer_granule_id,lines,polygons,orbit_calculated_spatial_domains,orbit.ascending_crossing,orbit.start_lat,orbit.start_direction,orbit.end_lat,orbit.end_direction
0,1984-12-25T00:00:00.000Z,2023-03-10T22:34:22.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_DAAC,FIFE_STRM_15M.43601715.s15,CARTESIAN,BOTH,1985-01-01T00:00:00.000Z,G2630004486-ORNL_DAAC,...,,,,,,,,,,
1,1985-01-01T00:00:00.000Z,2023-03-10T22:34:22.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_DAAC,FIFE_STRM_15M.50011715.s15,CARTESIAN,BOTH,1985-02-01T00:00:00.000Z,G2630004452-ORNL_DAAC,...,,,,,,,,,,
2,1985-02-01T00:00:00.000Z,2023-03-10T22:34:22.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_DAAC,FIFE_STRM_15M.50321715.s15,CARTESIAN,BOTH,1985-03-01T00:00:00.000Z,G2630004461-ORNL_DAAC,...,,,,,,,,,,
3,1985-03-01T00:00:00.000Z,2023-03-10T22:34:22.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_DAAC,FIFE_STRM_15M.50601715.s15,CARTESIAN,BOTH,1985-04-01T00:00:00.000Z,G2630004485-ORNL_DAAC,...,,,,,,,,,,
4,1985-04-01T00:00:00.000Z,2023-03-10T22:34:22.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_DAAC,FIFE_STRM_15M.50911715.s15,CARTESIAN,BOTH,1985-05-01T00:00:00.000Z,G2630004469-ORNL_DAAC,...,,,,,,,,,,


In [154]:
# The features of a granule
df.columns

Index(['time_start', 'updated', 'dataset_id', 'points', 'data_center', 'title',
       'coordinate_system', 'day_night_flag', 'time_end', 'id',
       'original_format', 'granule_size', 'browse_flag',
       'collection_concept_id', 'online_access_flag', 'links', 'boxes',
       'producer_granule_id', 'lines', 'polygons',
       'orbit_calculated_spatial_domains', 'orbit.ascending_crossing',
       'orbit.start_lat', 'orbit.start_direction', 'orbit.end_lat',
       'orbit.end_direction'],
      dtype='object')

In [155]:
df.describe()

Unnamed: 0,time_start,updated,dataset_id,points,data_center,title,coordinate_system,day_night_flag,time_end,id,...,boxes,producer_granule_id,lines,polygons,orbit_calculated_spatial_domains,orbit.ascending_crossing,orbit.start_lat,orbit.start_direction,orbit.end_lat,orbit.end_direction
count,9883,9883,9883,641,9883,9883,9883,9883,9883,9883,...,6110,6148,505,2552,836,300.0,300.0,300,300.0,300
unique,3745,4589,500,269,11,9883,4,4,4062,9883,...,1598,5938,213,1101,282,144.0,2.0,2,2.0,2
top,1984-01-01T00:00:00.000Z,2012-02-20T20:04:40.000Z,15 Minute Stream Flow Data: USGS (FIFE),[39.1019 -96.595],ORNL_CLOUD,FIFE_STRM_15M.43601715.s15,CARTESIAN,UNSPECIFIED,1989-10-31T00:00:00.000Z,G2630004486-ORNL_DAAC,...,[-90 -180 90 180],ALPSRP016350460,[31.566639 -125.453053 37.649117 -116.897793],[[25 -85 25 -58.5 50 -58.5 50 -85 25 -85]],[{'orbit_number': '-1'}],-39.49,81.8,D,-81.8,D
freq,142,38,25,25,2726,1,8334,4398,174,1,...,1992,5,33,31,236,7.0,154.0,154,154.0,154


In [161]:
# make a copy of the df just in case! :) 
data = df
data = data.drop(['updated', 'time_start', 'time_end', 'dataset_id', 'browse_flag', 'online_access_flag', 'links', 'day_night_flag', 'original_format', 'coordinate_system', 'producer_granule_id', 'granule_size', 'orbit_calculated_spatial_domains', 'orbit.ascending_crossing', 'orbit.start_lat', 'orbit.start_direction', 'orbit.end_lat', 'orbit.end_direction', 'points', 'boxes', 'lines'], axis=1)
data = data[data.polygons.notna()]
data.head()


Unnamed: 0,data_center,title,id,collection_concept_id,polygons
340,NSIDC_ECS,SC:ABLVIS1B.001:129486296,G1513115240-NSIDC_ECS,C1513105920-NSIDC_ECS,[[52.73648 -107.21639 52.73287 -107.21639 52.7...
341,NSIDC_ECS,SC:ABLVIS1B.001:129487317,G1513115837-NSIDC_ECS,C1513105920-NSIDC_ECS,[[52.98734 -107.12686 52.98553 -107.12984 52.9...
342,NSIDC_ECS,SC:ABLVIS1B.001:129487360,G1513116253-NSIDC_ECS,C1513105920-NSIDC_ECS,[[53.29756 -107.55 53.29395 -107.55 53.28312 -...
343,NSIDC_ECS,SC:ABLVIS1B.001:129487358,G1513116276-NSIDC_ECS,C1513105920-NSIDC_ECS,[[53.31764 -108.09025 53.31764 -108.09327 53.3...
344,NSIDC_ECS,SC:ABLVIS1B.001:129487523,G1513116588-NSIDC_ECS,C1513105920-NSIDC_ECS,[[53.3285 -108.55822 53.32669 -108.55824 53.31...


In [157]:
data.describe()

Unnamed: 0,data_center,title,id,collection_concept_id,polygons
count,2552,2552,2552,2552,2552
unique,6,2552,2552,115,1101
top,LARC_ASDC,SC:ABLVIS1B.001:129486296,G1513115240-NSIDC_ECS,C1513105920-NSIDC_ECS,[[25 -85 25 -58.5 50 -58.5 50 -85 25 -85]]
freq,1184,1,1,25,31


In [158]:
list(data['polygons'].head(1))


[[['52.73648 -107.21639 52.73287 -107.21639 52.72565 -107.21639 52.68955 -107.14518 52.6354 -107.03845 52.58847 -106.91991 52.54515 -106.83707 52.52349 -106.77785 52.54515 -106.77169 52.57764 -106.83084 52.61735 -106.93757 52.67511 -107.04424 52.71843 -107.1451 52.74009 -107.21639 52.73648 -107.21639']]]

In [159]:
def conversion_to_tuples(coord_list):
    coord_list = coord_list['polygons'][0][0].split()

    tuples = []
    for i in range(0, len(coord_list), 2):
        tuples.append((float(coord_list[i]), float(coord_list[i + 1])))

    return tuples

# Testing the function works
conversion_to_tuples(data.iloc[0])

[(52.73648, -107.21639),
 (52.73287, -107.21639),
 (52.72565, -107.21639),
 (52.68955, -107.14518),
 (52.6354, -107.03845),
 (52.58847, -106.91991),
 (52.54515, -106.83707),
 (52.52349, -106.77785),
 (52.54515, -106.77169),
 (52.57764, -106.83084),
 (52.61735, -106.93757),
 (52.67511, -107.04424),
 (52.71843, -107.1451),
 (52.74009, -107.21639),
 (52.73648, -107.21639)]

In [162]:
# Convert every row to tuples
for index, row in data.iterrows():
    row['polygons'] = conversion_to_tuples(row)
    data.at[index,'polygons'] = row['polygons']
data.head()

Unnamed: 0,data_center,title,id,collection_concept_id,polygons
340,NSIDC_ECS,SC:ABLVIS1B.001:129486296,G1513115240-NSIDC_ECS,C1513105920-NSIDC_ECS,"[(52.73648, -107.21639), (52.73287, -107.21639..."
341,NSIDC_ECS,SC:ABLVIS1B.001:129487317,G1513115837-NSIDC_ECS,C1513105920-NSIDC_ECS,"[(52.98734, -107.12686), (52.98553, -107.12984..."
342,NSIDC_ECS,SC:ABLVIS1B.001:129487360,G1513116253-NSIDC_ECS,C1513105920-NSIDC_ECS,"[(53.29756, -107.55), (53.29395, -107.55), (53..."
343,NSIDC_ECS,SC:ABLVIS1B.001:129487358,G1513116276-NSIDC_ECS,C1513105920-NSIDC_ECS,"[(53.31764, -108.09025), (53.31764, -108.09327..."
344,NSIDC_ECS,SC:ABLVIS1B.001:129487523,G1513116588-NSIDC_ECS,C1513105920-NSIDC_ECS,"[(53.3285, -108.55822), (53.32669, -108.55824)..."


## (Failed attempts at ) Training a Machine Learning Model

In [164]:
# Hold out data
test_data = data.tail(10)

data.drop(data.tail(10).index, inplace = True)

X = np.concatenate(data['polygons'].values)

k = 5
model = NearestNeighbors(n_neighbors=k, metric='euclidean')
model.fit(X)

## Hausdorff_distance Approach

In [185]:
def find_similar_shapes(query_shape, data, top_n=5):
    similarities = []
    for shape in data['polygons']:
        similarity = directed_hausdorff(query_shape, shape)[0]

        similarities.append(similarity)

    similar_indices = sorted(range(len(similarities)), key=lambda i: similarities[i])[:top_n]
    similar_shapes = data.iloc[similar_indices]

    return similar_shapes

In [192]:
test_data.iloc[0]

data_center                                                        OB_DAAC
title                    1_MODISA_L1_GEO_AQUA_MODIS.20020704T011501.GEO...
id                                                     G2527713500-OB_DAAC
collection_concept_id                                  C2526537408-OB_DAAC
polygons                 [(64.2215, -168.93315), (58.13617, 148.42354),...
Name: 9848, dtype: object

In [191]:
test_data.iloc[0].title

'1_MODISA_L1_GEO_AQUA_MODIS.20020704T011501.GEO.hdf'

In [187]:
similar_granules = find_similar_shapes(test_data.iloc[0].polygons, data)
similar_granules

Unnamed: 0,data_center,title,id,collection_concept_id,polygons
9097,NSIDC_ECS,SC:AA_L2A.001:29262271,G1241441498-NSIDC_ECS,C1241435536-NSIDC_ECS,"[(-80.13003204789746, 8.045044113989507), (-81..."
8162,NSIDC_ECS,SC:AU_Land.001:142373337,G1570336335-NSIDC_ECS,C1343001245-NSIDC_ECS,"[(-81.90328868494649, 8.572944224512066), (-79..."
8120,NSIDC_ECS,SC:AU_Ocean.001:276148003,G2716487483-NSIDC_ECS,C2176472016-NSIDC_ECS,"[(-82.31548700871022, 12.414641603010141), (-7..."
8145,NSIDC_ECS,SC:AU_Rain.001:254894218,G2552253056-NSIDC_ECS,C1708620364-NSIDC_ECS,"[(-82.31548700871022, 12.414641603010141), (-7..."
8554,NSIDC_ECS,SC:AMSREL1A.003:41185207,G187453387-NSIDC_ECS,C186584407-NSIDC_ECS,"[(72.89720139676162, -79.30297860712726), (72...."


## Lessons learned

- I don't know how to numerically represent a set of coordinates such that a good ML model like ball and tree or KNN could be best utilized
- I learned a good bit about hausdorff distance and I think this could be improved on to make the shape recommendations closer
- I would have loved to map this to more features of a granule and in the future that would be intresting
- Math is really hard