In [1]:
DOCS_YML_LN = "https://github.com/samapriya/awesome-gee-community-datasets/raw/master/mkdocs.yml"
PROJECT_MD_DIR = "https://github.com/samapriya/awesome-gee-community-datasets/raw/master/docs"

import urllib.request
# display(response.read())
# dir(response)
import yaml


EXCLUDED_NAV_ITEMS = ['index.md', 'stats.md', 'license.md', 'changelog.md']
with urllib.request.urlopen(DOCS_YML_LN) as stream:
    yaml_dict = yaml.load(stream, Loader=yaml.BaseLoader)


In [2]:
import pandas
datasets = pandas.DataFrame(
    columns=['category_name', 'title', 'md_file']
)
for nav_item in yaml_dict['nav']:
    for category_name in nav_item:  # there is only one key
        if nav_item[category_name] not in EXCLUDED_NAV_ITEMS:
            for dataset in nav_item[category_name]:
                for dataset_name in dataset:  # only one key
                    dataset_dict = {
                        'category_name': category_name,
                        'title': dataset_name,
                        'md_file': dataset[dataset_name],
                    }
                    # dataset_list = [category_name, dataset_name, dataset[dataset_name]]
                    # print(dataset_dict)
                    datasets = datasets.append(dataset_dict, ignore_index=True)
                    
# display(datasets)

In [3]:
import urllib.request
import re

YR_REGEX_19 = "[^0-9]19[3-9][0-9][^0-9]"  # 1930-1999
YR_REGEX_2k = "[^0-9]20[0-9]{2}[^0-9]"  # 2000-2099
YR_REGEX = f"{YR_REGEX_19}|{YR_REGEX_2k}"

def scrape_years(text):
    """
    Returns a list of all years mentioned in the given text string.
    Assumes that data is between years 1930-2099.
    """
    yr_list = [int(yr[1:-1]) for yr in re.compile(YR_REGEX).findall(text)]
    return yr_list


def guess_data_freq_from_yrs_mentioned(years_mentioned):
    """
    Estimates temporal frequency of data from a list of years that were mentioned
    in a text. The years should be in the order that they appeared in the text.
    
    TODO: deal with year ranges (eg 2000-2010)
    """
    yr_df = pandas.DataFrame(years_mentioned, columns=['year'])
    yr_df['dt'] = yr_df.diff()
    # yr_df['ddt'] = yr_df['dt'].diff()
    yr_df['cumul_dts'] = yr_df.groupby('dt').cumcount()
    yr_df['cumul_dts_dt'] = yr_df['cumul_dts'].diff()
    yr_df['cumul_cumul_dts_dt'] = yr_df.groupby('cumul_dts_dt').cumcount()
    estimated_data_frequency = yr_df.iloc[yr_df['cumul_cumul_dts_dt'].idxmax()]['dt']
    return estimated_data_frequency


def guess_datatype(text):
    """
    Guesses GEE dataset datatype based on readme text
    """
    text = text.lower()
    # regex searches that hint at what type of data this is:
    FEATURE_COLLECTION_HINTS = [  # TODO: these could be weighted by how good of a hint they are
        'featurecollection',
        'point',
        'vector',
        'ee.featurecollection',
    ]
    IMAGE_COLLECTION_HINTS = [
        'arc[ -]?second',
        'image',
        'imagecollection',
        'raster',
        'ee.image',

    ]
    n_fc_hints = sum([len(re.compile(hint).findall(text)) for hint in FEATURE_COLLECTION_HINTS])
    n_ic_hints = sum([len(re.compile(hint).findall(text)) for hint in IMAGE_COLLECTION_HINTS])
    if n_fc_hints >= n_ic_hints:  # if more feature collection hints
        return 'FeatureCollection'
    else:
        return 'ImageCollection||Image'

def guess_updated_year(text):
    """
    Estimates when the dataset was updated by looking for the word "update" followed by a date.
    When multiple "update" years found, returns the most common one.
    """
    update_years = []
    for text_after_update_chunk in text.split("update")[1:]:
        # get first year following the word "update"
        update_years.append(int(
            re.compile(YR_REGEX).findall(text_after_update_chunk)[0][1:-1]
        ))
    return max(set(update_years), key=update_years.count)  # return mode


def guess_spatial_resolution(text):
    """
    Guesses spatial frequency from text about the dataset.
    When multiple possible frequencies found, returns the most common one.
    
    Returns
    -------
    int
        length of one pixel edge in meters
    """
    SPATIAL_UNIT_SUFFIXES = [
        # unit, multiplier
        ['km', 1000],
        ['m', 1],
        # TODO: arcseconds
    ]
    text = text.lower()

    resolutions = []
    for unit in SPATIAL_UNIT_SUFFIXES:
        regex = re.compile("([0-9]+)[ ]?" + unit[0] + " ")
        resolutions += [float(res)*unit[1] for res in regex.findall(text)]
    return resolutions

def guess_data_level(text):
    """
    Returns NASA data processing level (L0-L4) guessed from given text.
    [lvl ref](https://earthdata.nasa.gov/collaborate/open-data-services-and-software/data-information-policy/data-levels)
    """
    return "TODO"

def parse_dataset_md(md_url):
    """
    fetches .md file from the given url, reads the file, and returns whatever dataset metadata was able to be gleaned from the text.
    """
    with urllib.request.urlopen(md_url) as response:
        text = response.read().decode('utf-8')
        years_mentioned = scrape_years(text)
        # TODO: could parse these further to glean info from order & adjacent year diffs
        metadata = {}
        for metadata_guess in [
            # [ var name, var getter function, kwargs, default value]
            ["years_mentioned", None, None, years_mentioned],
            ["temporal_resolution_yrs", guess_data_freq_from_yrs_mentioned, years_mentioned, 'NaT'],
            ["spatial_resolution_m", guess_spatial_resolution, text, 'NaN'],
            ["datatype", guess_datatype, text, '?'],
            ["updated_year", guess_updated_year, text, 'NaT'],
            ["processing_level", guess_data_level, text, 'L?'],
        ]:
            key, guesser_fn, guesser_arg, default_val = metadata_guess
            try:
                metadata[key] = guesser_fn(guesser_arg)
            except:
                metadata[key] = default_val
        return metadata
    
# def test_apply(row):
#     return parse_dataset_md(f"{PROJECT_MD_DIR}/{row['md_file']}")
    
    
# for each row in datasets:
# append rows from
# parse_dataset_md(f"{PROJECT_MD_DIR}/projects/aogcm_cmip6.md")
# for index, row in datasets.iterrows():
#     # display(row)
#     metadata = parse_dataset_md(f"{PROJECT_MD_DIR}/{row['md_file']}")
#     # row.append(metadata)
#     display(metadata)
def wrapper_fn(row):
    print('.', end='')
    # print(f"guessing stuff about {row['title']}")
    return parse_dataset_md(f"{PROJECT_MD_DIR}/{row['md_file']}")

# use use the guess metatadata fn on every row
datasets["guessed_metadata"] = datasets.apply(wrapper_fn, axis=1)
# explode the metadata dict into columns
datasets = datasets.join(pandas.json_normalize(datasets['guessed_metadata']))
del datasets['guessed_metadata']


# display(
#     datasets.append(
#         parse_dataset_md(f"{PROJECT_MD_DIR}/{datasets.loc[0]['md_file']}"),
#         ignore_index = True,
#     )
# )


.......................................................................................

In [5]:
pandas.set_option('display.max_rows', datasets.shape[0]+1)
display(datasets)

Unnamed: 0,category_name,title,md_file,years_mentioned,temporal_resolution_yrs,spatial_resolution_m,datatype,updated_year,processing_level
0,Population & Socioeconomic,High Resolution Settlement Layers,projects/hrsl.md,"[2016, 2016, 2021]",5.0,[],ImageCollection||Image,2021,TODO
1,Population & Socioeconomic,GPW Version 4 Admin Units,projects/GPWv4.md,"[2000, 2005, 2010, 2015, 2020, 2010, 2010, 201...",5.0,[],FeatureCollection,2021,TODO
2,Population & Socioeconomic,geoBoundaries Global Database of Political Adm...,projects/geoboundary.md,"[2017, 2020, 2017, 2021]",4.0,[],FeatureCollection,2021,TODO
3,Population & Socioeconomic,West Africa Coastal Vulnerability Mapping,projects/wacvm.md,"[2010, 2000, 2005, 2008, 2000, 1997, 2018, 2021]",21.0,[],FeatureCollection,2021,TODO
4,Population & Socioeconomic,Relative Wealth Index (RWI),projects/rwi.md,[2021],,"[4000.0, 4000.0]",ImageCollection||Image,NaT,TODO
5,Population & Socioeconomic,Social Connectedness Index (SCI),projects/sci.md,"[2018, 2021]",,[],FeatureCollection,2021,TODO
6,Population & Socioeconomic,Native Land (Indigenous Land Maps),projects/native.md,"[2021, 2021, 2021]",,[],FeatureCollection,2021,TODO
7,"Geophysical, Biological & Biogeochemical",Geomorpho90m Geomorphometric Layers,projects/geomorpho90.md,"[2020, 2021]",,"[90.0, 90.0, 90.0, 90.0]",ImageCollection||Image,2021,TODO
8,"Geophysical, Biological & Biogeochemical",Bare Earth’s Surface Spectra 1980-2019,projects/bss.md,"[1980, 2020, 1980, 1980, 2021]",41.0,[],ImageCollection||Image,2021,TODO
9,"Geophysical, Biological & Biogeochemical",Normalized Sentinel-1 Global Backscatter Model...,projects/s1gbm.md,"[2016, 2016, 2021, 2021, 2021]",5.0,"[100000.0, 100000.0, 10.0, 10.0, 10.0, 10.0]",FeatureCollection,2021,TODO
