In [1]:
import requests as req
import os
import json
from pprint import pprint

import numpy as np
import pandas as pd
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = 1000


In [8]:
#### Download Google Spreadsheets ####

# Make a temporary directory, and move into it
!mkdir temp
os.chdir("temp")
dest = os.getcwd()

# July Data Sheet
!curl "https://docs.google.com/spreadsheets/d/1viPOGYIk6RGu7YMoM3BHNVbkWaCZ0JFBOMSNncWvHYk/export?format=tsv" > july_data_upload.tsv
july_data_upload = pd.read_csv(dest+"/july_data_upload.tsv", sep="\t", index_col=[0])

# # Metadata to Upload
# !curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
# current_mdata = pd.read_csv(dest+"/current_metadata.tsv", sep="\t", index_col=[0])
# current_mdata = current_mdata.transpose()

!curl "https://docs.google.com/spreadsheets/d/1UkABgMlBIinJjITa6WepFAL-8VBkulS0LCbKojRXjVY/export?format=tsv" > current_metadata.tsv
current_mdata = pd.read_csv(dest+"/current_metadata.tsv", sep="\t", index_col=[0])
current_mdata = current_mdata.transpose()

!curl "https://docs.google.com/spreadsheets/d/1trzei7ETctQuI9kNg012MHieRVOLLAt1kbaKarT41fI/export?format=csv" > fakemetabeta.csv
# import csv
# with open(dest+"/fakemetabeta.tsv", 'r') as f:
#     reader = csv.reader(f)
#     linenumber = 1
#     data = []
#     try:
#         for row in reader:
#             linenumber += 1
#             data.append(row)
#     except Exception as e:
#         print (("Error line %d: %s %s" % (linenumber, str(type(e)))))

fakemetabeta = pd.read_csv(dest+"/fakemetabeta.csv", sep=",", index_col=[0], encoding="utf8")
#print(data)
fakemetabeta = fakemetabeta.transpose()

# Delete temporary files
os.chdir("..")
!rm -r temp

#### Merge info from the Tracking sheet and Metadata sheet

old_id_col = "Dataset on Backoffice"
new_id_col = "API-ID (PERFECT DATASET)"
dl_from_src_col = "Download from Source"
dl_from_s3_col = "Download Data (S3)"
public_title = "Public Title"
technical_title = "Technical Title"

columns_to_xfer = [old_id_col, new_id_col, dl_from_src_col, dl_from_s3_col, public_title, technical_title]
match_col = "Unique ID"

# HELPER FUNCTION

def append_columns(src_df, dst_df, columns_to_xfer, match_col):
    try:
        info = src_df.loc[dst_df[match_col], columns_to_xfer]
        # Have to set index or join returns no cols
        dst_df.set_index(match_col, inplace=True)
        dst_df[columns_to_xfer] = info
        dst_df.reset_index(inplace=True)
        return(dst_df)
    except:
        print("ERROR: There is an element in the metadata sheet that is not in the tracking sheet")
        return(None)
    
# current_mdata[old_id_col] = current_mdata.apply(lambda row: fetch_info(row,match_col,july_data_upload,old_id_col), axis=1)
# current_mdata[new_id_col] = current_mdata.apply(lambda row: fetch_info(row,match_col,july_data_upload,new_id_col), axis=1)

# current_mdata[dl_from_src_col] = current_mdata.apply(lambda row: fetch_info(row,match_col,july_data_upload,dl_from_src_col), axis=1)
# current_mdata[dl_from_s3_col] = current_mdata.apply(lambda row: fetch_info(row,match_col,july_data_upload,dl_from_s3_col), axis=1)

append_columns(july_data_upload, current_mdata, columns_to_xfer, match_col)
append_columns(july_data_upload, fakemetabeta, columns_to_xfer, match_col)

# Keep only those datasets with rw_ids already
valid_old_ids = pd.notnull(current_mdata[old_id_col])
valid_new_ids = pd.notnull(current_mdata[new_id_col])

valid_old_ids_fake = pd.notnull(fakemetabeta[old_id_col])
valid_new_ids_fake = pd.notnull(fakemetabeta[new_id_col])

def choose_new_id(df, valid_old_ids,old_id_col, valid_new_ids,new_id_col):
    assert(len(valid_old_ids)==len(valid_new_ids))
    final_ids = []
    for i in range(0, len(valid_new_ids)):
        if(valid_new_ids[i]):
            final_ids.append(df.iloc[i][new_id_col])
        elif(valid_old_ids[i]):
            final_ids.append(df.iloc[i][old_id_col])
        else:
            final_ids.append(None)
    return(final_ids)

current_mdata["final_ids"] = choose_new_id(current_mdata, valid_old_ids,old_id_col, valid_new_ids,new_id_col)
fakemetabeta["final_ids"] = choose_new_id(fakemetabeta, valid_old_ids_fake,old_id_col, valid_new_ids_fake,new_id_col)

keep_matched_ids = pd.notnull(current_mdata["final_ids"])
keep_matched_ids_fake = pd.notnull(fakemetabeta["final_ids"])

matched_mdata = current_mdata.loc[keep_matched_ids]
matched_mdata_fake = fakemetabeta.loc[keep_matched_ids_fake]

matched_mdata.set_index("final_ids", inplace=True)
matched_mdata_fake.set_index("final_ids", inplace=True)

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 94665    0 94665    0     0   278k      0 --:--:-- --:--:-- --:--:--  279k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  567k    0  567k    0     0  1470k      0 --:--:-- --:--:-- --:--:-- 1469k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  156k    0  156k    0     0   646k      0 --:--:-- --:--:-- --:--:--  647k


In [81]:
def investigate_unmatched(src_df, dst_df, match_col):
    ids = dst_df[match_col]
    unmatched = []
    for id in ids:
        try:
            src_df.loc[id]
        except:
            unmatched.append(id)
    return(unmatched)

#append_columns(july_data_upload, fakemetabeta, columns_to_xfer, match_col)

print("These cause an error: " + str(investigate_unmatched(july_data_upload, fakemetabeta, match_col)))

These cause an error: [nan, 'bio.016', 'ene.002', 'foo_045']


In [None]:
list1 = [None, "a", None, "b"]
list2 = ["c", None, None, "d"]
df = pd.DataFrame({"l1":list1, "l2":list2})
choose_new_id(df, list1,"l1", list2,"l2")

In [84]:
print(matched_mdata.shape)
print(july_data_upload.shape)

(125, 42)
(270, 35)


In [22]:
sum(valid_new_ids)

53

In [23]:
matched_mdata.head(1)

Unnamed: 0_level_0,Unique ID,Public Title,Technical Title,Subtitle,Source Organizations,Learn More Link,Function,Description,Cautions,Geographic Coverage,Data Type,Spatial Resolution,Date of Content,Frequency of Updates,Summary of Licence,Link to License,Citation,Published Language,Published Title (if not English),Download,Download from Source,Uploaded To,Layer Name 1,Layer Definition 1,Layer Name 2,Layer Definition 2,Layer Name 3,Layer Definition 3,Layer Name 4,Layer Definition 4,Original Data Name 1,Original Data Link 1,Original Data Name 2,Original Data Link 2,Original Data Name 3,Original Data Link 3,Original Data Name 4,Original Data Link 4,nan,VIZZ - RW API (bulk upload),API-ID (PERFECT DATASET),Download Data (S3)
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
93ee443e-cb39-424a-9aa4-1d16af813418,bio.001,Endangered Species Sites,,AZE,Alliance for Zero Extinction (AZE),http://www.biodiversitya-z.org/content/allianc...,Global list of sites containing 95% or more of...,Created by the Alliance for Zero Extinction (A...,None listed.,Global,Vector,,2010,5 years,The AZE data and any derivatives may be used f...,https://www.arcgis.com/home/item.html?id=4ecca...,"Alliance for Zero Extinction. 2010. ""2010 AZE ...",English,,,,,,,,,,,,,,,,,,,,,,93ee443e-cb39-424a-9aa4-1d16af813418,93ee443e-cb39-424a-9aa4-1d16af813418,


In [24]:
july_data_upload.head(1)

Unnamed: 0_level_0,VIZZ - RW API (bulk upload),Slug - RW API (sanity check),API-ID (PERFECT DATASET),Public Title,Alternative Public Title,Technical Title,Subtitle,Theme_1,Theme_2,Theme_3,Planet Pulse,Water & Conflict,Problem Solving,Process these first,Multiple Layers needed to surface at same time?,Metadata Completed,Metadata on Backoffice,Distribution Restriction,Tags,Format,"Endpoint URL (Carto, GEE)",Download from Source,Download Data (S3),On WRI Platform,Dataset Processed for Upload,Data Upload Responsibility,Uploaded to S3,Server Location,Missing ISO Code,Dataset on Backoffice,Columns defined on Backoffice,Layer Definition,Layer Name/Description finalized,Widgets,Published on RW
WRI Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1
bio.001,93ee443e-cb39-424a-9aa4-1d16af813418,Alliance-for-Zero-Extinction-sites,93ee443e-cb39-424a-9aa4-1d16af813418,Endangered Species Sites,,Alliance for Zero Extinction (AZE) Endangered ...,AZE,Biodiversity,,,,,,,,X,,X,,,,,,GFW,,Vizz,,Carto,,X,,X,X,,X


In [25]:
matched_mdata.columns

Index([                       'Unique ID',                     'Public Title',
                        'Technical Title',                         'Subtitle',
                   'Source Organizations',                  'Learn More Link',
                               'Function',                      'Description',
                               'Cautions',              'Geographic Coverage',
                              'Data Type',               'Spatial Resolution',
                        'Date of Content',             'Frequency of Updates',
                     'Summary of Licence',                  'Link to License',
                               'Citation',               'Published Language',
       'Published Title (if not English)',                         'Download',
                   'Download from Source',                      'Uploaded To',
                           'Layer Name 1',               'Layer Definition 1',
                           'Layer Name 2',          

In [9]:
## FOR EACH DATASET IN BACKOFFICE THAT HAS METADATA, UPLOAD IT
from configparser import ConfigParser
config = ConfigParser()
config.read("../.env")
api_token = config.get("auth", "rw_api_token")

auth_token = api_token # <Insert Auth Token Here>

# HELPER FUNCTIONS

def clean_nulls(val):
    try:
        if np.isnan(val):
            return(None)
        else:
            return(val)
    except:
        return(val)

def create_source_object(sources):
    if sources:
        source_object = []
        srcs = sources.split("/")
        for ix, src in enumerate(srcs):
            source_object.append({
                "source-name":src,
                "id":ix,
                "source-description":""
            })
        return source_object
    return None

### THIS ADDS ALL DATASETS FOR WHICH WE HAVE METADATA in METADATA FOR UPLOAD ###

small_batch = ["60be01b0-99fb-459c-8a08-b934270f8c4b", "b8307c16-fd77-4e35-9b68-8726a025f401"]

#datasets_for_pushing = [matched_mdata, matched_mdata_fake]
datasets_for_pushing = [matched_mdata]
#datasets_for_pushing = [matched_mdata_fake]



processed1 = []

# If datatype is raster, don't show download link from s3
# if no download_from_source link, show learn_more_link

for df in datasets_for_pushing:
    for rw_id in df.index:
    #for rw_id in small_batch:
        url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
        print(url)
        # Everything from current_mdata
        metadata = df.loc[rw_id]
        if len(metadata.shape) > 1:
            print(metadata)
    
        flag1 = clean_nulls(metadata["Data Type"]) != None
        if(flag1):
            flag2 = clean_nulls(metadata["Data Type"]).lower() != "raster"
            if(flag1&flag2):
                data_dl_link = clean_nulls(metadata["Download Data (S3)"])
            else:
                data_dl_link = None
        else:
            data_dl_link = None
            
        if(clean_nulls(metadata["Download from Source"]) != None):
            data_dl_orig_link = clean_nulls(metadata["Download from Source"])
        else:
            data_dl_orig_link = clean_nulls(metadata["Learn More Link"])
            
        if(clean_nulls(metadata["Technical Title"]) != None):
            tech_title = clean_nulls(metadata["Technical Title"])
        else:
            tech_title = clean_nulls(metadata["Public Title"])
        
        #print(metadata)
        print(clean_nulls(metadata["Unique ID"]))
        #print(clean_nulls(metadata["Subtitle"]))
        
        row_payload = {
            "language": "en",

            "name": clean_nulls(metadata["Public Title"]),
            "description": clean_nulls(metadata["Description"]),
            "subtitle": clean_nulls(metadata["Subtitle"]),
            "source": clean_nulls(metadata["Subtitle"]),
            "functions": clean_nulls(metadata["Function"]),

            "application":"rw",
            "dataset":rw_id,

            "info": {

                # One of these a duplicate, test how shows up in front-end
                # or should rwId be dataset, above?
                "wri_rw_id": clean_nulls(metadata["Unique ID"]),
                "rwId": clean_nulls(metadata["Unique ID"]),

                "data_type": clean_nulls(metadata["Data Type"]),

                "name": clean_nulls(metadata["Public Title"]),
                "sources": create_source_object(clean_nulls(metadata["Source Organizations"])),

                "technical_title":tech_title,

                "functions": clean_nulls(metadata["Function"]),
                "cautions": clean_nulls(metadata["Cautions"]),

                "citation": clean_nulls(metadata["Citation"]),

                "license": clean_nulls(metadata["Summary of Licence"]),
                "license_link": clean_nulls(metadata["Link to License"]),

                "geographic_coverage": clean_nulls(metadata["Geographic Coverage"]),
                "spatial_resolution": clean_nulls(metadata["Spatial Resolution"]),

                "date_of_content": clean_nulls(metadata["Date of Content"]),
                "frequency_of_updates": clean_nulls(metadata["Frequency of Updates"]),

                "learn_more_link": clean_nulls(metadata["Learn More Link"]),
                
                "data_download_link": data_dl_link,
                "data_download_original_link":data_dl_orig_link

            }
        }

        headers = {
            'content-type': "application/json",
            'authorization': "Bearer " + auth_token,
        }

        try:
            processed1.append(rw_id)
            res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
            print(res)
            #print(res.text)
            if("already exists" in res.text):
                res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
                print(res)
                #print(res.text)
                if("errors:" in res.text):
                    print(res.text)
            elif("errors:" in res.text):
                print(res.text)
                
        except TypeError as e:
            print(e.args)
            print(metadata[["Unique ID", "Public Title"]])


https://api.resourcewatch.org/v1/dataset/93ee443e-cb39-424a-9aa4-1d16af813418/metadata
bio.001
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/4458eb12-8572-45d1-bf07-d5a3ee097021/metadata
bio.002
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/16df8ada-87cc-4907-adce-a98bc4e91856/metadata
bio.003
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/3624554e-b240-4edb-9110-1f010642c3f3/metadata
bio.004
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/ad790c87-fe9e-4405-891d-de7c2ddfda79/metadata
bio.005
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/7d3465f8-5959-4531-aaf2-c9a8a03183b3/metadata
bio.006
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/de452a4c-a55c-464d-9037-8c3e9fe48365/metadata
bio.007
<Response [400]>
<Response [200]>
https://api.resourcewatch.org/v1/dataset/3c82c421-8964-444e-86f2-df800174d8b9/metadata
bio.008
<R

In [10]:
#### Merge subtitles with Tracking sheet

#### UPLOADS TITLE, SUBTITLE, AND DOWNLOAD LINKS, if there is no METADATA IN METADATA FOR UPLOAD AVAILABLE (RW_UNIQUE ID EXISTS) ####

# Keep only those datasets from trakcing sheet with rw_ids already
tracking_valid_old_ids = pd.notnull(july_data_upload[old_id_col])
tracking_valid_new_ids = pd.notnull(july_data_upload[new_id_col])

july_data_upload["final_ids"] = choose_new_id(july_data_upload, tracking_valid_old_ids,old_id_col, tracking_valid_new_ids,new_id_col)

missed_ids = [rw_id for rw_id in july_data_upload["final_ids"].values if ((rw_id not in processed1) and (rw_id != None))]

missed_data = july_data_upload.reset_index().set_index("final_ids")
missed_data = missed_data.loc[missed_ids]
missed_data


### THIS ADDS ALL DATASETS FOR WHICH WE HAVE ENTRIES IN TRACKING SHEET and NOTHING IN METADATA FOR UPLOAD###
print("True if below print empty list []")
print([ind for ind in missed_data.index if ind in matched_mdata.index])

processed2 = []

for rw_id in missed_data.index:
    url = "https://api.resourcewatch.org/v1/dataset/"+str(rw_id)+"/metadata"
    # Everything from current_mdata
    metadata = missed_data.loc[rw_id]
    #print(metadata)
    print(metadata["WRI Unique ID"])
    print(metadata["Public Title"])
    print(url)
    #print(metadata)
    row_payload = {
        "language": "en",
        
        "name": clean_nulls(metadata["Public Title"]),
        "subtitle": clean_nulls(metadata["Subtitle"]),
        
        "application":"rw",
        "dataset":rw_id,
        
        "info": {
            
            "wri_rw_id": clean_nulls(metadata["WRI Unique ID"]),

            "name": clean_nulls(metadata["Public Title"]),
            "technical_title":clean_nulls(metadata["Technical Title"]),

            "data_download_link": clean_nulls(metadata["Download Data (S3)"]), 
            "data_download_original_link": clean_nulls(metadata["Download from Source"])
            
        }
    }

    headers = {
        'content-type': "application/json",
        'authorization': "Bearer " + auth_token,
    }
    #print(row_payload)

    try:
        processed2.append(rw_id)
        res = req.request("POST", url, data=json.dumps(row_payload), headers = headers)
        if("already exists" in res.text):
            res = req.request("PATCH", url, data=json.dumps(row_payload), headers = headers)
            if("errors:" in res.text):
                print(res.text)
        elif("errors:" in res.text):
            print(res.text)
    except TypeError as e:
        print(e.args)
        print(metadata[["Unique ID", "Public Title"]])



True if below print empty list []
[]
bio.031
Social and Economic Dependence on Coral Reefs
https://api.resourcewatch.org/v1/dataset/894f43a8-ce8e-43a5-a4c7-fa80faa43d63/metadata
bio.033
Cold Water Corals
https://api.resourcewatch.org/v1/dataset/6b8442f5-4766-4444-94b4-d6676277fd80/metadata
bio.034
Percent of Land Area Protected by Country
https://api.resourcewatch.org/v1/dataset/96ce9416-7a34-4c67-a21f-4f9b914d0d45/metadata
bio.035
Coral Bleaching Frequency Prediction
https://api.resourcewatch.org/v1/dataset/1ef55baf-bbbe-480d-85e9-7132c742f196/metadata
cit.032
Urban Population Projections
https://api.resourcewatch.org/v1/dataset/7d9c0d09-e833-4a74-811b-0af78da9c731/metadata
cit.033
Urban Built-Up Area
https://api.resourcewatch.org/v1/dataset/f6bb99af-541a-4d41-9e47-cc36cb479d4b/metadata
cli.003
Sea Surface Temperature Anomalies
https://api.resourcewatch.org/v1/dataset/c8040a7a-a40f-48bd-b003-625c33beff5e/metadata
cli.013
Methane
https://api.resourcewatch.org/v1/dataset/c9c9cb2f-9655-4

In [32]:
missed_data.to_csv("/Users/nathansuberi/Desktop/datasets_on_july_sheet_with_rw_id_no_metadata.csv")
missed_data

Unnamed: 0_level_0,WRI Unique ID,VIZZ - RW API (bulk upload),Slug - RW API (sanity check),API-ID (PERFECT DATASET),Public Title,Alternative Public Title,Technical Title,Subtitle,Theme_1,Theme_2,Theme_3,Planet Pulse,Water & Conflict,Problem Solving,Process these first,Multiple Layers needed to surface at same time?,Metadata Completed,Metadata on Backoffice,Distribution Restriction,Tags,Format,"Endpoint URL (Carto, GEE)",Download from Source,Download Data (S3),On WRI Platform,Dataset Processed for Upload,Data Upload Responsibility,Uploaded to S3,Server Location,Missing ISO Code,Dataset on Backoffice,Columns defined on Backoffice,Layer Definition,Layer Name/Description finalized,Widgets,Published on RW
final_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
894f43a8-ce8e-43a5-a4c7-fa80faa43d63,bio.031,894f43a8-ce8e-43a5-a4c7-fa80faa43d63,,894f43a8-ce8e-43a5-a4c7-fa80faa43d63,Social and Economic Dependence on Coral Reefs,,Social and Economic Dependence on Coral Reefs,WRI,Biodiversity,Commerce,Society,,,,,,ready,,,,Vector,https://resourcewatch.carto.com/u/wri-rw/datas...,http://datasets.wri.org/dataset/reefs-at-risk-...,,,,Liz,,Carto,,X,,X,X,,X
6b8442f5-4766-4444-94b4-d6676277fd80,bio.033,6b8442f5-4766-4444-94b4-d6676277fd80,,6b8442f5-4766-4444-94b4-d6676277fd80,Cold Water Corals,,Global DIstribution of Cold Water Corals,UNEP-WCMC,Biodiversity,,,,,Max/Laura write metadata,,,ready,,X,,Vector,https://resourcewatch.carto.com/u/wri-rw/datas...,http://data.unep-wcmc.org/datasets/3,https://wri-public-data.s3.amazonaws.com/resou...,,,Elise,X,Carto,,X,,X,X,,X
1ef55baf-bbbe-480d-85e9-7132c742f196,bio.035,1ef55baf-bbbe-480d-85e9-7132c742f196,,,Coral Bleaching Frequency Prediction,,Frequency of Future Coral Reef Bleaching Event...,WRI,,,,,,,,,Laura,,,,,,,,,,,,,,,,,,,
7d9c0d09-e833-4a74-811b-0af78da9c731,cit.032,7d9c0d09-e833-4a74-811b-0af78da9c731,,7d9c0d09-e833-4a74-811b-0af78da9c731,Urban Population Projections,,Urban Population Percentage,No Metadata Yet,Cities,,,,,,,,ready,,X,,CSV,https://resourcewatch.carto.com/u/wri-rw/datas...,https://esa.un.org/unpd/wup/CD-ROM/,https://wri-projects.s3.amazonaws.com/resource...,,,Emily,X,Carto,,X,,X,,X,X
c8040a7a-a40f-48bd-b003-625c33beff5e,cli.003,c8040a7a-a40f-48bd-b003-625c33beff5e,Sea-Surface-Temperature-Anomalies-NOAA,,Sea Surface Temperature Anomalies,,Twice-weekly Sea Surface Temperature Anomalies,No Metadata Yet,Climate,,,RT,,Vizz rasters - who makes layer?,,,ready,,,,,,,,,,Vizz,,Carto,,X,,?,,,X
c9c9cb2f-9655-4f40-8736-9b407ee43514,cli.013,c9c9cb2f-9655-4f40-8736-9b407ee43514,Current-Methane-Concentration-CH4,,CH4 Concentrations,,"Current Methane Concentration, CH4",No Metadata Yet,Climate,,,RT,,Laura/Max write metadata,,,Laura,,x,,,,,,,,Vizz,,Carto,,X,,X,,,
ea67f436-473c-4977-bdf2-8aa0dabbaa6f,cli.021,ea67f436-473c-4977-bdf2-8aa0dabbaa6f,,,Average Snow Cover,,Northern Hemisphere Average Snow Cover by Mont...,No Metadata Yet,Climate,,,RT,,,,,ready,,,,Raster,,,,,,Vizz,,,,X,,X,,,
d1fd9ac1-b99f-4f5f-b9d8-d3f1f818321a,cli.025,d1fd9ac1-b99f-4f5f-b9d8-d3f1f818321a,Sea-Level-Trends,,Relative Sea Level Trends,,Global Linear Relative Mean Sea Level (MSL) tr...,No Metadata Yet,Climate,Cities,,explore,,,,,ready,,,,,,https://tidesandcurrents.noaa.gov/sltrends/msl...,,,,Vizz,,Carto,,X,,X,,,
fe311144-8c0e-4440-b068-6efd057e0f6a,com.007,fe311144-8c0e-4440-b068-6efd057e0f6a,,fe311144-8c0e-4440-b068-6efd057e0f6a,FDI Regulatory Restrictiveness Index,,FDI Regulatory Restrictiveness Index (OECD),No Metadata Yet,Commerce,,,,,Laura/Max write metadata,,,ready,,,,Tabular,https://resourcewatch.carto.com/u/wri-rw/datas...,http://stats.oecd.org/Index.aspx?datasetcode=F...,https://wri-public-data.s3.amazonaws.com/resou...,,"deleted extra headers, transpose x and y",Elise,X,Carto,,X,,X,~,,X
5e3a3a9f-7380-47c0-ad84-2c193861e106,com.019,5e3a3a9f-7380-47c0-ad84-2c193861e106,,5e3a3a9f-7380-47c0-ad84-2c193861e106,Ecological Footprint,,Global Footprint Network,No Metadata Yet,Commerce,,,,,,,,ready,,,,Tabular,https://resourcewatch.carto.com/u/wri-rw/datas...,http://data.footprintnetwork.org/#/compareCoun...,https://wri-public-data.s3.amazonaws.com/resou...,,,Elise,X,Carto,,X,,X,~,,X


In [None]:
"9ea634db-53af-445e-a767-60ec9efc321e" in processed2

In [45]:
#### Which datasets do we have metadata for, but are not on tracking sheet? ####
# processed1 stores datasets with metadata that were uploaded
# current_mdata is all mdata
# matched_mdata is all mdata with a final_id
print(matched_mdata["Unique ID"].head())
print(current_mdata["Unique ID"].head())

unmatched_ids = [wri_id for wri_id in current_mdata["Unique ID"].values if wri_id not in matched_mdata["Unique ID"].values]
unmatched_mdata = current_mdata.set_index('Unique ID').loc[unmatched_ids]
unmatched_mdata

final_ids
93ee443e-cb39-424a-9aa4-1d16af813418    bio.001
4458eb12-8572-45d1-bf07-d5a3ee097021    bio.002
16df8ada-87cc-4907-adce-a98bc4e91856    bio.003
3624554e-b240-4edb-9110-1f010642c3f3    bio.004
ad790c87-fe9e-4405-891d-de7c2ddfda79    bio.005
Name: Unique ID, dtype: object
0    bio.001
1    bio.002
2    bio.003
3    bio.004
4    bio.005
Name: Unique ID, dtype: object


Unnamed: 0_level_0,Public Title,Technical Title,Subtitle,Source Organizations,Learn More Link,Function,Description,Cautions,Geographic Coverage,Data Type,Spatial Resolution,Date of Content,Frequency of Updates,Summary of Licence,Link to License,Citation,Published Language,Published Title (if not English),Download,Download from Source,Uploaded To,Layer Name 1,Layer Definition 1,Layer Name 2,Layer Definition 2,Layer Name 3,Layer Definition 3,Layer Name 4,Layer Definition 4,Original Data Name 1,Original Data Link 1,Original Data Name 2,Original Data Link 2,Original Data Name 3,Original Data Link 3,Original Data Name 4,Original Data Link 4,nan,VIZZ - RW API (bulk upload),API-ID (PERFECT DATASET),Download Data (S3),final_ids
Unique ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1
bio.019,Bird Species that are Threatened,"Percentage of Bird Species Threatened, by Country",WV GIS TC/DOE/NETL,West Virginia Geographic Information Systems T...,http://www.nature.com/nature/journal/v546/n765...,Percentage of bird species threatened by country.,"The percentage of bird species threatened, by ...",These data take into account future threats to...,Global,Tabular,National,2016,No known updates,TBD,https://s100.copyright.com/AppDispatchServlet?...,"Tilman, D., M. Clark, David R. Williams, K. Ki...",English,,,,,,,,,,,,,,,,,,,,,,,,,
bio.020,Mammal Species that are Threatened,"Percentage of Mammal Species Threatened, by Co...",UC Santa Barbara/UMN/IUCN,"University of California, Santa Barbara (UC Sa...",http://www.nature.com/nature/journal/v546/n765...,Percentage of mammal species threatened by cou...,"The percentage of mammal species threatened, b...",These data take into account future threats to...,Global,Tabular,National,2016,No known updates,TBD,https://s100.copyright.com/AppDispatchServlet?...,"Tilman, D., M. Clark, David R. Williams, K. Ki...",English,,,,,,,,,,,,,,,,,,,,,,,,,
bio.024,Local Threats to Coral Reefs,,WRI,World Resources Institute (WRI),http://www.wri.org/publication/reefs-risk-revi...,Estimated threat to coral reefs from developme...,The Local Threats to Coral Reefs datasets were...,The data sets used to derive the Coastal Devel...,Global,Raster,1 km x 1 km,2011,No known updates,WRI Open Data License,https://creativecommons.org/licenses/by/4.0/,World Resources Institute. 2001. Reefs at Risk...,English,,,http://www.wri.org/resources/data-sets/reefs-r...,,,,,,,,,,Threat to Coral Reefs from Coastal Development,http://www.wri.org/publication/reefs-risk-revi...,Threat to Coral Reefs from Marine-Based Pollution,http://www.wri.org/publication/reefs-risk-revi...,Threat to Coral Reefs from Watershed-Based Pol...,http://www.wri.org/publication/reefs-risk-revi...,Threat to Coral Reef from Fishing Practices,http://www.wri.org/publication/reefs-risk-revi...,*Metadata for bio.024.1-.4 merged. Description...,,,https://wri-public-data.s3.amazonaws.com/resou...,
bio.029,Coral Bleaching from Heat Stress,,NOAA CRW,National Oceanic and Atmospheric Administratio...,https://coralreefwatch.noaa.gov/satellite/blea...,Displays the number of times that an ocean gri...,The Coral Reef Watch (CRW) program of the Nati...,Being exposed to DHW greater than 8 is correla...,Global,Raster,5 km x 5 km,2013-present,Daily,Public domain,https://wiki.creativecommons.org/wiki/Public_d...,NOAA Coral Reef Watch. 2000 (updated twice wee...,English,,,,,,,,,,,,,,,,,,,,,,,,,
bio.032,Ecological Land Units,Global Ecological Land Units (ELUs),USGS,United States Geological Survey (USGS),https://catalog.data.gov/dataset/global-ecolog...,"Land units considering bioclimate, landform, l...",The Global Ecological Land Units data set is a...,The USGS acknowledges that there may be missin...,Global,Raster,250 m,2014,No known updates,Public domain,https://catalog.data.gov/harvest/object/8c3bf2...,"Sayre, R., J. Dangermond, C. Frye, R. Vaughan,...",English,,,,,,,,,,,,,,,,,,,,,,,,,
cit.007,Urban Expansion Extent,,NYU/UN-Habitat/Lincoln Institute,New York University (NYU)/United Nations Human...,http://www.atlasofurbanexpansion.org/data,"Built-up extents of major cities in 1990, 2000...",The Atlas of Urban Expansion is a joint projec...,The cutoffs for classification of pixels as ur...,200 global cities,Vector,,1984-2013,No known updates,,,,English,,,,,,,,,,,,,,,,,,,,,,,,,
cit.010,Bus Rapid Transit Systems,,BRT CoE/WRI/SIBRT/IEA,Bus Rapid Transit Center of Excellence (BRT Co...,brtdata.org,"Information about the location, capacity, and ...",BRT data gather information on Bus Rapid Trans...,"Data quality varies significantly. Generally, ...",205 cities worldwide,Vector,,1968-present,Monthly,Creative Commons Attribution-Noncommercial-NoD...,http://brtdata.org/info/terms,"BRT Centre of Excellence, EMBARQ, IEA, and SIB...",English,,,,,,,,,,,,,,,,,,,,,,,,,
cit.016,Road Network,,CIESIN,Columbia University Earth Institute Center for...,http://sedac.ciesin.columbia.edu/data/set/groa...,"A global data set of roads, edited to ensure t...",The Global Roads Open Access Data Set (gROADS)...,"Data are compiled from multiple sources, not a...",Global,Vector,,1980-2010,No known updates,"Mostly in public domain, some countries requir...",http://sedac.ciesin.columbia.edu/downloads/doc...,Center for International Earth Science Informa...,English,,,http://sedac.ciesin.columbia.edu/data/set/groa...,,,,,,,,,,,,,,,,,,,,,,
cli.004,Aboveground Live Woody Biomass Density,Aboveground Live Woody Biomass Density,WHRC/Zarin,Woods Hole Research Center (WHRC)/Zarin (Zarin),http://data.globalforestwatch.org/datasets/8f9...,Shows carbon density values of aboveground liv...,The Aboveground Live Woody Biomass Density dat...,It is recommended that both aboveground carbon...,Tropics,Raster,30 × 30 m,2000,No known updates,Attribution 4.0 International (CC BY 4.0),https://creativecommons.org/licenses/by/4.0/,"Baccini, A., W. Walker, L. Carvahlo, M. Farina...",English,,,,,,,,,,,,,,,,,,,,,,,,,
cli.005,Sea Ice Extent,"Sea Ice Index, Version 2",NSIDC,National Snow and Ice Data Center (NSIDC),https://nsidc.org/data/docs/noaa/g02135_seaice...,Monthly sea ice extents in the Arctic and Anta...,The monthly sea ice extent data produced by th...,"On Resource Watch, monthly data are shown alth...",Arctic and Antarctic,Raster,25 x 25 km,1978-present,Monthly,"You may download and use photographs, imagery,...",http://nsidc.org/about/use_copyright.html,"Fetterer, F., K. Knowles, W. Meier, and M. Sav...",English,,,ftp://sidads.colorado.edu/DATASETS/NOAA/G02135/,,,,,,,,,,Antarctic Sea Ice Extent,ftp://sidads.colorado.edu/DATASETS/NOAA/G02135...,Arctic Sea Ice Extent,ftp://sidads.colorado.edu/DATASETS/NOAA/G02135...,,,,,,,,https://wri-public-data.s3.amazonaws.com/resou...,


In [16]:
#### Inspect metadata on backoffice

# Base URL for getting dataset metadata from RW API
# Metadata = Data that describes Data 
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
data = res.json()["data"]

#############################################################

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}

for ix, dset in enumerate(data):

    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[atts["name"]] = {
        "rw_id":dset["id"],
        "upload_name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata_keys":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }    
    
# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')

def check_public_title(metadata):
    if len(metadata) > 0:
        mdata = metadata[0]
        if "attributes" in mdata:
            if "info" in mdata["attributes"]:
                if "name" in mdata["attributes"]["info"]:
                    return(mdata["attributes"]["info"]["name"])
        return(None)

# Grab public title, if it exists in metadata
current_datasets_on_api["public_title"] = current_datasets_on_api.apply(lambda row: check_public_title(row["metadata"]), axis=1)

current_datasets_on_api.set_index("rw_id", inplace=True)
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending = False)

In [61]:
table_names = {"Dataset on Backoffice":[], "table_name":[], "provider":[]}

for dset in data:    
    table_names["Dataset on Backoffice"].append(dset["id"])
    table_names["table_name"].append(dset["attributes"]["tableName"])
    table_names["provider"].append(dset["attributes"]["provider"])
    
dataset_table_names = pd.DataFrame.from_dict(table_names).set_index("Dataset on Backoffice")

In [64]:
dataset_table_names.apply(lambda row:  print(row["provider"])  , axis=1)

cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
gee
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
gee
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
csv
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
csv
cartodb
cartodb
cartodb
csv
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
wms
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
gee
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
gee
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
gee
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cartodb
cart

Dataset on Backoffice
37d04efc-0ab2-4499-a891-54dca1013c74    None
bd3ad33a-886b-4456-a032-119b9ac064de    None
223b936e-06b8-4970-abd9-4f123904d95d    None
e8f53f73-d77c-485a-a2a6-1c47ea4aead9    None
ccfb322a-20aa-4132-b58b-0f76acec8f5a    None
621a7e5e-20ca-4567-bf31-7c70dfd0b222    None
7a551dd8-b59c-4f59-9d50-c92cb61c5799    None
95b013a3-389a-4367-83b7-c9d68c28c406    None
51159bdb-4904-4101-a88e-ca7bd4f67cb0    None
93ee443e-cb39-424a-9aa4-1d16af813418    None
d8a45b34-4cc0-42f4-957d-e13b37e9182e    None
eb33d4b8-26e2-48de-a153-82c9f86263b5    None
5903f1d7-2240-4591-b970-33b32fe3be54    None
4458eb12-8572-45d1-bf07-d5a3ee097021    None
7d3465f8-5959-4531-aaf2-c9a8a03183b3    None
9ad91eb4-9217-41aa-a547-7b86f7c68deb    None
9d9b48d3-152d-48c3-8c2a-2957ddb601a1    None
8bc79a36-d77e-4ee3-b9bc-c77146cfc503    None
f6bb99af-541a-4d41-9e47-cc36cb479d4b    None
6d99441e-5faa-4c61-967f-01c9fe60624b    None
c9eadefd-4a06-4f3b-a2eb-3e3f45624c24    None
1d6c6bcc-7787-4b9c-84bb-25a185c12

In [50]:

#matching_with_tracking = july_data_upload.set_index("Dataset on Backoffice")

matching_with_tracking = pd.merge(july_data_upload, dataset_table_names, 
                                  on=["Dataset on Backoffice"],
                                  how="left")
july_data_upload.reset_index()                                  
matching_with_tracking



Unnamed: 0,Dataset on Backoffice,API-ID (PERFECT DATASET),Published on RW,Public Title,Technical Title,Subtitle,Theme_1,Theme_2,Theme_3,Planet Pulse / RT,"WRI Platform (PREP, GFW, used for Water & Conflict)",Problem Solving,Metadata Completed,Distribution Restriction,Format,"Endpoint URL (Carto, GEE)",Download from Source,Download Data (S3),Dataset Processed for Upload,Data Upload Responsibility,Uploaded to S3,"Server Location (and account wri-rw, insights, or wri-01)",Alias defined on Backoffice,Layer Definition/Description/Name,Editable Widget (Chart/Map),final_ids,provider,table_name
0,93ee443e-cb39-424a-9aa4-1d16af813418,93ee443e-cb39-424a-9aa4-1d16af813418,X,Endangered Species Sites,Alliance for Zero Extinction Sites (AZE),AZE,Biodiversity,,,,,,X,,Vector,https://resourcewatch.carto.com/u/wri-rw/dataset/bio_001_aze_endangered_species_sites,http://www.biodiversitya-z.org/content/alliance-for-zero-extinction-sites-aze,https://wri-public-data.s3.amazonaws.com/resourcewatch/bio_001_aze_endangered_species_sites.zip,omitted spreadsheet forward from AZE along with shapefile,Peter,X,Carto - wri-rw,,X,,93ee443e-cb39-424a-9aa4-1d16af813418,cartodb,alliance_for_zero_extinction_sites_species_joi
1,4458eb12-8572-45d1-bf07-d5a3ee097021,4458eb12-8572-45d1-bf07-d5a3ee097021,X,Biodiversity Hotspots,"Hotspots Revisited, 2011",CI/CEPF,Biodiversity,,,,GFW,,X,,Vector,,http://www.cepf.net/resources/hotspots/Pages/default.aspx,,,Vizz,,Carto - wri-01,X,X,,4458eb12-8572-45d1-bf07-d5a3ee097021,cartodb,biodiversity_hotspots
2,16df8ada-87cc-4907-adce-a98bc4e91856,,,Marine Species Richness,Climate impacts on global hot spots of marine biodiversity,,Biodiversity,,,,,,X,,Raster,,http://advances.sciencemag.org/content/3/2/e1601198/tab-figures-data,,,Vizz,,Carto - wri-rw,,doesn't work,,16df8ada-87cc-4907-adce-a98bc4e91856,cartodb,sp_richness
3,3624554e-b240-4edb-9110-1f010642c3f3,3624554e-b240-4edb-9110-1f010642c3f3,X,Coral Reef Locations,Global Distribution of Coral Reefs (2010),UNEP,Biodiversity,,,,,,X,"X (No commercial use, no redistributing data)",Vector,,http://data.unep-wcmc.org/datasets/1,,,Vizz,X,Carto - wri-rw,X,X,,3624554e-b240-4edb-9110-1f010642c3f3,cartodb,table_14_001_wcmc008_coralreef2010_v1_3
4,ad790c87-fe9e-4405-891d-de7c2ddfda79,,,Coral Reef Bleaching Alerts,Coral Reef Watch Bleaching Alerts,NOAA,Biodiversity,,,pulse,,,X,,Raster,,https://coralreefwatch.noaa.gov/satellite/bleaching5km/index.php,,,Vizz,,Carto - wri-rw,,X,,ad790c87-fe9e-4405-891d-de7c2ddfda79,cartodb,coralreefhotspots
5,7d3465f8-5959-4531-aaf2-c9a8a03183b3,7d3465f8-5959-4531-aaf2-c9a8a03183b3,X,Endemic Bird Areas,,BirdLife International,Biodiversity,,,,GFW,,X,X,Vector,,http://datazone.birdlife.org/eba/search,,,Vizz,,Carto - wri-01,,X,,7d3465f8-5959-4531-aaf2-c9a8a03183b3,cartodb,endemic_bird_areas
6,de452a4c-a55c-464d-9037-8c3e9fe48365,de452a4c-a55c-464d-9037-8c3e9fe48365,X,Marine and Terrestrial Protected Areas,World Database of Protected Areas,Protected Planet/UNEP WCMC/IUCN,Biodiversity,,,,GFW,,X,X,Vector,https://wri-01.carto.com/tables/wdpa_protected_areas/public/map,https://protectedplanet.net/,,2.5 gb,Elise,X,Carto - wri-01,,X,X,de452a4c-a55c-464d-9037-8c3e9fe48365,cartodb,wdpa_protected_areas
7,3c82c421-8964-444e-86f2-df800174d8b9,,,Cumulative Climate Impacts on Marine Ecosystems,Global Distribution of Cumulative Environmental Impacts,,Biodiversity,,,,,,X,,Raster,,http://advances.sciencemag.org/content/3/2/e1601198/tab-figures-data,,,,,GEE,,,,3c82c421-8964-444e-86f2-df800174d8b9,gee,users/resourcewatch/bio_008_cumulative_climate_impacts
8,3c12072d-611b-413f-b314-4df0834523ab,3c12072d-611b-413f-b314-4df0834523ab,X,Ecoregions Prioritized for Conservation,Ecoregional Portfolio,TNC,Biodiversity,,,,,,X,,Vector,https://resourcewatch.carto.com/u/wri-rw/dataset/wri-rw.bio_009_ecoregions_prioritized_conservation,http://maps.tnc.org/gis_data.html,https://wri-public-data.s3.amazonaws.com/resourcewatch/bio_009_Ecoregions_prioritized_for_conservation.zip,original,Jasmine,X,Carto - wri-rw,,X,X,3c12072d-611b-413f-b314-4df0834523ab,cartodb,bio_009_ecoregions_prioritized_conservation
9,,,,Exctinction Rates,Accelerated modern human–induced species losses: Entering the sixth mass extinction,UNAM/UC Berkley/Stanford U/Princeton U/U Florida,Biodiversity,,,,,? not clear exactly what data we wish to incorporate,Laura,,Tabular,,http://advances.sciencemag.org/content/1/5/e1400253.full,,,,,,,,,,,


In [55]:
df = july_data_upload.reset_index().merge(dataset_table_names, how="left", on="Dataset on Backoffice").set_index("WRI Unique ID")

df.to_csv("/Users/nathansuberi/Desktop/RW_Data/tracking_sheet_w_table_names.csv")

In [35]:
from pprint import pprint

pprint(data[0], depth=2)

{'attributes': {'application': [...],
                'attributesPath': None,
                'blockchain': {},
                'clonedHost': {},
                'connectorType': 'rest',
                'connectorUrl': 'https://wri-rw.carto.com/tables/soc_040_improved_sanitation/public',
                'dataPath': '',
                'env': 'production',
                'errorMessage': None,
                'geoInfo': True,
                'layer': [...],
                'layerRelevantProps': [],
                'legend': {...},
                'metadata': [...],
                'name': 'soc.040 Access to Improved Sanitation Facilities',
                'overwrite': False,
                'provider': 'cartodb',
                'published': True,
                'slug': 'Access-to-Improved-Sanitation-Facilities',
                'status': 'saved',
                'subtitle': '',
                'tableName': 'soc_040_improved_sanitation',
                'type': 'tabular',
             

In [13]:
### THIS COVERS ALL DATASETS WHICH ARE ON THE BACKOFFICE but HAVE NO WRI_ID / RW_ID IN TRACKING SHEET ###
### Occasionally this is because the data has been moved to after launch

### Check if any metadata are not updating as expected ###
### Indicating that their unique IDs are wrong in the tracking sheet ###

investigate_mdata = current_datasets_on_api[["upload_name", "public_title", "metadata"]]

missed_ids = [rw_id for rw_id in investigate_mdata.index if ((rw_id not in processed1) & (rw_id not in processed2))]

investigate_mdata = investigate_mdata.loc[missed_ids]

investigate_mdata.to_csv("Datasets_on_backoffice_with_no_WRIID.csv")

In [15]:
investigate_mdata

Unnamed: 0_level_0,upload_name,public_title,metadata
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
b487b6a0-3c6f-4476-97ab-e422612e68ca,frs,,[]
077afe8c-50c9-4033-a6b7-d70d90816e94,Forestt,,[]
ec730070-b187-4221-a4e5-af6de7c854b7,(delete?) Water Seasonality (2014-2015),,[]
62494370-3799-4165-838e-0ebaf42804c2,(delete?) Global Surface Water Extent,Global Surface Water,"[{'id': '59b65d67381b9900115f1b90', 'type': 'metadata', 'attributes': {'dataset': '62494370-3799-4165-838e-0ebaf42804c2', 'application': 'rw', 'resource': {'type': 'dataset', 'id': '62494370-3799-4165-838e-0ebaf42804c2'}, 'language': 'es', 'name': '', 'description': 'Lorem ipsum', 'source': '', 'citation': '', 'license': '', 'info': {'technical_title': 'Global Surface Water', 'name': 'Global Surface Water', 'functions': '', 'source_organization': '', 'source_organization_link': '', 'learn_more_link': '', 'function': '', 'cautions': '', 'geographic_coverage': '', 'spatial_resolution': '', 'date_of_content': '', 'frequency_of_updates': '', 'data_download_link': ''}, 'columns': {'max_extent': {'type': 'categorical', 'alias': 'Max extent', 'description': 'Binary image containing 1 anywhere water has ever been detected. 0 - No water 1 - Water'}, 'transition': {'type': 'categorical', 'alias': 'Transition', 'description': 'Categorical classification of change between first and last year. ..."
20662342-dcdd-4a42-9f58-bcc80217de71,soc_071_world_languages,,[]
4c9190e9-205b-4ff8-83dd-ee08cd63a04c,(Delete?) soc.005 Political Rights and Civil Liberties Index,Political Rights and Civil Liberties Index,"[{'id': '59a427ac7b6c000012baa70b', 'type': 'metadata', 'attributes': {'dataset': '4c9190e9-205b-4ff8-83dd-ee08cd63a04c', 'application': 'rw', 'resource': {'type': 'dataset', 'id': '4c9190e9-205b-4ff8-83dd-ee08cd63a04c'}, 'language': 'en', 'name': 'Political Rights and Civil Liberties Index', 'description': 'Freedom in the World is Freedom House’s flagship annual report, assessing the condition of political rights and civil liberties around the world. It is composed of numerical ratings and supporting descriptive texts for 195 countries and 14 territories. Freedom in the World has been published since 1973, allowing Freedom House to track global trends in freedom over more than 40 years. External analysts assess the 209 countries and territories using a combination of on-the-ground research, consultations with local contacts, and information from news articles, nongovernmental organizations, governments, and a variety of other sources. Expert advisers and regional specialists then ..."
5b5a21ac-0835-43fb-86b9-64b93d472e10,bio.001 Alliance for Zero Extinction Endangered Species Sites,,[]
585938cd-a438-43e8-b1de-bc767a98e0df,test ECM,,[]
46332f9b-30d6-4366-be24-582813256972,(delete) ene.012 Percent of National Population with Access to Electricity,Access to Electricity,"[{'id': '59a427a97b6c000012baa6ff', 'type': 'metadata', 'attributes': {'dataset': '46332f9b-30d6-4366-be24-582813256972', 'application': 'rw', 'resource': {'type': 'dataset', 'id': '46332f9b-30d6-4366-be24-582813256972'}, 'language': 'en', 'name': 'Access to Electricity', 'description': 'The access-to-electricity data are released through the Sustainable Energy for All (SE4ALL) database and collected among different sources: Demographic and Health Surveys (DHSs) and Living Standards Measurement Surveys (LSMSs), Multi-indicator Cluster Surveys (MICSs), the World Health Survey (WHS), other nationally developed and implemented surveys, and various government agencies (e.g., ministries of energy and utilities). Given the low frequency and the regional distribution of some surveys, a number of countries have gaps in available data. To develop the historical evolution and starting point of electrification rates, a simple modeling approach was adopted to fill in the missing data points: a..."
e44679f9-b64b-4f25-a017-1cb0c99bd95d,(?) Open Air Quality: O3 (in Parts per million PPM).,,[]


Many of these are datasets for which the Unique ID changed

soc.003 Distribution of Infant Mortality
soc.016 Conflict and Protest Events in African...
dis_007 Landslide Susceptibility Map
bio.035 Coral Bleaching Frequency Prediction
dis.001 Earthquakes Over the Past 30 days
Foo_046a Food Footprint in Protein
wat.033 Agriculture Water Demand and Depletion
soc.062 Internal Displacement
soc.061 Rural Poverty
soc.042 Percentage of Urban Population with Ac
soc.020 GINI Index
soc.008 Gross Domestic Product Per Capita (PPP
soc.006 Multidimensional Poverty Index
soc.004 Human Development Index
soc.002 Gender Development Index
foo.002 GLDAS Land Water Content from NOAH Lan..
com.028 Effect of Agricultural Policies on Com...
cit.029 Municipal Waste

In [None]:
# DANGER Bug - able to update metadata for a dataset that no longer exists on the API
#test upload cit.029:
#    broken, old id: 8f14a33e-5a61-47af-b26e-c1fc036932a5
#    working, new id: 00abb46f-34e2-4bf7-be30-1fb0b1de022f
    
url1="https://api.resourcewatch.org/v1/dataset/8f14a33e-5a61-47af-b26e-c1fc036932a5/metadata"    
url2="https://api.resourcewatch.org/v1/dataset/10337db6-8321-445e-a60b-28fc1e114f29/metadata"

res1a = req.request("POST", url1, data=json.dumps(row_payload), headers = headers)
if("already exists" in res1a.text):
    res1b = req.request("PATCH", url1, data=json.dumps(row_payload), headers = headers)
        
res2a = req.request("POST", url2, data=json.dumps(row_payload), headers = headers)
if("already exists" in res2a.text):
    res2b = req.request("PATCH", url1, data=json.dumps(row_payload), headers = headers)
    
print(res1b.text)