In [1]:
import requests
import json

In [2]:
# Endpoint of the World Bank document search API (note the trailing "?").
WORLD_BANK_API = "https://search.worldbank.org/api/v2/wds?"

# Retrieve the projects matching the query "digital agriculture" that are in
# English. For each project, the following fields are requested:
#   docdt:         date of document
#   authr:         author
#   count:         country
#   abstracts:     abstract
#   display_title: title
#   majtheme:      major theme
#   keywd:         keywords
#   envcat:        environmental category
#   projectid:     project id
#   subsc:         subsector
#   subtopic:      subtopic
#   theme:         theme
#   url:           url
#   txturl:        text url
QUERY_PARAMS = "&".join([
    "format=json",
    "qterm=digital%20agriculture",
    "lang=English",
    "fl=docdt,authr,count,abstracts,display_title,majtheme,keywd,envcat,projectid,subsc,subtopic,theme,url,txturl",
])

rows = 500  # number of rows read from the API per request
verbose = True  # print progress messages
test_size = 100000  # only process the first n projects, for debugging

def get_projects_metadata():
    """Download the metadata of every document matching QUERY_PARAMS.

    Pages through the API `rows` records per request, starting at offset 0,
    and merges the "documents" mapping of every page into a single dict.
    Paging stops once the offset reaches the reported total or `test_size`,
    whichever is smaller, or as soon as a request does not return HTTP 200.

    Returns:
        dict: the merged "documents" entries of all pages fetched. If a
        request fails, the entries collected so far are returned.
        NOTE(review): the merged mapping may also contain entries that are
        not documents (the processing cell skips empty ones) -- confirm
        against the API response format.
    """
    projects_metadata = {}
    offset = 0
    while True:
        # WORLD_BANK_API already ends with "?", so the query string is
        # appended directly; an extra "/" would turn the first parameter
        # into "/format=json".
        url = f"{WORLD_BANK_API}{QUERY_PARAMS}&rows={rows}&os={offset}"
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=60)
        if response.status_code != 200:
            # Best effort: keep what was downloaded, but say that the
            # result is incomplete instead of truncating it silently.
            print(
                f"Request failed with HTTP status {response.status_code} "
                f"at offset {offset}; returning the "
                f"{len(projects_metadata)} entries retrieved so far."
            )
            break

        data = response.json()
        projects_metadata.update(data["documents"])

        offset += rows
        # ">=" stops as soon as the limit is reached, avoiding one extra
        # request when the limit is an exact multiple of `rows`.
        if offset >= min(data["total"], test_size):
            break

        if verbose:
            print(f"Retrieved {offset} of {data['total']}", end="\r")

    return projects_metadata

# Download the metadata of all matching projects from the API.
projects_metadata = get_projects_metadata()
if verbose:
    n_retrieved = len(projects_metadata)
    print(f"Retrieved {n_retrieved} projects.")

# Store the raw metadata on disk; the processing step reads it back from
# this file.
with open("projects_metadata.json", mode="w") as f:
    json.dump(projects_metadata, fp=f)

Retrieved 500 of 7222
Retrieved 1000 of 7222
Retrieved 1500 of 7222
Retrieved 2000 of 7222
Retrieved 2500 of 7222
Retrieved 3000 of 7222
Retrieved 3500 of 7222
Retrieved 4000 of 7222
Retrieved 4500 of 7222
Retrieved 5000 of 7222
Retrieved 5500 of 7222
Retrieved 6000 of 7222
Retrieved 6500 of 7222
Retrieved 7000 of 7222
Retrieved 7223 projects.

In [9]:
# Process the raw projects metadata stored in projects_metadata.json.
# While `verbose` is True the processing loop prints a progress counter.
verbose = True

def clean_text(text):
    """Return `text` with all whitespace normalised.

    Newlines, tabs and repeated spaces are collapsed into single spaces and
    leading/trailing whitespace is removed.
    """
    # str.split() without arguments splits on any run of whitespace
    # (newlines included) and drops empty leading/trailing pieces.
    return " ".join(text.split())
    

def process_project(project_metadata):
    """Flatten one raw metadata record into a simplified project dict.

    Missing fields are replaced by empty placeholders, so every key of the
    result is always present. The input mapping is not modified.

    Args:
        project_metadata: dict with the raw metadata of one document.
            Presumably shaped like the API response ("keywd" and "authors"
            as dicts keyed by index, "abstracts" as {"cdata!": text}) --
            verify against the API.

    Returns:
        dict with the keys:
            "title":    whitespace-normalised display title
            "txturl":   text URL, as received
            "ids":      one-element list holding the list of project ids
            "date":     document date, as received ("" when missing)
            "keywords": string made of the keyword list followed by the
                        subtopic
            "abstract": whitespace-normalised abstract
            "authors":  list of whitespace-normalised author names
                        ([""] when the record has no author)
    """
    # Shallow copy: the defaults below are written into the copy so that the
    # caller's record is left untouched.
    metadata = dict(project_metadata)

    for key in ('display_title', 'txturl', 'projectid', 'docdt', 'subtopic'):
        metadata.setdefault(key, "")
    if metadata.get("abstracts", "") == "":
        metadata["abstracts"] = {"cdata!": ""}
    metadata.setdefault("keywd", {"0": {"keywd": ""}})
    # A record without authors either lacks the key or carries a bare
    # {'author': ''}; both are normalised to the index-keyed form.
    if metadata.get("authors", {'author': ''}) == {'author': ''}:
        metadata["authors"] = {'0': {'author': ''}}

    keywords = [clean_text(metadata["keywd"]["0"]["keywd"])]
    keywords.extend(
        clean_text(metadata[field])
        for field in ('majtheme', 'theme', 'subsc', 'subtopic', 'envcat')
        if field in metadata
    )
    authors = [clean_text(entry["author"]) for entry in metadata["authors"].values()]

    return {
        "title":    clean_text(metadata["display_title"]),
        "txturl":   metadata["txturl"],
        # NOTE(review): this nests the id list inside another list
        # ([[...]]); kept as-is because consumers of the saved JSON are not
        # visible here -- confirm whether a flat list was intended.
        "ids":      [metadata["projectid"].split(",")],
        "date":     metadata["docdt"],
        # NOTE(review): this embeds the Python repr of the list and repeats
        # the subtopic, which is already part of `keywords`; kept as-is for
        # output compatibility -- confirm the intended format.
        "keywords": f"{keywords},{clean_text(metadata['subtopic'])}",
        "abstract": clean_text(metadata["abstracts"]["cdata!"]),
        "authors":  authors
    }


# Load the raw metadata from disk.
with open("projects_metadata.json", "r") as f:
    projects_metadata = json.load(f)

# Convert every non-empty record into its simplified form.
projects = []
i = 0
for pid, project_metadata in projects_metadata.items():
    if project_metadata == {}:
        # Empty entries carry no project data.
        continue
    project = process_project(project_metadata)
    projects.append(project)
    i = len(projects)

    if verbose:
        print(f"Processed project {i} of {len(projects_metadata)}", end="\r")

# Save the simplified projects as indented JSON.
with open("digital_agriculture_projects.json", "w") as file:
    json.dump(projects, file, indent=4)

Processed project 7222 of 7223

In [None]:
from datetime import datetime

# Only documents dated in this year or later are counted.
MIN_YEAR = 2013
DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

with open("digital_agriculture_projects.json", "r") as f:
    projects = json.load(f)

author_counts = {}
for project in projects:
    # process_project stores "" when a document has no date; skip such
    # (or otherwise unparseable) dates instead of aborting the whole cell.
    try:
        date_obj = datetime.strptime(project['date'], DATE_FORMAT)
    except ValueError:
        continue

    if date_obj.year < MIN_YEAR:
        continue

    for author in project["authors"]:
        # "" is the placeholder for documents without a listed author;
        # it is not a real author and must not be ranked.
        if not author:
            continue
        author_counts[author] = author_counts.get(author, 0) + 1

# Sort authors by number of occurrences, most frequent first.
author_counts = dict(sorted(author_counts.items(), key=lambda item: item[1], reverse=True))

# Print every author with their count (slice the items to show only the
# top entries).
for k, v in author_counts.items():
    print(k, ": ", v)