In [1]:
import requests
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

In [2]:
api_token='9bb4facb6d23f48efbf424bb05c0c1ef1cf6f468393bc745d42179ac4aca5fee'

In [3]:
def process_request(url):
    """
    This function processes the api request and returns a pandas dataframe object
    """
    request = requests.get(url)
    data = json.loads(request.text)
    df = pd.DataFrame(json_normalize(data['result']))
    return df 

In [4]:
def get_threat(name):
    """
    This function gets 
    """
    threat_url = 'http://apiv3.iucnredlist.org/api/v3/threats/species/name/{}?token={}'.format(name,api_token)
    threat_df = process_request(threat_url)
    return threat_df

In [5]:
def get_species_by_group(group_name):
    species_url = 'http://apiv3.iucnredlist.org/api/v3/comp-group/getspecies/{}?token={}'.format(group_name,api_token)
    group_df = process_request(species_url)
    return group_df

In [6]:
def get_species_by_country(country_name):
    country_url = 'http://apiv3.iucnredlist.org/api/v3/country/getspecies/{}?token={}'.format(country_name,api_token)
    country_df = process_request(country_url)
    return country_df

In [7]:
def get_species_by_page():
    result_species_df = pd.DataFrame()
    first_url = 'http://apiv3.iucnredlist.org/api/v3/species/page/{}?token={}'.format(0,api_token)
    temp_df = process_request(first_url)
    result_species_df = result_species_df.append(temp_df)
    
    page_num = 1
    while not temp_df.empty:
        url = 'http://apiv3.iucnredlist.org/api/v3/species/page/{}?token={}'.format(page_num,api_token)
        temp_df = process_request(url)
        result_species_df = result_species_df.append(temp_df)
        print("---page {}---".format(page_num))
        page_num += 1
    result_species_df.to_csv("datasets/complied_species.csv")
    return result_species_df

In [8]:
get_species_by_page().head()

---page 1---
---page 2---
---page 3---
---page 4---
---page 5---
---page 6---
---page 7---
---page 8---
---page 9---


KeyboardInterrupt: 

In [None]:
get_threat("Loxodonta africana").head()

In [None]:
get_species_by_group("seabreams_porgies_picarels").head()

In [8]:
# In  this section we will get a partial of the dataframe we complied above
# Then use the api to get the threat for each of them.
# Next, we will use sholarly a python package to recheive google scholar information.
import scholarly

In [9]:
compiled_species_df = pd.read_csv("datasets/complied_species.csv")

In [10]:
# Take the first one thousand to test 
partial_species_df = compiled_species_df.head(1000)

In [None]:
def get_threats_return_dataframe(df):
    result_df = pd.DataFrame()
    for species in df["scientific_name"]:
        temp_df = get_threat(species)
        temp_df["scientific_name"] = species
        result_df = result_df.append(temp_df)
    
    # Merge with our original df so we still have that information
    
    result_merged_df = result_df.merge(df, how = "left", on = "scientific_name")
    result_merged_df.to_csv("datasets/partial_threats.csv")
    return result_merged_df


In [None]:
get_threats_return_dataframe(partial_species_df)

In [11]:
# The above code is way too slow for even just 1000 species.
# I am going to revise the function to make it parallel 
# Code reference:https://stackoverflow.com/questions/36054321/parallel-processing-in-pandas-python
# Code reference: https://stackoverflow.com/questions/36794433/python-using-multiprocessing-on-a-pandas-dataframe
# Code reference: https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1
import multiprocessing as mp
import numpy as np

In [11]:
def get_threats_for_parallel(df):
    for species in df["scientific_name"]:
        temp_df = get_threat(species)
        temp_df["scientific_name"] = species
        return temp_df


def get_threats_return_dataframe_parallel(df):
    # Open the pool based on the number of cpus we have
    
    # We don't want to overload the computer. So we leave one for 
    usable_cores = mp.cpu_count() - 1
    print("There are {} cores available".format(usable_cores))
    pool = mp.Pool(processes = usable_cores)
    
    #Seperate our dataframe based on that
    
    df_split = np.array_split(df,usable_cores)
    results = pool.map(get_threats_for_parallel, df_split)
    pool.close()
    pool.join()
    results_df = pd.concat(results)
    
    # Merge with our original df so we still have that information
    
    result_merged_df = result_df.merge(df, how = "left", on = "scientific_name")
    result_merged_df.to_csv("datasets/partial_threats_parallel.csv")
    return result_merged_df



In [None]:
# For some reason this is not faster than non parallel one which is bizzare.
# TODO
# Use other available pakages for this
get_threats_return_dataframe_parallel(partial_species_df)

There are 3 cores available


In [None]:
# Let's try schoaly first
# I doubt this will work well, since it's hard for it to only show relevant results. 
# Code reference: https://stackoverflow.com/questions/50555855/parsing-google-scholar-results-with-python-and-beautifulsoup

In [13]:
from bs4 import BeautifulSoup
import random 
import time

In [14]:
partial_species_threats_df = pd.read_csv("datasets/partial_threats.csv")

In [46]:

def cross_check_scholar(scientific_name, threat):
    query = "{}%20{}".format("sleep", "dream")
#     url = "https://scholar.google.com/scholar?hl=en&as_sdt=0%2C9&q={}+{}&btnG=".format("sleep", "dream")
    url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
    content = requests.get(url).text
    page = BeautifulSoup(content, 'lxml')
    results = []
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        results.append("{} ".format(entry.a.text))
    print(results)
    # Not sure if this can prevent us from getting detected as bot. 
    time.sleep(random.uniform(0.01, 0.1))
    if random.randint(1,10) == 5:
        time.sleep(2)

In [47]:
partial_species_threats_df['google_scholar_results'] = partial_species_threats_df.apply(lambda x: cross_check_scholar(x['scientific_name'], x['title']), axis=1)


[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


KeyboardInterrupt: 

In [None]:
partial_species_threats_df.head()

In [28]:
partial_species_threats_df.to_csv("datasets/partial_species_scholar.csv")

In [42]:
# got blocked 
import gscholar
gscholar.query("dream+sleep")

HTTPError: HTTP Error 429: Too Many Requests