In [1]:
import requests
import pandas as pd
import numpy as np
import json
from pandas.io.json import json_normalize

In [2]:
api_token='9bb4facb6d23f48efbf424bb05c0c1ef1cf6f468393bc745d42179ac4aca5fee'

In [3]:
def process_request(url):
    """
    This function processes the api request and returns a pandas dataframe object
    """
    request = requests.get(url)
    data = json.loads(request.text)
    df = pd.DataFrame(json_normalize(data['result']))
    return df 

In [4]:
def get_threat(name):
    """
    This function gets 
    """
    threat_url = 'http://apiv3.iucnredlist.org/api/v3/threats/species/name/{}?token={}'.format(name,api_token)
    threat_df = process_request(threat_url)
    return threat_df

In [5]:
def get_species_by_group(group_name):
    species_url = 'http://apiv3.iucnredlist.org/api/v3/comp-group/getspecies/{}?token={}'.format(group_name,api_token)
    group_df = process_request(species_url)
    return group_df

In [6]:
def get_species_by_country(country_name):
    country_url = 'http://apiv3.iucnredlist.org/api/v3/country/getspecies/{}?token={}'.format(country_name,api_token)
    country_df = process_request(country_url)
    return country_df

In [8]:
def get_species_by_page():
    result_species_df = pd.DataFrame()
    first_url = 'http://apiv3.iucnredlist.org/api/v3/species/page/{}?token={}'.format(0,api_token)
    temp_df = process_request(first_url)
    result_species_df = result_species_df.append(temp_df)
    
    page_num = 1
    while not temp_df.empty:
        url = 'http://apiv3.iucnredlist.org/api/v3/species/page/{}?token={}'.format(page_num,api_token)
        temp_df = process_request(url)
        result_species_df = result_species_df.append(temp_df)
        print("---page {}---".format(page_num))
        page_num += 1
    result_species_df.to_csv("datasets/complied_species.csv")
    return result_species_df

In [9]:
get_species_by_page().head()

---page 1---
---page 2---
---page 3---
---page 4---
---page 5---
---page 6---
---page 7---


KeyboardInterrupt: 

In [10]:
get_threat("Loxodonta africana").head()

Unnamed: 0,code,invasive,scope,score,severity,timing,title
0,1.1,,,Low Impact: 3,,Ongoing,Housing & urban areas
1,11.2,,,Low Impact: 3,,Ongoing,Droughts
2,1.2,,,Low Impact: 3,,Ongoing,Commercial & industrial areas
3,2.1,,,Low Impact: 3,,Ongoing,Annual & perennial non-timber crops
4,2.1.1,,,Low Impact: 3,,Ongoing,Shifting agriculture


In [11]:
get_species_by_group("seabreams_porgies_picarels").head()

Unnamed: 0,category,rank,scientific_name,subpopulation,subspecies,taxonid
0,DD,,Acanthopagrus akazakii,,,170230
1,LC,,Acanthopagrus arabicus,,,47166659
2,LC,,Acanthopagrus australis,,,170257
3,LC,,Acanthopagrus berda,,,170266
4,LC,,Acanthopagrus bifasciatus,,,170239


In [7]:
# In  this section we will get a partial of the dataframe we complied above
# Then use the api to get the threat for each of them.
# Next, we will use sholarly a python package to recheive google scholar information.
import scholarly

In [8]:
compiled_species_df = pd.read_csv("datasets/complied_species.csv")

In [9]:
# Take the first one thousand to test 
partial_species_df = compiled_species_df.head(1000)

In [10]:
def get_threats_return_dataframe(df):
    result_df = pd.DataFrame()
    for species in df["scientific_name"]:
        temp_df = get_threat(species)
        temp_df["scientific_name"] = species
        result_df = result_df.append(temp_df)
    
    # Merge with our original df so we still have that information
    
    result_merged_df = result_df.merge(df, how = "left", on = "scientific_name")
    result_merged_df.to_csv("datasets/partial_threats.csv")
    return result_merged_df


In [11]:
get_threats_return_dataframe(partial_species_df)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


KeyboardInterrupt: 

In [None]:
# The above code is way too slow for even just 1000 species.
# I am going to revise the function to make it parallel 
# Code reference:https://stackoverflow.com/questions/36054321/parallel-processing-in-pandas-python
# Code reference: https://stackoverflow.com/questions/36794433/python-using-multiprocessing-on-a-pandas-dataframe
# Code reference: https://towardsdatascience.com/make-your-own-super-pandas-using-multiproc-1c04f41944a1
import multiprocessing as mp
import numpy as np

In [12]:
def get_threats_for_parallel(df):
    for species in df["scientific_name"]:
        temp_df = get_threat(species)
        temp_df["scientific_name"] = species
        return temp_df


def get_threats_return_dataframe_parallel(df):
    # Open the pool based on the number of cpus we have
    
    # We don't want to overload the computer. So we leave one for 
    usable_cores = mp.cpu_count() - 1
    print("There are {} cores available".format(usable_cores))
    pool = mp.Pool(processes = usable_cores)
    
    #Seperate our dataframe based on that
    
    df_split = np.array_split(df,usable_cores)
    results = pool.map(get_threats_for_parallel, df_split)
    pool.close()
    pool.join()
    results_df = pd.concat(results)
    
    # Merge with our original df so we still have that information
    
    result_merged_df = result_df.merge(df, how = "left", on = "scientific_name")
    result_merged_df.to_csv("datasets/partial_threats_parallel.csv")
    return result_merged_df



In [None]:
# For some reason this is not faster than non parallel one which is bizzare.
# TODO
# Use other available pakages for this
get_threats_return_dataframe_parallel(partial_species_df)

There are 3 cores available


In [None]:
# Let's try schoaly first
# I doubt this will work well, since it's hard for it to only show relevant results. 
# Code reference: https://stackoverflow.com/questions/50555855/parsing-google-scholar-results-with-python-and-beautifulsoup

In [13]:
from bs4 import BeautifulSoup
import random 
import time

In [14]:
partial_species_threats_df = pd.read_csv("datasets/partial_threats.csv")

After poking around. It seems the problems are

- Not all Scientific Names are formatted well(Some contain non letters)
- There are dashes in the threats (Don't think query likes that)
- Sleep time was not long enough. (This kinda make this impossible. It would potentially require the program to run for 10 days. ) 

In [64]:
# Test Run our Program on 50 species with the threat Harvesting
partial_test_df = compiled_species_df.head(50)
scholars_result = []
result_length = []
num_results =[]

In [23]:
def cross_check_scholar(query):
    """
    This function scrapes google scholar search result page,and return a list of titles. 
    """
    url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
    content = requests.get(url).text
    page = BeautifulSoup(content, 'lxml')
    results = []
    for entry in page.find_all("h3", attrs={"class": "gs_rt"}):
        results.append("{} ".format(entry.a.text))
    print(results)
    # Not sure if this can prevent us from getting detected as bot. 
    time.sleep((40-10)*np.random.random()+5)
    return results

In [37]:
import re

In [63]:
def cross_check_scholar_num_results(query):
    """
    This function scrapes google scholar search result page,and return the number of results. 
    """
    url = 'https://scholar.google.com/scholar?q=' + query + '&ie=UTF-8&oe=UTF-8&hl=en&btnG=Search'
    content = requests.get(url).text
    page = BeautifulSoup(content, 'lxml')
    result = page.find("div", {"id": "gs_ab_md"})
    num_of_results = re.findall('\d+', str(result) )
    try:
        actual_num = num_of_results[0]
    except IndexError:
        actual_num = 0
    print("This {} search gets {} results".format(query,actual_num))
    # Not sure if this can prevent us from getting detected as bot. 
    time.sleep((40-10)*np.random.random()+5)
    return actual_num

In [65]:
# Run it on 50 species first. 
for sc in partial_test_df["scientific_name"]:
    q,e = sc.split()
    query = "{}%20{}%20harvesting".format(q,e)
    try:
        temp_result = cross_check_scholar_num_results(query)
    except AttributeError:
        temp_result = 0
    num_results.append(temp_result)

This Aaadonta%20angaurana%20harvesting search gets 3 results
This Aaadonta%20constricta%20harvesting search gets 2 results
This Aaadonta%20fuscozonata%20harvesting search gets 0 results
This Aaadonta%20irregularis%20harvesting search gets 1 results
This Aaadonta%20kinlochi%20harvesting search gets 1 results
This Aaadonta%20pelewana%20harvesting search gets 1 results
This Aaptosyax%20grypus%20harvesting search gets 34 results
This Abrocoma%20boliviensis%20harvesting search gets 56 results
This Abronia%20montecristoi%20harvesting search gets 16 results
This Acanthaeschna%20victoria%20harvesting search gets 14 results
This Acanthinula%20spinifera%20harvesting search gets 0 results
This Mirogrex%20hulensis%20harvesting search gets 18 results
This Acanthochelys%20pallidipectoris%20harvesting search gets 32 results
This Acanthochelys%20spixii%20harvesting search gets 65 results
This Acanthochelys%20radiolata%20harvesting search gets 28 results
This Acanthocobitis%20urophthalmus%20harvesting 

KeyboardInterrupt: 

In [60]:
# Combine everything and save to csv. 
# It turns out 5 to 30 might still get you blocked. 
# Now using 10 to 40. Still might get blocked. 
partial_test_df["num of articles"] = num_results
partial_test_df.head()

ValueError: Length of values does not match length of index

In [None]:
# 