In [1]:
%matplotlib inline

In [2]:
import os
import sys
sys.path.append(os.path.abspath('../../'))
from query_indicators import generate_save_path
from query_indicators import get_eu_countries

In [3]:
import boto3
from collections import defaultdict
from clio_lite import clio_search, clio_search_iter
import io
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import numpy as np
import pandas as pd

In [4]:
# Env variables: global matplotlib styling applied to every figure in this notebook
mpl.rcParams.update({
    'hatch.linewidth': 0.2,  # Thin hatch lines for the hatched rectangles
    'font.size': 18,
    'image.cmap': 'Pastel1',
})
# NOTE: no AWS credentials are configured here. Earlier experiments pointed
# AWS_SHARED_CREDENTIALS_FILE at a user-specific credentials file (NOT nesta's
# AWS credentials); if you need that, export the variable before starting the
# kernel rather than hard-coding a personal path in the notebook.

In [5]:
# Some globals
# URL, INDEX and FIELDS are passed straight through to clio_search_iter in make_search
URL = "https://search-eurito-prod-bbyn72q2rhx4ifj6h5dom43uhy.eu-west-1.es.amazonaws.com/"
INDEX = "arxiv_v0" 
FIELDS = ['terms_tokens_entity', 'textBody_abstract_article']
# Default list of countries for searches and plots
EU_COUNTRIES = get_eu_countries()
# COLORS is used for the per-country histogram colours in stacked_scores; COLOR_MAP is set via plt.set_cmap there
COLORS = plt.get_cmap('Set2').colors
COLOR_MAP = 'Pastel1'
# S3 resource handle used by _s3_savefig and _s3_savetable for uploads
S3 = boto3.resource('s3')
SAVE_PATH = generate_save_path()  # EURITO collaborators: this is generated assuming you have stuck to the convention 'theme_x/something/something_else.ipynb'
BUCKET = 'eurito-indicators'  # EURITO collaborators: please don't change this
SAVE_RESULTS = True  # Set this to "False" when you want to view figures inline. When "True", results will be saved to S3.

# Interactive mode is switched off when saving so that figures are not also shown inline
if SAVE_RESULTS:
    plt.ioff()  # <--- for turning off visible figs
else:
    plt.ion()

In [6]:
def make_search(query, max_query_terms, yr0=2014, yr1=2019, countries=EU_COUNTRIES, window=1):
    """
    Retrieve count and score data for a given basic clio search.
    
    Args:
        query (str): Seed query for clio.
        max_query_terms (list): Triple of max_query_terms (low, middle, high) to use from the initial query.
        yr0 (int): Start year in range to use in filter.
        yr1 (int): Final year in range to use in filter (inclusive).
        countries (list): A list of countries to filter (default to all EU). The order of this list
                          fixes the order of the rows in `data`.
        window (int): The number of years to consider in between time windows. Note that changing this will lead to double-counting.
    Returns:
        top_doc (dict): The first scored document encountered that lists at least one of `countries`
                        (for the first value of max_query_terms and the earliest year with a match),
                        or None if nothing matched. Presumably the best hit if clio yields documents
                        in descending score order -- TODO confirm.
        data (dict): {max_query_terms --> [{year --> sum_score} for each country, in the order of `countries`]}
        all_scores (dict): {max_query_terms --> {country --> [score for doc in docs] } }
    """
    top_doc = None
    _data = defaultdict(lambda: defaultdict(dict))  # {max_query_terms --> {country --> {year --> score} } }
    all_scores = defaultdict(lambda: defaultdict(list))  # {max_query_terms --> {country --> [score for doc in docs] } }
    for n in max_query_terms:
        # Touch every requested country up front so that dict insertion order follows `countries`.
        # (Using `countries` rather than the global EU list keeps the rows aligned with the caller's list.)
        for ctry in countries:
            _data[n][ctry]
            all_scores[n][ctry]
        # Iterate over years
        for yr in range(yr0, yr1+1):
            # Set default values so that every country has an entry for every year
            for ctry in countries:
                _data[n][ctry][yr] = 0
            # Iterate over docs
            filters = [{"range":{"year_of_article":{"gte":yr, "lt":yr+window}}}]
            for doc in clio_search_iter(url=URL, index=INDEX, query=query, fields=FIELDS,
                                        max_query_terms=n, post_filters=filters, chunksize=5000):
                doc_countries = doc.get('terms_countries_article')
                # Skip documents without a score or without any country information
                if '_score' not in doc or doc_countries is None:
                    continue
                score = doc['_score']
                for ctry in filter(lambda x: x in countries, doc_countries):
                    if top_doc is None:
                        top_doc = doc
                    all_scores[n][ctry].append(score)
                    _data[n][ctry][yr] += score
    # Reformat data as {max_query_terms --> [{year --> score} for each country in order]}
    data = {n: list(ctry_data.values()) for n, ctry_data in _data.items()}
    return top_doc, data, all_scores

## Indicator calculations

Each of these functions is assumed to take the form

```python
def _an_indicator_calculation(data, year=None, _max=1):
    """
    A function calculating an indicator.
    
    Args:
        data (list): Rows of data
        year (int): A year to consider, if applicable.
        _max (int): Divide by this to normalise your results. This is automatically applied in :obj:`make_activity_plot`
    Returns:
        result (list): A list of indicators to plot. The length of the list is assumed to be equal to the number of countries.
    """
    # Calculate something
```

In [7]:
def _total_activity_by_country(data, year=None, _max=1):
    """
    Indicator: Sum of relevance scores, by year (if specified) or in total.
    """    
    if year is None:        
        scores = [sum(row.values())/_max for row in data]
    else:
        scores = [row[year]/_max for row in data]
    return scores
      

def _average_activity_by_country(data, year=None, _max=1):    
    """
    Indicator: Mean relevance score. This function is basically a lambda, since it assumes the average has already been calculated.
    """        
    return [row/_max for row in data]
    
    
def _corrected_average_activity_by_country(data, year=None, _max=1):
    """
    Indicator: Mean relevance score minus it's (very) approximate Poisson error.
    """    
    return [(row - np.sqrt(row))/_max for row in data]
    

def _linear_coeffs(years, scores, _max):
    """Calculates linear coefficients for scores wrt years"""
    return [np.polyfit(_scores, _years, 1)[0]/_max
            if all(v > 0 for v in _scores) else 0
            for _years, _scores in zip(years, scores)]    
    

def _trajectory(data, year=None, _max=1):
    """
    Indicator: Linear coefficient of total relevance score wrt year.

    Args:
        data (list): One {year --> score} mapping per country.
        year (int): Not used; accepted for interface compatibility with the other indicators.
        _max (int): Normalisation constant, forwarded to `_linear_coeffs`.
    Returns:
        The list returned by `_linear_coeffs`, one value per row of `data`.
    """
    years, scores = [], []
    for yearly_scores in data:
        years.append(list(yearly_scores.keys()))
        scores.append(list(yearly_scores.values()))
    return _linear_coeffs(years, scores, _max)


def _corrected_trajectory(data, year=None, _max=1):
    """
    Indicator: Linear coefficient of upper and lower limits of relevance score wrt year.

    Every (year, score) point is replaced by two points at the same year,
    score - sqrt(score) and score + sqrt(score), i.e. the limits given by a
    very approximate Poisson error, before the fit is made by `_linear_coeffs`.
    `year` is not used; it is accepted for interface compatibility.
    """
    years, scores = [], []
    for yearly_scores in data:
        # Each year appears twice: once for the lower and once for the upper limit
        doubled_years = [yr for yr in yearly_scores for _ in range(2)]
        limits = []
        for total in yearly_scores.values():
            error = np.sqrt(total)
            limits.extend((total - error, total + error))
        years.append(doubled_years)
        scores.append(limits)
    return _linear_coeffs(years, scores, _max)

## Plotting functionality

In [8]:
class _Sorter:
    def __init__(self, values, topn=None):
        if topn is None:
            topn = len(values)
        self.indices = list(np.argsort(values))[-topn:]  # Argsort is ascending, so -ve indexing to pick up topn
    def sort(self, x):
        """Sort list x by indices"""
        return [x[i] for i in self.indices]


def _s3_savefig(query, fig_name, extension='png'):
    """
    Save the current matplotlib figure to S3 (the figure is grabbed from pyplot's global state).
    Does nothing unless SAVE_RESULTS is set.

    Args:
        query (str): Query the figure belongs to; used (lowercased, spaces to underscores) as a folder name.
        fig_name (str): Name of the figure; used (lowercased, spaces to underscores) as the file name.
        extension (str): Image format, used both for rendering and as the file extension.
    """
    if SAVE_RESULTS:
        query_slug = query.replace(" ","_").lower()
        fig_slug = fig_name.replace(" ","_").lower()
        s3_key = f'figures/{SAVE_PATH}/{query_slug}/{fig_slug}.{extension}'
        with io.BytesIO() as buffer:
            plt.savefig(buffer, bbox_inches='tight', format=extension, pad_inches=0)
            buffer.seek(0)  # Rewind so that the upload starts from the first byte
            S3.Object(BUCKET, s3_key).put(Body=buffer)

        
def _s3_savetable(data, key, index, object_path, transformer=lambda x: x):
    """
    Upload one table to S3 as CSV. Does nothing unless SAVE_RESULTS is set.

    Args:
        data (dict): Mapping from which the table data is taken.
        key: Key of `data` selecting the rows to tabulate.
        index (list): Row labels for the table.
        object_path (str): Path of the object, relative to 'tables/{SAVE_PATH}'.
        transformer (function): Applied to `data[key]` before building the table (identity by default).

    Note that the whole table is divided by its overall maximum before upload.
    """
    if SAVE_RESULTS:
        table = pd.DataFrame(transformer(data[key]), index=index)
        if len(table.columns) == 1:
            table.columns = ['value']
        overall_max = table.max().max()
        csv_bytes = (table / overall_max).to_csv().encode()
        s3_key = os.path.join(f'tables/{SAVE_PATH}', object_path)
        S3.Object(BUCKET, s3_key).put(Body=csv_bytes)

        
def make_activity_plot(f, data, countries, max_query_terms, query, 
                       year=None, label=None, x_padding=0.5, y_padding=0.05, xlabel_fontsize=14):
    """
    Calculate one indicator per country for each value of max_query_terms, plot the
    spread (box from min to max, line at the median) per country sorted by the median,
    and save the figure via `_s3_savefig`.
    
    Args:
        f: An indicator function, as described in the 'Indicator calculations' section.
        data (dict): {max_query_terms --> [row for each country]}, where the rows are whatever `f` expects.
        countries (list): A list of EU ISO-2 codes, in the same order as the rows of `data`.
        max_query_terms (list): Triple of max_query_terms for clio, corresponding to low, middle and high values of 
                                max_query_terms to test robustness of the query.
        query (str): query used to generate this data.
        year (int): Year to generate the indicator for (if applicable).
        label (str): label for annotating the plot. Also used as the figure's file name when saving.
        {x,y}_padding (float): Aesthetic padding around the extreme limits of the {x,y} axis.
        xlabel_fontsize (int): Fontsize of the x labels (country ISO-2 codes).
    Returns:
        ax (matplotlib.axis): The axis holding the plot.
    """    
    # The largest unnormalised value for the middle max_query_terms is the normalisation constant
    _, n_middle, _ = max_query_terms
    norm = max(f(data[n_middle], year=year))
    if norm == 0:
        norm = 1  # Nothing to normalise by (e.g. no hits at all): avoid dividing by zero
    low, middle, high = (f(data[n], year=year, _max=norm) for n in max_query_terms)
    indicator = [np.median([a, b, c]) for a, b, c in zip(low, middle, high)]    

    # Sort all data by indicator value
    s = _Sorter(indicator)
    countries = s.sort(countries)
    low = s.sort(low)
    middle = s.sort(middle)
    high =  s.sort(high)
    indicator = s.sort(indicator)

    # Make the scatter plot
    fig, ax = plt.subplots(figsize=(15, 6))    
    make_error_boxes(ax, low, middle, high)  # Draw the bounding box
    # Zero-sized markers: this call only serves to put the country codes on the x axis
    ax.scatter(countries, indicator,  s=0, marker='o', color='black')
    ax.set_title(f'{label}\nQuery: "{query}"')
    ax.set_ylabel(label)

    # Set limits and formulate 
    y0 = min(low+middle+high)    
    y1 = max(low+middle+high)
    if -y1*y_padding < y0:
        # Smallest value sits above the padded baseline: start the axis just below zero
        y0 = -y1*y_padding
    else:  # In case of negative values
        y0 = y0 - np.abs(y0*y_padding)
    ax.set_ylim(y0, y1*(1+y_padding))
    ax.set_xlim(-x_padding, len(countries)-x_padding)
    # `Tick.label` has been removed from recent matplotlib versions; tick_params is the supported route
    ax.tick_params(axis='x', which='major', labelsize=xlabel_fontsize)
    
    # Save to s3 & return
    _s3_savefig(query, label)
    return ax


def make_error_boxes(ax, low, middle, high, facecolor='r',
                     edgecolor='None', alpha=0.5):
    """
    Draw, for each x position, a rectangle spanning the smallest to the largest of three values,
    with a horizontal line through the median of the three.
    The three values may come in any order.
        
    Args:
        ax (matplotlib.axis): An axis to add patches to.
        {low, middle, high} (list): Three concurrent lists of values from which to calculate the rectangle limits.
        {facecolor, edgecolor} (str): The {face,edge} colour of the rectangles.
        alpha (float): The alpha of the rectangles.
    """
    box_width = 0.9
    boxes, median_lines = [], []
    for x, ys in enumerate(zip(low, middle, high)):
        left = x - 0.45  # Centre the box on the integer x position
        bottom, top = min(ys), max(ys)
        boxes.append(Rectangle((left, bottom), box_width, top - bottom))
        # A zero-height rectangle renders as a horizontal line
        median_lines.append(Rectangle((left, np.median(ys)), box_width, 0))

    # Hatched, semi-transparent boxes first, then the median lines on top
    ax.add_collection(PatchCollection(boxes, facecolor=facecolor, alpha=alpha,
                                      edgecolor=edgecolor, hatch='/'))
    ax.add_collection(PatchCollection(median_lines, facecolor='black', alpha=0.9,
                                      edgecolor='black'))


def stacked_scores(all_scores, query, topn=8,
                   low_bins=[10**i for i in np.arange(0, 1.1, 0.025)],
                   high_bins=[10**i for i in np.arange(1.1, 2.5, 0.05)],
                   x_scale='log', label='Relevance score breakdown', 
                   xlabel='Relevance score', ylabel='Number of relevant documents',
                   legend_fontsize='small', legend_cols=2):
    """
    Draw a stacked histogram of document scores for the `topn` countries with the largest
    summed score, and save it via `_s3_savefig`. Two sets of bin edges are concatenated,
    in order to have a more legible binning scale.
    
    Args:
        all_scores (dict): {country --> [score for doc in docs]}
        query (str): query used to generate this data.
        topn (int): Number of countries (those with the largest summed scores) to include.
        low_bins (list): List of initial bin edges.
        high_bins (list): List of supplementary bin edges. These could have a different spacing scheme to the lower bin edges.
        x_scale (str): Argument for `ax.set_xscale`.
        label (str): label for annotating the plot. Also used as the figure's file name when saving.
        {x,y}label (str): Argument for `ax.set_{x,y}label`.
        legend_fontsize (str): Argument for legend fontsize.
        legend_cols (int): Argument for legend ncol.
    Returns:
        ax (matplotlib.axis): The axis holding the histogram.
    """
    # Rank countries by their summed score and keep the top ones (ascending order)
    ranking = _Sorter([sum(v) for v in all_scores.values()], topn=topn)
    countries = ranking.sort(list(all_scores.keys()))
    scores = ranking.sort(list(all_scores.values()))

    # Plot the stacked scores
    fig, ax = plt.subplots(figsize=(10, 6))
    plt.set_cmap(COLOR_MAP)
    bin_edges = low_bins + high_bins
    ax.hist(scores, bins=bin_edges, stacked=True,
            label=countries, color=COLORS[:len(scores)])

    # Prettify the plot
    ax.set_title(f'{label}\nQuery: "{query}"')
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.legend(fontsize=legend_fontsize, ncol=legend_cols)
    ax.set_xlim(low_bins[0], None)
    ax.set_xscale(x_scale)

    # Save to s3
    _s3_savefig(query, label)
    return ax

## Bringing it all together

In [9]:
def generate_indicator(q, max_query_terms=[7, 10, 13], countries=EU_COUNTRIES, *args, **kwargs):
    """
    Make a query and generate indicators by country, saving the plots to S3 and saving the rawest data
    to tables on S3.
    
    
    Args:
        q (str): The query to Elasticsearch
        max_query_terms (list): Triple of max_query_terms for clio, corresponding to low, middle and high values of 
                                max_query_terms to test robustness of the query.
        countries (list): A list of EU ISO-2 codes
        *args: Extra positional arguments for `make_search`, filling its parameters after
               `max_query_terms` (i.e. yr0, then yr1).
        **kwargs: Extra keyword arguments for `make_search` (e.g. yr0, yr1, window).
    Returns:
        top_doc (dict): The example document returned by `make_search` (None if nothing matched).
        data (dict): {max_query_terms --> [{year --> sum_score} for each country]}
        all_scores (dict): {max_query_terms --> {country --> [score for doc in docs] } }
    """
    
    # Make the search and retrieve scores by country, and an example doc.
    # max_query_terms goes in positionally so that *args lands on the parameters after it,
    # rather than colliding with it.
    example_doc, data, all_scores = make_search(q, max_query_terms, *args, countries=countries, **kwargs)

    # Reformat the scores to calculate the average, one value per country in the order of `countries`
    avg_scores = defaultdict(list)
    for n, _scores in all_scores.items():
        for ctry in countries:
            mean = np.mean(_scores[ctry]) if len(_scores[ctry]) > 0 else 0
            avg_scores[n].append(mean)
    
    # Calculate loads of indicators and save the plots
    plot_kwargs = dict(countries=countries, max_query_terms=max_query_terms, query=q)
    indicators = [(_total_activity_by_country, data, 'Total relevance score'),
                  (_average_activity_by_country, avg_scores, 'Average relevance'),
                  (_corrected_average_activity_by_country, avg_scores, 'Corrected average relevance'),
                  (_trajectory, data, 'Trajectory'),
                  (_corrected_trajectory, data, 'Corrected trajectory')]
    for indicator_func, indicator_data, label in indicators:
        make_activity_plot(indicator_func, indicator_data, label=label, **plot_kwargs)
    # The score breakdown is only drawn for the middle value of max_query_terms
    stacked_scores(all_scores[max_query_terms[1]], query=q)
    
    # Save the basic raw data as tables. Note: not as rich as the plotted data.
    _q = q.replace(" ","_").lower()
    _s3_savetable(data, max_query_terms[1], index=countries, object_path=f'{_q}/LMA.csv')
    _s3_savetable(avg_scores, max_query_terms[1], index=countries, object_path=f'{_q}/avg_LMA.csv')
    
    plt.close('all')  # Clean up the memory cache (unbelievable that matplotlib doesn't do this)
    return example_doc, data, all_scores

## Iterate over queries

In [12]:
# Generate all indicators for each query, then print the example document as a sanity check
for term in ["Adaptation to climate change, including societal transformation",
             "Cancer",
             "Climate-neutral and smart cities",
             "Soil health and food"]:
    print(term)
    print("-"*len(term))
    top_doc, data, all_scores = generate_indicator(term)
    if top_doc is None:
        # No scored document matched any of the countries, so there is nothing to show
        print("No matching documents found")
    else:
        print(top_doc['title_of_article'], ",", top_doc['year_of_article'])
        print(top_doc['terms_countries_article'])
        print(top_doc['textBody_abstract_article'])
    print("\n==============================\n")

Adaptation to climate change, including societal transformation
---------------------------------------------------------------
Validity of altmetrics data for measuring societal impact: A study using
  data from Altmetric and F1000Prime , 2014
['DE']
Can altmetric data be validly used for the measurement of societal impact?
The current study seeks to answer this question with a comprehensive dataset
(about 100,000 records) from very disparate sources (F1000, Altmetric, and an
in-house database based on Web of Science). In the F1000 peer review system,
experts attach particular tags to scientific papers which indicate whether a
paper could be of interest for science or rather for other segments of society.
The results show that papers with the tag "good for teaching" do achieve higher
altmetric counts than papers without this tag - if the quality of the papers is
controlled. At the same time, a higher citation count is shown especially by
papers with a tag that is specifically scientif