# Collect and process data

Closes issue #1

## Preamble

In [3]:
%matplotlib inline

In [4]:
import os
import sys
import seaborn as sn
sys.path.append(os.path.abspath('../../'))
#from query_indicators import generate_save_path
from query_indicators import get_notebook_name
from query_indicators import get_eu_countries

In [5]:
from collections import defaultdict
import io
import pickle

import boto3
from clio_lite import clio_search, clio_search_iter
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Rectangle
import numpy as np
import pandas as pd

In [6]:
# Notebook-wide settings

# Search endpoint handed to clio as `url`.
URL = "https://search-eurito-prod-bbyn72q2rhx4ifj6h5dom43uhy.eu-west-1.es.amazonaws.com/"
# INDEX = "arxiv_v0"  (disabled: no global index is set here)
FIELDS = ['terms_tokens_entity', 'textBody_abstract_article']

# get_eu_countries() returns an erroneous list, so the unwanted codes
# are filtered out as the list is built.
EU_COUNTRIES = [
    code
    for code in get_eu_countries()
    if code not in ("AX", "FO", "GF", "GI", "IM")
]

# Plot styling
COLOR_MAP = 'Pastel1'
COLORS = plt.get_cmap('Set2').colors

# S3 resource handle
S3 = boto3.resource('s3')

In [7]:
#Functions

In [8]:
def emergent_search(query, max_query_terms, index, fields, yr0=2014, yr1=2019,
                    countries=EU_COUNTRIES, window=1):
    """
    Retrieve scored documents for a given basic clio search, year by year.

    For every value in `max_query_terms` and every start year in
    `yr0..yr1` (inclusive), `clio_search_iter` is run against `URL` with a
    `year_of_article` range filter of `[yr, yr + window)`, and the matching
    documents are collected.

    Args:
        query (str): Seed query for clio.
        max_query_terms (list): Values of max_query_terms to use from the
            initial query, e.g. a (low, middle, high) triple. The full year
            range is searched once per value.
        index (str): Name of the index to search.
        fields (list): Fields passed through to clio for the search.
        yr0 (int): Start year in range to use in filter.
        yr1 (int): Final year in range to use in filter (inclusive).
        countries (list): A list of countries to filter (default to all EU).
            NOTE: this argument is currently not applied anywhere in the
            function body; results are not filtered by country.
        window (int): Width, in years, of each time window. Windows start at
            every year, so any value above 1 makes consecutive windows
            overlap and leads to double-counting.
    Returns:
        list: The documents yielded by clio, each with two extra keys:
            'max_query_terms' (the value used for that search) and
            'query_source' (the seed query). Documents without a '_score',
            or whose 'terms_countries_article' is missing or None, are
            left out.
    """
    results = []

    print(f'running {query}')

    for n in max_query_terms:
        # Iterate over years
        for yr in range(yr0, yr1 + 1):
            filters = [{"range": {"year_of_article": {"gte": yr, "lt": yr + window}}}]
            # Iterate over docs
            for doc in clio_search_iter(url=URL, index=index, query=query, fields=fields,
                                        max_query_terms=n, post_filters=filters, chunksize=5000):
                # .get() treats a hit that lacks the country field the same as
                # one whose value is None, instead of raising KeyError.
                if '_score' not in doc or doc.get('terms_countries_article') is None:
                    continue

                # Record which search produced this document.
                doc['max_query_terms'] = n
                doc['query_source'] = query
                results.append(doc)

    return results
   

## ArXiv

### Collect data

In [20]:
# One result list per AI seed query: years 2000-2019, searched with
# max_query_terms of 3, 10 and 15.
ai_results = [
    emergent_search(
        seed_query,
        [3, 10, 15],
        index='arxiv_v0',
        fields=['terms_tokens_entity', 'textBody_abstract_article'],
        yr0=2000,
        yr1=2019,
    )
    for seed_query in [
        'artificial intelligence',
        'intelligent system',
        'expert system',
        'machine learning',
        'neural network',
        'deep learning',
        'reinforcement learning',
    ]
]

running artificial intelligence
running intelligent system
running expert system
running machine learning
running neural network
running deep learning
running reinforcement learning


In [29]:
# Serialise results: one pickle file per seed query.

# Must list the seed queries in the same order that `ai_results` was built in,
# because results and names are paired purely by position.
queries = ['artificial intelligence', 'intelligent system', 'expert system',
           'machine learning', 'neural network', 'deep learning', 'reinforcement learning']

# zip() silently stops at the shorter input, so fail loudly on a mismatch
# rather than quietly skipping some results.
assert len(ai_results) == len(queries), 'ai_results and queries differ in length'

out_dir = '../../data/ai_results'
# open() does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

for query_results, name in zip(ai_results, queries):
    # File-system friendly name: spaces become underscores.
    clean_name = name.replace(' ', '_')

    print(clean_name)

    with open(f'{out_dir}/{clean_name}.p', 'wb') as outfile:
        pickle.dump(query_results, outfile)

artificial_intelligence
intelligent_system
expert_system
machine_learning
neural_network
deep_learning
reinforcement_learning


#### Sectoral data

In [18]:
# One result list per sector seed query: years 2010-2019, searched with
# max_query_terms of 1, 2 and 5.
sector_results = [
    emergent_search(
        sector_query,
        [1, 2, 5],
        index='arxiv_v0',
        fields=['terms_tokens_entity', 'textBody_abstract_article'],
        yr0=2010,
        yr1=2019,
    )
    for sector_query in [
        'health medical',
        'climate environment',
        'manufacturing factory',
        'finance market',
    ]
]

running health medical
running climate environment
running manufacturing factory
running finance market


In [17]:
# Number of documents collected for each sector query, in query order.
list(map(len, sector_results))

[14553, 5016, 5319, 6954]