# SHARE Query Requests from the Community

Here's where we can keep track of code for common things that members of the SHARE community might like to know!

## Setup

In [25]:
import furl
import json
import requests

SHARE_SEARCH_API = 'https://osf.io/api/v1/share/search/'
ALL_PROVIDER_INFO = requests.get('https://osf.io/api/v1/share/providers/').json()['providerMap']

def query_share(url, query):
    headers = {'Content-Type': 'application/json'}
    data = json.dumps(query)
    return requests.post(url, headers=headers, data=data, verify=False).json()

def get_longname_for_shortname(shortname):
    for source in ALL_PROVIDER_INFO.keys():
        if source == shortname:
            return ALL_PROVIDER_INFO[source]['long_name']
    

## Queries

In [2]:
# What's the earliest and latest document from each source?

import pandas as pd

date_stats_agg = {
    "aggregations": {
        "sources": {
            "terms": {"field": "_type", "size": 0},
            "aggregations": {
                "source_stats": {
                    "stats": {"field": "providerUpdatedDateTime"}
                }
            }
        }
    }
}

date_results = query_share(SHARE_SEARCH_API, date_stats_agg)['aggregations']['sources']['buckets']

date_results_df = pd.DataFrame()
date_results_df['source_shortname'] = [result['key'].encode('utf-8') for result in date_results]
date_results_df['source_longname'] = [get_longname_for_shortname(name).encode('utf-8') for name in date_results_df['source_shortname']]
date_results_df['earliest_date'] = [result['source_stats']['min_as_string'] for result in date_results]
date_results_df['latest_date'] = [result['source_stats']['max_as_string'] for result in date_results]
date_results_df



Unnamed: 0,source_shortname,source_longname,earliest_date,latest_date
0,datacite,DataCite MDS,2015-07-26T00:03:30.000Z,2016-06-02T01:58:12.000Z
1,crossref,CrossRef,2014-08-03T00:00:00.000Z,2016-06-03T00:00:00.000Z
2,scitech,DoE's SciTech Connect Database,2014-10-03T00:00:00.000Z,2016-06-03T00:00:00.000Z
3,figshare,figshare,2014-10-28T00:00:00.000Z,2016-06-03T15:07:00.000Z
4,pubmedcentral,PubMed Central,2014-12-28T00:00:00.000Z,2016-06-03T00:00:00.000Z
5,dataone,DataONE: Data Observation Network for Earth,2015-04-11T00:00:00.000Z,2016-05-28T00:00:00.000Z
6,arxiv_oai,ArXiv,2014-10-03T00:00:00.000Z,2016-06-03T00:00:00.000Z
7,rcaap,RCAAP - Repositório Científico de Acesso Abert...,2015-12-27T02:00:54.000Z,2016-06-03T05:08:35.000Z
8,cyberleninka,CyberLeninka - Russian open access scientific ...,2015-12-22T00:00:00.000Z,2016-06-02T22:07:47.000Z
9,citeseerx,CiteSeerX Scientific Literature Digital Librar...,2008-07-01T00:00:00.000Z,2016-06-02T00:00:00.000Z


In [None]:
# Uncomment the following lines if running locally - will save to file formats

# date_results_df.to_csv('SHARE_Min_Max_dates.csv')
# date_results_df.to_excel('SHARE_Min_Max_dates.xlsx')

## Lucene Search and NOT Queries

A user wanted to know how to query for one term but exclude another

In [None]:
query = '?q=pedigree NOT child'

In [None]:
results = requests.get(SHARE_SEARCH_API + query).json()
results

## Querying by Document Type

Currently, document type is not curated by SHARE. However, we do collected many sources that are using the OAI-PMH metadata protocol, which includes dc:type. You can search that field in SHARE for now, until the harvesters collect and curate document type.

In [None]:
query = '?q=otherProperties.properties.type:article'


In [None]:
results = requests.get(SHARE_SEARCH_API + query).json()

for result in results['results']:
    for prop in result['otherProperties']:
        if prop['name'] == 'type':
            print(prop)
    print(result['title'])
    print(result['uris']['canonicalUri'])
    

Here is an analysis of the top terms found in SHARE's collected dc:type field

In [None]:
import pandas as pd
from sharepa import ShareSearch, basic_search
from sharepa.helpers import pretty_print

type_search = ShareSearch()
total_documents = basic_search.count()

type_search.aggs.bucket(
    'typeTermFilter',  # Every aggregation needs a name
    'terms',  # There are many kinds of aggregations
    field='otherProperties.properties.type',
    exclude= "of|and|or",
    size=50,
)

type_results_executed = type_search.execute()

type_results = type_results_executed.aggregations.typeTermFilter.to_dict()['buckets']

type_dataframe = pd.DataFrame(type_results)
type_dataframe['percent'] = (type_dataframe['doc_count'] / total_documents)*100

In [None]:
type_dataframe

## Query by Exact Phrase

Question -- Is there a way to search SHARE for a specific phrase? For example, information literacy, information AND literacy, and "information literacy" give results with both terms, but not necessarily as the phrase "information literacy." Information and literacy can be in different parts of the record.

In [None]:
phrase_query = {
    "query": {
        "match_phrase" : {
            "title" : "information literacy"
        }
    }
}

results = query_share(SHARE_SEARCH_API, phrase_query)

for result in results['results']:
    print(
        '{} -- from {} -- {}'.format(
            result['title'].encode('utf-8'),
            result['shareProperties']['source'].encode('utf-8'),
            result['uris']['canonicalUri'].encode('utf-8')
        )
    )


In [None]:

# Using sharepa

phrase_search = ShareSearch()

phrase_search = phrase_search.query(
    'match_phrase',
    title="information literacy"
)

results = phrase_search.execute()

for result in results:
    print(
        '{} -- from {} -- {}'.format(
            result.title.encode('utf-8'),
            result.shareProperties.source.encode('utf-8'),
            result.uris.canonicalUri.encode('utf-8')
        )
    )

## Checking the Validity of our Results

If we make a query, we want to do a quick run through of our results just to make sure the query is behaving as we think it is.

Here's an example of making a long query to include and not include specific results. We'll then iterate through all of the results and make sure that the they contain the things we want and do not contain the things we don't want.

In [35]:
query_url = 'https://osf.io/api/v1/share/search/?start=0&size=1000&q=pedigree* OR "relatedness matrix" NOT child NOT "family tree" NOT "dog food" NOT diagnosis'

In [36]:
desired_words = ['pedigree', 'relatedness matrix']
undesired_words = ['child', 'family tree', 'dog food', 'diagnosis']

In [37]:
# requests.get(query_url).json()

In [77]:
# Iterate through those results and make sure the titles and descriptions have at least one desired word, 
# and do not have any of the undesired words


def results_have_correct_words(this_page, desired, undesired):
    print('REJECTING {}'.format(undesired))
    title_check = True
    description_check = True
    number_checked = 0
    for result in this_page:
        number_checked += 1
        if any(word in result['title'] for word in undesired) or not any(word in result['title'] for word in desired):
            title_check = False
        if result.get('description'):
            if any(word in result['description'] for word in undesired) or not any(word in result['description'] for word in desired):
                description_check = False
        if not title_check and not description_check:
            print('-------------')
            print(result['title'])
            print(result['description'])
            print('title: {}  description: {}'.format(title_check, description_check))
    return title_check or description_check, number_checked


total_results = requests.get(query_url).json()['count']
total_checked = 0
start = 0
size = 1000
def check_results(url_to_check, total_checked, start, desired, undesired):
    print('starting at {}'.format(start))
    url = furl.furl(url_to_check)
    url.args['start'] = start
    results = requests.get(url.url).json()
    if total_checked < total_results:
        status, checked_this_time = results_have_correct_words(results['results'], desired_words, undesired_words)
        total_checked += checked_this_time
        print('I checked {} this time'.format(checked_this_time))

        if status == False:
            return False
        else:
            start += size
            check_results(url, total_checked, start, desired, undesired)
    
    return True

In [78]:
res = check_results(query_url, 0, 0, desired_words, undesired_words)
res

starting at 0
REJECTING ['child', 'family tree', 'dog food', 'diagnosis']
-------------
Hybrid_Matrix
Relatedness matrix used by the "Hybrid" method at the AsREML format (.grm). Matrix generated a posteriori with a R script.
title: False  description: False
-------------
Relatedness_Matrix
Relatedness matrix used by the "Relatedness" method at the AsREML format (.grm). Matrix generated a posteriori with a R script.
title: False  description: False
-------------
HSFS_Matrix
Relatedness matrix used by the "HSFS" method at the AsREML format (.grm). Matrix generated a posteriori with a R script.
title: False  description: False
-------------
HSFS_Matrix
Relatedness matrix used by the "HSFS" method at the AsREML format (.grm). Matrix generated a posteriori with a R script.
title: False  description: False
-------------
PhiA_Matrix
Relatedness matrix used by the reference method "PhiA" at the AsREML format (.grm). Matrix generated a posteriori with a R script.
title: False  description: Fals

KeyError: 'description'