# Literature Search with the Semantic Scholar API

### Notebook for using the Semantic Scholar API to search for literature given a set of topics 

- Search for all papers matching the search terms within the chosen field(s) of study (e.g., Education)
- Store the results in a Pandas DataFrame
- Filter in the second stage using titles and abstracts

In [1]:
# Import libraries for importing data from Semantic Scholar and saving the data
# Install Semantic Scholar 
# pip install semanticscholar

import numpy as np
import pandas as pd
import time
from semanticscholar import SemanticScholar as SC

### Fields to store: 
'paperId', 'url', 'title', 'abstract', 'venue', 'year', 'referenceCount', 'citationCount', 
'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 's2FieldsOfStudy', 'publicationTypes', 'publicationDate',
'journal', 'authors'

In [2]:
# Function to take the data for each paper and flatten it into a single tier dictionary
# for converting to a Pandas DataFrame

def flatten(d, max_authors=10):
    """Flatten one paper record into a single-level dict for a DataFrame row.

    Parameters
    ----------
    d : mapping-like
        A paper record that supports ``d[key]`` lookups for the scalar keys
        listed below plus 'journal', 's2FieldsOfStudy' and 'authors'.
    max_authors : int, optional
        Maximum number of author names to keep (default 10).

    Returns
    -------
    dict
        The scalar fields copied as-is, plus:
        - 'journal_name' when the record has a journal with a 'name' entry,
        - 's2FieldsOfStudy_1', 's2FieldsOfStudy_2', ... (one per category),
        - 'authors_1' ... 'authors_<max_authors>' (one per author name).

    Raises
    ------
    KeyError
        If one of the expected keys is absent from the record.
    """
    scalar_keys = ['paperId', 'url', 'title', 'abstract', 'venue', 'year', 'referenceCount',
        'citationCount', 'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 'publicationTypes', 'publicationDate']

    flat = {key: d[key] for key in scalar_keys}

    journal = d['journal']
    if journal is not None and 'name' in journal:
        flat['journal_name'] = journal['name']

    # A null list is treated as empty so that one sparse record does not raise
    # and abort the surrounding search loop.
    for idx, field in enumerate(d['s2FieldsOfStudy'] or [], start=1):
        flat['s2FieldsOfStudy_' + str(idx)] = field['category']

    # zip() with a bounded range caps the number of stored authors.
    for idx, author in zip(range(1, max_authors + 1), d['authors'] or []):
        flat['authors_' + str(idx)] = author['name']

    return flat

### Search term(s), field(s) of study, and years
- Define the field(s) of study
- Define the search term(s)
- Define the time range - years

In [5]:
# Search configuration: one or more key phrases, restricted to the listed
# field(s) of study and to an inclusive range of publication years.
fields = ['Education']
searchterms = ['linguistic complexity prediction']

# First and last publication year to search (both inclusive)
start = 2021
end = 2022
years = range(start,end+1)  # end+1 because range() excludes its upper bound

# Client object used for all searches below (SC is the imported SemanticScholar class)
sch = SC()

### Iterate through the years and search terms

In [6]:
# Pause for PAUSE_SECONDS after every PAUSE_EVERY fetched records to stay
# within the API rate limits.
PAUSE_EVERY = 5000
PAUSE_SECONDS = 100

# One de-duplicated DataFrame per year that returned results
df_list = []

for y in years:
    records = []     # flattened papers collected for this year (all terms)
    n_fetched = 0    # number of records fetched for this year

    for term in searchterms:
        try:
            results = sch.search_paper(term, fields_of_study=fields, year=y, limit=100)

            for item in results:
                records.append(flatten(item))

                n_fetched += 1
                if n_fetched % PAUSE_EVERY == 0:
                    time.sleep(PAUSE_SECONDS)
        except Exception as exc:
            # Best effort: keep what was collected so far and move on to the
            # next term, but report the failure instead of hiding it.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print(f'Search for "{term}" in {y} stopped early: {exc!r}')

    n_unique = 0
    if records:
        # The same paper can match several search terms; keep it once.
        df = pd.DataFrame(records).drop_duplicates(subset=['paperId'])
        df_list.append(df)
        n_unique = len(df)

    print(f'{n_fetched} results for {y}, of which {n_unique} are unique.')

87 results for 2021, of which 87 are unique.
101 results for 2022, of which 101 are unique.


In [7]:
# Stack the per-year frames into one table. ignore_index=True gives every row
# a unique label; without it each yearly frame keeps its own 0-based labels,
# so the combined index would contain duplicates.
df_all = pd.concat(df_list, ignore_index=True)
print(f'Total papers: {len(df_all)}.')
df_all.head()

Total papers: 188.


Unnamed: 0,paperId,url,title,abstract,venue,year,referenceCount,citationCount,isOpenAccess,openAccessPdf,...,authors_2,authors_3,authors_4,authors_5,authors_6,authors_7,authors_8,authors_9,authors_10,s2FieldsOfStudy_3
0,37503be3729c5203e15ec45bcdf20f6a30f481e0,https://www.semanticscholar.org/paper/37503be3...,Using speech comprehension to qualify communic...,,,2021,82,10,False,,...,Chiara Visentin,E. Borella,I. Mammarella,A. Domenico,,,,,,
1,f426a6cea183a56c737967c24bebbef7ad166cfe,https://www.semanticscholar.org/paper/f426a6ce...,Comparison of Methods for Evaluating Complexit...,Research has explored using Automatic Text Sim...,International Conference on Human Factors in C...,2021,68,8,False,,...,J. Trussell,Becca Dingman,Matt Huenerfauth,,,,,,,
2,8afa9cc32d6e5fec41856559352e163bda5600cd,https://www.semanticscholar.org/paper/8afa9cc3...,Why is Complexity Science valuable for reachin...,,Rendiconti Lincei SCIENZE FISICHE E NATURALI,2021,124,13,True,{'url': 'https://link.springer.com/content/pdf...,...,,,,,,,,,,
3,4b9b0c661574d1d397ba71bc1cbcace5f7ac9c0f,https://www.semanticscholar.org/paper/4b9b0c66...,ORGANIZATION OF SELF-STUDY WORK IN THE IMPLEME...,The article is devoted to the problems of impl...,Современные наукоемкие технологии (Modern High...,2021,13,0,True,{'url': 'https://s.top-technologies.ru/pdf/202...,...,E. Ivanova,M. Polyakova,,,,,,,,
4,0d0c1f55ab117b36fefd0c4307ff1f84037f75ce,https://www.semanticscholar.org/paper/0d0c1f55...,RETRACTED ARTICLE: Application of Deep Learnin...,,The Arabian journal for science and engineering,2021,39,6,False,,...,Ting Qiu,K. Deepa Thilak,,,,,,,,


### Clean the Title and Abstract fields
- Clean up white spaces
- Add flags to make the file more searchable

In [8]:
# Collapse every run of whitespace in the title and abstract to a single space
for text_col in ('title', 'abstract'):
    df_all[text_col] = df_all[text_col].replace(r'\s+', ' ', regex=True)

### Add tags for: 
- classroom 
- student assessments and exams

The tags will be added based on string matches

In [9]:
# Terms used to tag papers by topic. Each list is later joined with '|' into a
# single pattern and matched case-insensitively against title and abstract.
# NOTE(review): only plural forms are listed for the exam topic; a match on
# 'exams' will not hit the singular 'exam' - confirm this is intended.
l_class = ['classroom']
l_exam = ['assessments', 'exams', 'examinations']

In [11]:
# Add a 0/1 flag per topic: 1 when any of the topic's terms occurs in the
# title or the abstract (case-insensitive), 0 otherwise.

def _flag_matches(frame, pattern, columns=('title', 'abstract')):
    """Return a 0/1 Series marking rows where `pattern` matches any column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the text columns to search.
    pattern : str
        Regular expression passed to ``Series.str.contains``.
    columns : sequence of str, optional
        Names of the text columns to search.

    Returns
    -------
    pandas.Series
        Integer Series aligned with ``frame``: 1 for a match, else 0.
    """
    hits = frame[list(columns)].apply(
        # na=False: a missing title/abstract counts as "no match"
        lambda col: col.str.contains(pattern, case=False, na=False)
    )
    return hits.any(axis=1).astype(int)


regstr_class = '|'.join(l_class)
df_all['classroomFlag'] = _flag_matches(df_all, regstr_class)

regstr_exam = '|'.join(l_exam)
df_all['examFlag'] = _flag_matches(df_all, regstr_exam)

In [12]:
# Replace every remaining missing value with an empty string. This applies to
# all columns, so numeric columns that contain gaps end up holding a mix of
# numbers and '' - NOTE(review): confirm that is acceptable for consumers of
# the exported file.
df_all = df_all.fillna('')

In [13]:
# Preview the first rows, now including the classroomFlag / examFlag columns
df_all.head()

Unnamed: 0,paperId,url,title,abstract,venue,year,referenceCount,citationCount,isOpenAccess,openAccessPdf,...,authors_4,authors_5,authors_6,authors_7,authors_8,authors_9,authors_10,s2FieldsOfStudy_3,classroomFlag,examFlag
0,37503be3729c5203e15ec45bcdf20f6a30f481e0,https://www.semanticscholar.org/paper/37503be3...,Using speech comprehension to qualify communic...,,,2021,82,10,False,,...,I. Mammarella,A. Domenico,,,,,,,1,0
1,f426a6cea183a56c737967c24bebbef7ad166cfe,https://www.semanticscholar.org/paper/f426a6ce...,Comparison of Methods for Evaluating Complexit...,Research has explored using Automatic Text Sim...,International Conference on Human Factors in C...,2021,68,8,False,,...,Matt Huenerfauth,,,,,,,,0,0
2,8afa9cc32d6e5fec41856559352e163bda5600cd,https://www.semanticscholar.org/paper/8afa9cc3...,Why is Complexity Science valuable for reachin...,,Rendiconti Lincei SCIENZE FISICHE E NATURALI,2021,124,13,True,{'url': 'https://link.springer.com/content/pdf...,...,,,,,,,,,0,0
3,4b9b0c661574d1d397ba71bc1cbcace5f7ac9c0f,https://www.semanticscholar.org/paper/4b9b0c66...,ORGANIZATION OF SELF-STUDY WORK IN THE IMPLEME...,The article is devoted to the problems of impl...,Современные наукоемкие технологии (Modern High...,2021,13,0,True,{'url': 'https://s.top-technologies.ru/pdf/202...,...,,,,,,,,,0,0
4,0d0c1f55ab117b36fefd0c4307ff1f84037f75ce,https://www.semanticscholar.org/paper/0d0c1f55...,RETRACTED ARTICLE: Application of Deep Learnin...,,The Arabian journal for science and engineering,2021,39,6,False,,...,,,,,,,,,0,0


In [7]:
# Save the tagged results as CSV; the path is relative, so the file lands in
# the current working directory. The DataFrame index is written as the first,
# unnamed column (to_csv default).
# NOTE(review): the file name looks like a typo of "Literature.csv"; it is left
# unchanged here in case other tooling reads this path.
df_all.to_csv("Literare.csv")