# Imports

In [1]:
import pandas as pd
import os
import re
import requests
import json
import time

# Load data

In [36]:
# Read raw queries file
row_queries = pd.read_csv('raw queries.csv')

print ('Number of raw items:',row_queries.shape[0])

row_queries.head()

Number of raw items: 33130


Unnamed: 0,source,title,abstract,doi,paperId
0,Semantic Scholar,leading the way a new model for data science e...,"Addressing the data skills gap, namely the sup...",,b1452907e85f6e4d315ca8c61a1393403a533b2f
1,Semantic Scholar,reshape a method to teach data ethics for data...,Data has become central to the technologies an...,,4b4fdc82593e3c59a8f18fb7ab4e06ece0af3cf6
2,Semantic Scholar,the role of academia in data science education,As the demand for data scientists continues to...,,41cf91ee13a1d15983ede066ddf6b67cc94a41f4
3,Semantic Scholar,data science education curriculum and pedagogy,Data science is a new field of research focuse...,,5d090dbeae225383d56ee7ac17b611adeb21c3e3
4,Semantic Scholar,introducing gaise ii a guideline for precolleg...,Column Editorâ€™s note: In this column Anna Ba...,,92a7cd8186400b382463dea25b2c0797cac8fbd5


# Pre process raw queries file

The five sources of academic databases used in this research have different information per paper, for example DOI is given only by ACM and IEEE, and not by Web of Science and Google scholar, therefore, it is not possible to search based on a single identifier. As most papers do not have DOI, we use title search. Titles also vary between sources, for example different capitalization style, or using - instead of :. Therefore, we remove any non alphabetic characters from the title and convert all characters to lowercase.


In [37]:
row_queries['title'] = row_queries['title'].str.lower()
row_queries['title'] = row_queries['title'].str.replace('&amp;','')
row_queries['title'] = row_queries['title'].str.replace('[^ 0-9a-zA-Z]','',regex=True)
row_queries['title'] = row_queries['title'].str.replace(' +',' ',regex=True)

Drop duplicated papers

In [38]:
row_queries.drop_duplicates('title',inplace=True)

print ('Number of unique raw items:',row_queries.shape[0])

Number of unique raw items: 31749


# Search papers in Semantic Scholar

In [39]:
Semantic_scholar_API_key = '' #Put your API KEY here

if Semantic_scholar_API_key:
  headers={"x-api-key":Semantic_scholar_API_key}
else:
  headers={}

Find matching Semantic scholar papre ID for papers from the other sources

In [41]:
for i in row_queries[row_queries['paperId'].isna()].index:

  title = row_queries.loc[i,'title']

  # get matching documants from Semantic Scholar
  response = requests.get('https://api.semanticscholar.org/graph/v1/paper/search?query='+title+'&limit=10&fields=title,abstract',headers=headers)
  response_dict = json.loads(response.text)

  if 'data' not in response_dict:
    continue

  # match all papers in the response
  title_stem = re.sub('[^a-zA-Z]','',title).lower()

  for candidate in response_dict['data']:
    candidate_title_stem = re.sub('[^a-zA-Z]','',candidate['title']).lower()

    if candidate_title_stem == title_stem:
      row_queries.loc[i,'paperId'] = candidate['paperId']
      row_queries.loc[i,'abstract'] = candidate['abstract']
      print(i,'found:',title)
      break

    # If exact mach was not found, search for 95% correlation between the paper title and acandidate title
    elif (candidate['title'].count(' ')>4) and (title.count(' ')>4) and len(candidate_title_stem)>5 and len(title_stem)>5:
      (s1,s2) = (candidate_title_stem,title_stem) if len(candidate_title_stem)>len(title_stem) else (title_stem,candidate_title_stem)
      s1=' '*(len(s2)//5)+s1+' '*(len(s2)//5)
      corr = max([sum([c1==c2 for (c1,c2) in zip(s1[i:i+len(s2)],s2)])/len(s2) for i in range(len(s1)+2*len(s2)//5)])
      if corr>0.95:
        row_queries.loc[i,'paperId'] = candidate['paperId']
        row_queries.loc[i,'abstract'] = candidate['abstract']
        print(i,'found:',title)
        break

    else:
      print(i,'missing:',title)

  time.sleep(.1)

29505 found: building bridges for data science education
29511 found: data science a comprehensive overview
29515 found: national academies roundtable on data science postsecondary education
29519 found: a journal for interdisciplinary data science education
29529 found: review of modern data science
29530 found: formation in data science in secondary education big data as a transversal competence
29535 found: data science for all a tale of two cities
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
29538 missing: exploring selfefficacy in data science
2954

# Verify that the search terms are found in title of in abstract

In [42]:
row_queries.dropna(subset=['paperId','title','abstract'],inplace=True)
row_queries.shape[0]

26645

In [43]:
def key_words_found(s):
  return ('data science' in s) and any([x in s for x in ['education','curriculum','pedagogy','teach']])

row_queries = row_queries[row_queries['title'].str.lower().apply(key_words_found) | row_queries['abstract'].str.lower().apply(key_words_found)]
print('Papers with valid title and abstract:',row_queries.shape[0])

Papers with valid title and abstract: 955


In [44]:
row_queries.to_csv('semantic scholar ids.csv',index=False)