# Imports

In [10]:
import pandas as pd
import os
import re
import requests
import json
import time

# Load data

In [11]:
# Read raw queries file
row_queries = pd.read_excel('https://docs.google.com/uc?id=13-a2B6RNmUQOFAsJ0vLZISuPZNRSXiz-')

print ('Number of raw items:',row_queries.shape[0])

row_queries.head()

Number of raw items: 33716


Unnamed: 0.1,Unnamed: 0,source,title,abstract,doi,paperId
0,0,semantic scholar,LEADING the Way: A New Model for Data Science ...,"Addressing the data skills gap, namely the sup...",,b1452907e85f6e4d315ca8c61a1393403a533b2f
1,1,semantic scholar,Re-Shape: A Method to Teach Data Ethics for Da...,Data has become central to the technologies an...,,4b4fdc82593e3c59a8f18fb7ab4e06ece0af3cf6
2,2,semantic scholar,The Role of Academia in Data Science Education,As the demand for data scientists continues to...,,41cf91ee13a1d15983ede066ddf6b67cc94a41f4
3,3,semantic scholar,Data Science Education: Curriculum and pedagogy,Data science is a new field of research focuse...,,5d090dbeae225383d56ee7ac17b611adeb21c3e3
4,4,semantic scholar,Introducing GAISE II: A Guideline for Precolle...,Column Editorâ€™s note: In this column Anna Ba...,,92a7cd8186400b382463dea25b2c0797cac8fbd5


# Pre process raw queries file

The five sources of academic databases used in this research have different information per paper, for example DOI is given only by ACM and IEEE, and not by Web of Science and Google scholar, therefore, it is not possible to search based on a single identifier. As most papers do not have DOI, we use title search. Titles also vary between sources, for example different capitalization style, or using - instead of :. Therefore, we remove any non alphabetic characters from the title and convert all characters to lowercase.


In [12]:
row_queries['title'] = row_queries['title'].str.lower()
row_queries['title'] = row_queries['title'].str.replace('&amp;','')
row_queries['title'] = row_queries['title'].str.replace('[^ 0-9a-zA-Z]','',regex=True)
row_queries['title'] = row_queries['title'].str.replace(' +',' ',regex=True)

Drop duplicated papers

In [13]:
row_queries.drop_duplicates('title',inplace=True)

print ('Number of unique raw items:',row_queries.shape[0])

Number of unique raw items: 31758


# Search papers in Semantic Scholar

In [19]:
Semantic_scholar_API_key = '' #Put your API KEY here

if Semantic_scholar_API_key:
  headers={"x-api-key":Semantic_scholar_API_key}
else:
  headers={}

Find matching Semantic scholar papre ID for papers from the other sources

In [21]:
for i in row_queries[row_queries['paperId'].isna()].index:

  title = row_queries.loc[i,'title']

  # get matching documants from Semantic Scholar
  response = requests.get('https://api.semanticscholar.org/graph/v1/paper/search?query='+title+'&limit=10&fields=title,abstract',headers=headers)
  response_dict = json.loads(response.text)

  if 'data' not in response_dict:
    continue

  # match all papers in the response
  title_stem = re.sub('[^a-zA-Z]','',title).lower()

  for candidate in response_dict['data']:
    candidate_title_stem = re.sub('[^a-zA-Z]','',candidate['title']).lower()

    if candidate_title_stem == title_stem:
      row_queries.loc[i,'paperId'] = candidate['paperId']
      row_queries.loc[i,'abstract'] = candidate['abstract']
      print(i,'found:',title)
      break

    # If exact mach was not found, search for 95% correlation between the paper title and acandidate title
    elif (candidate['title'].count(' ')>4) and (title.count(' ')>4) and len(candidate_title_stem)>5 and len(title_stem)>5:
      (s1,s2) = (candidate_title_stem,title_stem) if len(candidate_title_stem)>len(title_stem) else (title_stem,candidate_title_stem)
      s1=' '*(len(s2)//5)+s1+' '*(len(s2)//5)
      corr = max([sum([c1==c2 for (c1,c2) in zip(s1[i:i+len(s2)],s2)])/len(s2) for i in range(len(s1)+2*len(s2)//5)])
      if corr>0.95:
        row_queries.loc[i,'paperId'] = candidate['paperId']
        row_queries.loc[i,'abstract'] = candidate['abstract']
        print(i,'found:',title)
        break

    else:
      print(i,'missing:',title)

  time.sleep(.1)

30086 found
30087 found
30092 found
30098 found
30102 found
30110 found
30111 found
30112 found
30114 found
30119 found
30121 found
30122 found
30123 found
30124 found
30127 found
30129 found
30130 found
30131 found
30133 found
30134 found
30137 found
30140 found
30145 found
30146 found
30147 found
30149 found
30150 found
30151 found
30155 found
30157 found
30159 found
30160 found
30163 found
30165 found
30166 found
30167 found
30170 found
30172 found
30174 found
30175 found
30176 found
30179 found
30180 found
30182 found
30186 found
30187 found
30188 found
30189 found
30190 found
30194 found
30197 found
30200 found
30201 found
30202 found
30204 found
30205 found
30206 found
30208 found
30211 found
30215 found
30216 found
30218 found
30219 found
30222 found 1.0 big data business technology education and science big data ubiquity symposium - Big Data: Business, Technology, Education, and Science
30224 found
30227 found
30228 found
30229 found
30230 found
30232 found
30234 found
30235 fo

# Verify that the search terms are found in title of in abstract

In [22]:
row_queries.dropna(subset=['paperId','title','abstract'],inplace=True)
row_queries.shape[0]

26956

In [23]:
def key_words_found(s):
  return ('data science' in s) and any([x in s for x in ['education','curriculum','pedagogy','teach']])

row_queries = row_queries[row_queries['title'].str.lower().apply(key_words_found) | row_queries['abstract'].str.lower().apply(key_words_found)]
print('Papers with valid title and abstract:',row_queries.shape[0])

Papers with valid title and abstract: 1073


In [24]:
pd.to_excel('DSE Semantic scholar ids.xlsx')

AttributeError: ignored