<a href="https://colab.research.google.com/github/BecomeAllan/S2Search/blob/main/SemanticScholarSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Consumindo a API do SemanticScholar

A seguir há uma classe chamada `Search()`; ao instanciá-la em uma variável, é possível pesquisar papers utilizando a API do SemanticScholar. Os parâmetros disponíveis são:

- Buscar: Tópicos a pesquisar; para adicionar um tópico utiliza-se + (mais) e para remover um tópico utiliza-se - (menos)

  ex. "Machine+Medicine"

- Fields: Os dados que serão retornados. Para utilizar, escolha dentre as opções abaixo, sem espaços e separadas por vírgulas:
  - (str): externalIds
  - (str): url
  - (str): title
  - (str): abstract
  - (str): venue 
  - (str): year 
  - (str): referenceCount
  - (str): citationCount
  - (str): influentialCitationCount
  - (str): isOpenAccess
  - list (str): fieldsOfStudy
  - list (str): authors 

  ex. "title,abstract,isOpenAccess,fieldsOfStudy"

- Offset: Posição, na lista de papers, a partir da qual os resultados começam a ser retornados (0 corresponde ao primeiro paper).

- Limite: Número de papers a serem retornados (Máx. 10.000)

**Obs:** A API do SemanticScholar permite 100 consultas a cada 5 min, e cada consulta retorna no máx. 100 resultados (limite). Assim, a cada 5 min é possível puxar 10.000 papers.

In [12]:
import requests
import json
import multiprocessing as mp
import os
# import numpy as np
import pandas as pd
import re
import ast
from pathlib import Path


from time import sleep, time


def timer(fun):
  """Decorator that prints the wall-clock duration of every call to ``fun``.

  The wrapped function's return value is passed through unchanged. The
  elapsed time is printed only when ``fun`` returns normally; if it raises,
  the exception propagates and nothing is printed.

  Args:
    fun: the callable to time.

  Returns:
    A wrapper that carries ``fun``'s ``__name__`` and ``__doc__``.
  """
  # Imported locally because the cell's import list does not include functools.
  from functools import wraps

  @wraps(fun)  # keep the original name/docstring visible on the wrapper
  def warper(*args, **kwargs):
    start = time()
    d = fun(*args, **kwargs)
    end = time()
    print(f"[{fun.__name__}]>> Demorou {round(end-start,2)}s")
    return d
  return warper





class SearchAPI():
  """Client for the Semantic Scholar Graph API paper-search endpoint.

  Requests are issued through a ``multiprocessing`` pool, one request per
  offset. After ``get`` returns, ``self.all`` holds every fetched paper as a
  single pandas DataFrame and ``self.total`` holds the number of rows
  extracted so far.

  Args:
    search: query string, inserted verbatim into the request URL.
    poolCPU: number of worker processes in the pool.
    sleeptry: seconds to wait before retrying offsets whose request failed.
    save: when True, the accumulated results are written to
      ``<Savename>.csv`` after every successfully extracted batch.
    **kwargs: ``Savename`` (``saveName`` is accepted as an alias),
      ``fields`` and ``offset``.
  """

  _PAGE_SIZE = 100      # "limit" used for every full-size request
  _MAX_RESULTS = 10000  # cap enforced on offset + limit in _query

  def __init__(self, search="decision making+optimization+artificial intelligence", poolCPU = 4, sleeptry=5, save = False, **kwargs):
    self.total = 0
    self.sleeptry = sleeptry
    self.poolCPU = poolCPU
    # 'Savename' is the historical spelling; 'saveName' is accepted as well
    # so that a differently-capitalised keyword is not silently ignored.
    self.saveName = kwargs.get('Savename', kwargs.get('saveName', "Data"))

    self.badcall = []
    self.all = []

    self.saveFile = save

    self._api = "https://api.semanticscholar.org/graph/v1/paper/search"

    self.params = {
    "query": search,
    "limit": self._PAGE_SIZE,
    "fields": kwargs.get('fields', "title,abstract,isOpenAccess,fieldsOfStudy"),
    "offset": kwargs.get('offset', 0),
    }

  def _query(self, offset):
    """Request one page of results starting at ``offset``.

    Returns:
      ``[response, offset, status_code]``. When the HTTP request itself
      fails, ``response`` is None and the status code is reported as 400.
    """
    params = self.params.copy()
    # Always send the requested offset; only the limit is shrunk when the
    # window would run past the cap.
    params['offset'] = offset
    if offset + int(params['limit']) > self._MAX_RESULTS:
      params['limit'] = self._MAX_RESULTS - offset

    url = (self._api
           + "?query=" + str(params['query'])
           + "&limit=" + str(params['limit'])
           + "&fields=" + str(params['fields'])
           + "&offset=" + str(params['offset']))
    try:
      res = requests.get(url, timeout=15)
    except requests.RequestException:
      return [None, offset, 400]

    print(res)
    res.encoding = 'utf-8'
    return [res, offset, res.status_code]

  def _pandas(self, res):
    """Turn one successful response into a DataFrame of its ``data`` list.

    This runs inside pool workers, so it must not update instance state:
    changes made there never reach the parent process. ``_extract`` keeps
    ``self.total`` up to date instead.
    """
    dict_data = json.loads(res.text)
    return pd.DataFrame(dict_data['data'])

  def save(self, name, data):
    """Write ``data`` to ``<name>.csv``; failures are reported, not raised."""
    try:
      data.to_csv(f'{name}.csv')
    except Exception as exc:  # best-effort save: never abort the search
      print("[Save]>> Error to save the data.")
      print(exc)

  @timer
  def _extract(self, pool, data):
    """Parse the successful responses in ``data`` and append them to ``self.all``.

    Args:
      pool: the multiprocessing pool used to parse responses in parallel.
      data: DataFrame with ``Response``, ``Page`` and ``Code`` columns.
    """
    try:
      papers_list = pool.map(self._pandas, data['Response'].tolist())
    except Exception as exc:
      print("_extract>> [Fail], see .badcall to reextract content.")
      print(exc)
      # Record the offsets whose content could not be parsed.
      self.badcall.extend(data['Page'].tolist())
      return

    self.all.extend(papers_list)
    self.total += sum(len(frame) for frame in papers_list)

    if self.saveFile and self.all:
      self.save(self.saveName, pd.concat(self.all).reset_index())

  @timer
  def _runtime(self, pool, offsets):
    """Fetch every offset in ``offsets``, retrying the failures until none remain.

    Offsets whose status code is not 200 are written to ``./BadCalls.text``
    and requested again after ``self.sleeptry`` seconds. There is no retry
    limit.
    """
    while True:
      print('\n')
      print('[_runtime]>> Start searching...')

      try:
        res = pool.map(self._query, offsets)
      except Exception as exc:
        # Wait before retrying so an unexpected error cannot spin the loop.
        # KeyboardInterrupt is not an Exception, so it still stops the run.
        print(f"[_runtime]>> Unexpected error, retrying in {self.sleeptry}s: {exc}")
        sleep(self.sleeptry)
        print("---")
        continue

      resultData = pd.DataFrame(res, columns=["Response", "Page", "Code"])
      succeeded = resultData.query("Code ==200")
      failed = resultData.query("Code !=200")

      if not succeeded.empty:
        self._extract(pool, succeeded)

      if failed.empty:
        break

      print("Bad call of pages:")
      offsets = failed["Page"].values.tolist()
      # A request that raised has no response object to read text from.
      err = ["request failed (no response)" if x is None else x.text
             for x in failed["Response"].tolist()]
      print(err)
      try:
        with open("./BadCalls.text", 'w', encoding='UTF-8') as fp:
          fp.write(str(offsets))
      except OSError:
        print("Fail to save badcalls")
      print(f"Tentando de novo daqui a {self.sleeptry/60} min...")

      sleep(self.sleeptry)
      print("---")

  @timer
  def get(self, n = 10, offset = 0, papers = None):
    """Fetch ``n`` papers starting at item ``offset`` and store them in ``self.all``.

    Args:
      n: number of papers wanted. Up to 100 are fetched in one request;
        larger values are split into requests of 100 plus one request for
        the remainder.
      offset: zero-based position of the first paper to fetch.
      papers: optional explicit list of offsets to fetch (for example the
        offsets saved in ``BadCalls.text``). When given, exactly these
        offsets are requested and no remainder request is added.
    """
    self.n = n
    self._offset = offset
    self.all = []
    print("\n")
    print("Searching...")

    explicit = list(papers) if papers else None
    full_pages, remainder = divmod(n, self._PAGE_SIZE)

    with mp.Pool(self.poolCPU) as pool:
      if n > self._PAGE_SIZE:
        # Reset the limit: an earlier call may have left a remainder-sized one.
        self.params['limit'] = self._PAGE_SIZE

        if explicit is not None:
          offsets = explicit
        else:
          offsets = [offset + self._PAGE_SIZE * i for i in range(full_pages)]
          print("offsets: ")
          print(offsets)

        self._offsets = offsets
        self._runtime(pool, offsets)

        if remainder > 0 and explicit is None:
          self.params['limit'] = remainder
          self._offsets = [offsets[-1] + self._PAGE_SIZE]
          self._runtime(pool, self._offsets)

      else:
        offsets = explicit if explicit is not None else [offset]
        self._offsets = offsets
        self.params['limit'] = n
        self._runtime(pool, offsets)

    # pd.concat raises on an empty list, e.g. when every extraction failed.
    if self.all:
      self.all = pd.concat(self.all, ignore_index=True)
    else:
      self.all = pd.DataFrame()


# if __name__ == '__main__':
#   SearchAPI(sleeptry = 1*20, save=True, Savename = "dataAPI").get(10000)


In [13]:
# Query the API for the first 10 papers; failed requests are retried
# after a 20-second pause.
res_api = SearchAPI(
    search="decision making+optimization+artificial intelligence",
    sleeptry=20,
    save=False,
)
res_api.get(10)

# Last expression of the cell, so the notebook renders the collected results.
res_api.all




Searching...


[_runtime]>> Start searching...
<Response [200]>
[_extract]>> Demorou 0.01s
[_runtime]>> Demorou 0.76s
[get]>> Demorou 0.85s


Unnamed: 0,paperId,title,abstract,isOpenAccess,fieldsOfStudy
0,d1e2126a46dfbf53580453c2f666f128afd7bb26,Multi-Objective Optimization Algorithm and Pre...,The operating mechanism of the biological immu...,False,[Computer Science]
1,20c19e2d76a9dc0e51860c174cfdac74aadf0369,Ship structural safety optimization: an integr...,ABSTRACT Majority of the world cargoes is tran...,False,[Computer Science]
2,4e8284cb075a8f0b8a9329f499fdba7b3c2d24fe,Artificial intelligence based commuter behavio...,Road traffic environments are highly dynamic a...,False,[Computer Science]
3,9563b0a34a76d6cf74c91d82583327ed97155668,Developing an artificial intelligence-based de...,,False,[Engineering]
4,ddbbc8b2b3bdbf8b6329d1fbb19372d8e12812fb,An Artificial Intelligence Platform for Asset ...,An Artificial Intelligence system was develo...,False,"[Computer Science, Medicine]"
5,b4cb44551a0e08bf2b5b332df9204e2ba10ff0af,The artificial intelligence and optimization o...,The problem of complicated dynamic system opti...,False,
6,c3dae6d967186a3fce07246e156cbfb4eaf473da,Adaptive Simulation-Based Training of Artifici...,This work studies how an artifical-intelligenc...,False,[Computer Science]
7,8a86870043137e8f803906161ebaec5f4b4afaf0,Structuring an artificial intelligence based d...,Abstract Cyclic steam stimulation (CSS) is one...,False,[Engineering]
8,fd4082c64e54e5aad62e14076fa7c8fc52d830f1,Collective behavior of artificial intelligence...,Collective behavior in the resource allocation...,False,[Computer Science]
9,05ce03fcef7b71d4ec5c8f9db8527ec7b2f70ac0,Artificial Intelligence Enabling Water Desalin...,"Recently, water desalination has been developi...",False,[Computer Science]


# Consumir a classe `Search()`

Há duas formas de pesquisar utilizando `Search()`:

1. A primeira é passando os parâmetros diretamente para a própria classe:

In [None]:
#@title Classe para pesquisa no SemanticScholar
import IPython
from google.colab import output
import pandas as pd

class Search():
  """Semantic Scholar paper search that runs in the browser of a Colab notebook.

  Instantiating the class injects HTML/JavaScript into the cell output. The
  JavaScript calls the Semantic Scholar search endpoint and hands the JSON
  back to Python through the callbacks registered in ``_start``; the papers
  end up in ``self.data`` as a pandas DataFrame.

  Args (all optional keywords):
    search: query string.
    fields: comma-separated list of fields to return.
    limit: number of papers wanted.
    offset: zero-based position of the first paper.

  With no keyword at all a search form is displayed. With at least one
  keyword the search runs immediately and any missing keyword takes the
  value from ``_DEFAULTS``.
  """

  # Fallbacks for keywords left out in immediate-search mode; they mirror
  # the values pre-filled in the search form.
  _DEFAULTS = {
      "search": "Machine Learning+Deep Learning",
      "fields": "title,abstract,isOpenAccess,fieldsOfStudy",
      "limit": 10,
      "offset": 0,
  }

  def __init__(self, **kwargs):
    self.data = ""
    self.data_0 = ""

    self.search = kwargs.get('search', None)
    self.fields = kwargs.get('fields', None)
    self.limit = kwargs.get('limit', None)
    self.offset = kwargs.get('offset', None)

    given = (self.search, self.fields, self.limit, self.offset)
    self._start(any(value is not None for value in given))

  @staticmethod
  def _js_literal(value):
    """Return ``value`` as a quoted JavaScript string literal safe inside <script>."""
    import json  # local import: this cell's import list does not include json
    # json.dumps escapes quotes and backslashes; "</" is broken up so the
    # value cannot close the surrounding <script> element.
    return json.dumps(str(value)).replace("</", "<\\/")

  def _render_script(self, use_params):
    """Build the HTML/JavaScript shown in the cell output.

    Args:
      use_params: True to search immediately with the instance's
        parameters, False to render the form and search on button click.

    Returns:
      The markup as a string; nothing is displayed here.
    """
    boxs = ''' 
        <label for="query">Buscar: </label>
        <input type="text" id="query" value="Machine Learning+Deep Learning" style="width: 400px;"/>
        <br/>
        <br/>
        
        <label for="fields">Fields: </label>
        <input type="text" id="fields" value="title,abstract,isOpenAccess,fieldsOfStudy" style="width: 400px;"/>
        <br/>
        <br/>
 
        <label for="limit">Limite: </label>
        <input type="text" id="limit" value="10" style="width: 50px;"/><br/>
        <br/>

        <label for="offset">Offset: </label>
        <input type="text" id="offset" value="0" style="width: 50px;"/><br/>
        <br/>

        <button id='button'>Pesquisar</button>
        <br/>
        <br/>
           '''

    button = ''' document.querySelector('#button').onclick = async () => ''' # {}

    search_query = '''
            var search = document.getElementById("query").value
            var fields = document.getElementById("fields").value
            var limit = parseInt(document.getElementById("limit").value)
            var offset = parseInt(document.getElementById("offset").value)
                  '''
    # The placeholders receive complete, already-quoted JS string literals.
    search_params = '''
            var search = {search}
            var fields = {fields}
            var limit = parseInt({limit})
            var offset = parseInt({offset})
                  '''
    # Offsets are zero-based, so consecutive pages of 100 start at
    # offset, offset+100, offset+200, ... with no extra +1.
    engine = '''
            google.colab.kernel.invokeFunction('notebook.searching', [], {});

            if (limit >100) {
              var number = limit
              var data = ""
              var promises = []
              var offsetSearch = 0
              var rest = 0

              for (let index = 0; index < Math.floor(limit/100); index++) {
                offsetSearch = offset + 100*index


                promises.push(
                  fetch(`https://api.semanticscholar.org/graph/v1/paper/search?query=${search}&offset=${offsetSearch}&limit=100&fields=${fields}`)
    .then(res=> {return(res.json())})
    .then(res=> {return(res)})
                )
              }
              
              if (limit%100 !== 0) { 
                rest= limit%100
                offsetSearch = offsetSearch+100
                
                console.log(rest)
                console.log(offsetSearch)

                promises.push(
                fetch(`https://api.semanticscholar.org/graph/v1/paper/search?query=${search}&offset=${offsetSearch}&limit=${rest}&fields=${fields}`)
    .then(res=> {return(res.json())})
    .then(res=> {return(res)})
                )}

              await Promise.all(promises).then(data=>{
                google.colab.kernel.invokeFunction('notebook.mergeData', [data], {})
              })
              .catch(err=> { return (google.colab.kernel.invokeFunction('notebook.error', [err], {})) })

            } else {

            await fetch(`https://api.semanticscholar.org/graph/v1/paper/search?query=${search}&offset=${offset}&limit=${limit}&fields=${fields}`)
    .then(res=> {return(res.json())})
    .then(res=> {
      console.log(res)
      console.log("AQUIII")
      return(google.colab.kernel.invokeFunction('notebook.AddListItem', [res], {}))})
    .catch(err=> { return (
      google.colab.kernel.invokeFunction('notebook.error', [err], {})) })
            }
                  '''

    asyncfun = "async function asyncfun()"

    if not use_params:
      return boxs + "<script>" + button + "{" + search_query + engine + "}" + "</script>"

    literals = {}
    for key, default in self._DEFAULTS.items():
      value = getattr(self, key)
      literals[key] = self._js_literal(default if value is None else value)

    return "<script>" + search_params.format(**literals) + asyncfun + "{" + engine + "}" + "asyncfun()" + "</script>"

  def _start(self, *args):
    """Register the JS-to-Python callbacks and display the search UI/script.

    Args:
      *args: the first positional value selects the mode (truthy: search
        immediately with the instance's parameters, falsy: show the form).
    """
    output.register_callback('notebook.searching', self._searching)
    output.register_callback('notebook.AddListItem', self._add_list_item)
    output.register_callback('notebook.mergeData', self._merge_data)
    output.register_callback('notebook.error', self._error)

    use_params = bool(args and args[0])
    display(IPython.display.HTML(self._render_script(use_params)))

  def _error(self,value):
    """Callback: print an error payload reported by the browser or the API."""
    print("ERRO na API SemanticScholar:\n")
    print(value)

  def _searching(self):
    """Callback: show a temporary "searching" message, cleared once results arrive."""
    import sys  # local import: this cell's import list does not include sys
    with output.use_tags('some_outputs'):
      print("\n\nPesquisando...")
      sys.stdout.flush()

  def _merge_data(self, data):
    """Callback for searches split over several requests.

    Args:
      data: list with one decoded JSON response per request. Entries
        without a ``data`` key (for example API error payloads) are
        reported through ``_error`` and skipped.
    """
    output.clear(output_tags='some_outputs')
    self.data_0 = data

    pages = [page for page in data if isinstance(page, dict) and 'data' in page]
    rejected = [page for page in data if not (isinstance(page, dict) and 'data' in page)]

    if pages:
      print(f"Achou {pages[0].get('total')} papers.\n")
      self.data = pd.DataFrame(pages[0]['data'])
      for page in pages[1:]:
        self.merge(pd.DataFrame(page['data']))
    else:
      self.data = pd.DataFrame()

    for page in rejected:
      self._error(page)

    print(f"\nApi devolveu >> {self.data.shape[0]} papers\n" )
    print(self.data.head())

  def merge(self, data):
    """Append the rows of ``data`` to ``self.data``, renumbering the index."""
    self.data = pd.concat([self.data, data], ignore_index=True ) 

  def _add_list_item(self, value):
    """Callback for searches answered by a single request.

    Args:
      value: the decoded JSON response. When it has no ``data`` key it is
        reported through ``_error`` and ``self.data`` is left untouched.
    """
    output.clear(output_tags='some_outputs')

    if not isinstance(value, dict) or 'data' not in value:
      self._error(value)
      return

    print(f"Achou {value.get('total')} papers.\n")

    self.data = pd.DataFrame(value['data'])

    print(f"Api devolveu >> {self.data.shape[0]} papers\n" )

    print(self.data.head())



In [None]:
# Passing keywords makes Search() run the query right away instead of
# showing the form.
Resultados = Search(
    search="Machine Learning+Deep Learning",
    fields="title,abstract,citationCount,isOpenAccess,fieldsOfStudy",
    limit="200",
    offset="0",
)

Achou 656970 papers.


Api devolveu >> 200 papers

                                    paperId  ...       fieldsOfStudy
0  846ff7afb7670d62f88b4a8cc99d306ffb81b075  ...          [Medicine]
1  5dc53e50148b01fe8b9536eb79fa6b1dce924174  ...          [Medicine]
2  7cc2e148d27a7508dd23c4e35eb63cc9b3e6a58f  ...  [Computer Science]
3  59444b096f7c8a561d540102e8b5bfb189edabc6  ...                None
4  eee313380ccb45807ea0afa3c1df86f6b48b8867  ...  [Computer Science]

[5 rows x 6 columns]


In [None]:
# The results are kept in the `data` attribute, which is a pandas DataFrame.
print(Resultados.data.columns)
print(
    Resultados.data
    .sort_values("citationCount", ascending=False)
    .head()
)

Index(['paperId', 'title', 'abstract', 'citationCount', 'isOpenAccess',
       'fieldsOfStudy'],
      dtype='object')
                                     paperId  ...                    fieldsOfStudy
17  a4cec122a08216fe8a3bc19b22e78fbaea096256  ...     [Medicine, Computer Science]
14  46200b99c40e8586c8a0f588488ab6414119fb28  ...               [Computer Science]
18  193edd20cae92c6759c18ce93eeea96afd9528eb  ...     [Computer Science, Medicine]
16  9c9d7247f8c51ec5a02b0d911d1d7b9e8160495d  ...               [Computer Science]
11  3c8a456509e6c0805354bd40a35e3f2dbf8069b1  ...  [Computer Science, Mathematics]

[5 rows x 6 columns]


2. A segunda é através do formulário de busca (searchBox), no qual é possível preencher os campos:

In [None]:
# With no keyword arguments, Search() displays the search form; the query
# runs when the form's button is clicked.
Resultados_2 = Search()

Achou 656969 papers.

Api devolveu >> 10 papers

                                    paperId  ...       fieldsOfStudy
0  846ff7afb7670d62f88b4a8cc99d306ffb81b075  ...          [Medicine]
1  5dc53e50148b01fe8b9536eb79fa6b1dce924174  ...          [Medicine]
2  7cc2e148d27a7508dd23c4e35eb63cc9b3e6a58f  ...  [Computer Science]
3  59444b096f7c8a561d540102e8b5bfb189edabc6  ...                None
4  eee313380ccb45807ea0afa3c1df86f6b48b8867  ...  [Computer Science]

[5 rows x 5 columns]


In [None]:
# Print what the form search stored in the `data` attribute.
print(Resultados_2.data)

                                    paperId  ...       fieldsOfStudy
0  846ff7afb7670d62f88b4a8cc99d306ffb81b075  ...          [Medicine]
1  5dc53e50148b01fe8b9536eb79fa6b1dce924174  ...          [Medicine]
2  7cc2e148d27a7508dd23c4e35eb63cc9b3e6a58f  ...  [Computer Science]
3  59444b096f7c8a561d540102e8b5bfb189edabc6  ...                None
4  eee313380ccb45807ea0afa3c1df86f6b48b8867  ...  [Computer Science]
5  46479bbea7749cb2db35b139206039531327053c  ...  [Computer Science]
6  b69fe5a837277ddbea5215d6bacd3a902e9d11ce  ...          [Medicine]
7  b0bf64ccbd651e8c7bc141d8aabaecff562e93a1  ...  [Computer Science]
8  042ab08ec6782cf217f13175162bfd48f7350114  ...  [Computer Science]
9  03e7832982986159400a8eeab148487ffcfabe56  ...  [Computer Science]

[10 rows x 5 columns]



# **SearchWeb()**

In [None]:
import requests
import json
import multiprocessing as mp
import os
# import numpy as np
import pandas as pd
import re
import ast
from pathlib import Path


from time import sleep, time


def timer(fun):
  """Decorator that prints the wall-clock duration of every call to ``fun``.

  The wrapped function's return value is passed through unchanged. The
  elapsed time is printed only when ``fun`` returns normally; if it raises,
  the exception propagates and nothing is printed.

  Args:
    fun: the callable to time.

  Returns:
    A wrapper that carries ``fun``'s ``__name__`` and ``__doc__``.
  """
  # Imported locally because the cell's import list does not include functools.
  from functools import wraps

  @wraps(fun)  # keep the original name/docstring visible on the wrapper
  def warper(*args, **kwargs):
    start = time()
    d = fun(*args, **kwargs)
    end = time()
    print(f"[{fun.__name__}]>> Demorou {round(end-start,2)}s")
    return d
  return warper



class SearchWeb():
  """Client for the search endpoint used by the semanticscholar.org website.

  Pages of results are requested in parallel through a ``multiprocessing``
  pool. With ``save=False`` the extracted pages are collected in
  ``self.all["Results"]``; with ``save=True`` they are appended to
  ``<Savename>.text`` and converted to ``<Savename>.json`` when a batch ends.

  Args:
    search: query string, sent as ``queryString``.
    poolCPU: number of worker processes in the pool.
    sleeptry: seconds to wait before retrying pages whose request failed.
    save: whether results go to disk instead of ``self.all``.
    **kwargs: ``Savename`` (``saveName`` is accepted as an alias) plus the
      request filters read below (``sort``, ``authors``, ``venues``, ...).
  """

  def __init__(self, search="Machine Learning+Deep Learning", poolCPU = 4, sleeptry=5, save = False, **kwargs):
    self.sleeptry = sleeptry
    self.poolCPU = poolCPU
    # 'Savename' is the historical spelling; 'saveName' is accepted as well
    # so that a differently-capitalised keyword is not silently ignored.
    self.saveName = kwargs.get('Savename', kwargs.get('saveName', "Data"))

    self.badcall = []
    self._start = True
    # True once something has been appended to the checkpoint file after its
    # header, i.e. the next append must be preceded by a separator.
    self._needs_separator = False

    self.saveFile = save
    self._search = search
    self._sort = kwargs.get('sort', "relevance")
    self._authors = kwargs.get('authors', [])
    self._coAuthors = kwargs.get('coAuthors', [])
    self._venues = kwargs.get('venues', ['PloS one', 'AAAI', 'Scientific reports', 'IEEE Access', 'ArXiv', 'Expert Syst. Appl.', 'ICML', 'Neurocomputing', 'Sensors', 'Remote. Sens.'])
    self._yearFilter = kwargs.get('yearFilter', None) # {"min": 2008,"max": 2021}
    self._requireViewablePdf = kwargs.get('requireViewablePdf', False)
    self._publicationTypes = kwargs.get('publicationTypes', ["ClinicalTrial", "CaseReport", "Editorial","Study","Book","News","Review","Conference","LettersAndComments","JournalArticle"])
    self._fieldsOfStudy = kwargs.get('fieldsOfStudy', ["biology","art","business","computer-science","chemistry","economics","engineering","environmental-science","geography","geology","history","materials-science","mathematics","medicine","philosophy","physics","political-science","psychology","sociology"])
    self._useFallbackRankerService = kwargs.get('useFallbackRankerService', False)
    self._useFallbackSearchCluster = kwargs.get('useFallbackSearchCluster', False)
    self._hydrateWithDdb = kwargs.get('hydrateWithDdb', True)
    self._includeTldrs = kwargs.get('includeTldrs', True)
    self._performTitleMatch = kwargs.get('performTitleMatch', True)
    self._includeBadges = kwargs.get('includeBadges', True)
    self._tldrModelVersion = kwargs.get('tldrModelVersion', 'v2.0.0')
    self._getQuerySuggestions = kwargs.get('getQuerySuggestions', False)

    # JSON body sent with every request; "page" is overridden per request.
    self.post = {
    "page": 1,
    "pageSize": 10,
    "queryString": self._search,
    "sort": self._sort,
    "authors": self._authors,
    "coAuthors": self._coAuthors,
    "venues": self._venues,
    "yearFilter": self._yearFilter,
    "requireViewablePdf": self._requireViewablePdf,
    "publicationTypes": self._publicationTypes,
    "externalContentTypes": [],
    "fieldsOfStudy": self._fieldsOfStudy,
    "useFallbackRankerService": self._useFallbackRankerService,
    "useFallbackSearchCluster": self._useFallbackSearchCluster,
    "hydrateWithDdb": self._hydrateWithDdb,
    "includeTldrs": self._includeTldrs,
    "performTitleMatch": self._performTitleMatch,
    "includeBadges": self._includeBadges,
    "tldrModelVersion": self._tldrModelVersion,  # honour the keyword
    "getQuerySuggestions": self._getQuerySuggestions,
    }

  def _paperExtract(self, data):
    """Reduce one raw result dict to the fields this client keeps.

    Args:
      data: one entry of a response's ``results`` list.

    Returns:
      A dict with the selected fields; most missing keys become None.
    """
    # A paper without an author list yields an empty list instead of raising.
    authors = data.get('authors') or []
    p = {
        "authors": [author[0]['name'] for author in authors],
        "id": data.get('id',None),
        "socialLinks": data.get('socialLinks',None),
        "title": data.get('title',{'text':None})['text'],
        "paperAbstract": data.get('paperAbstract',{'text':None})['text'],
        "year": data.get('year',{'text':None})['text'],
        "venue": data.get('venue',{'text':None})['text'],
        "citationContexts":data.get('citationContexts',None),
        "citationStats": data.get('citationStats',None),
        "sources":data.get('sources',None),
        "externalContentStats":data.get('externalContentStats',None),
        "journal":data.get('journal',None),
        "presentationUrls":data.get('presentationUrls',None),
        "links": data.get('links',None),
        "primaryPaperLink": data.get('primaryPaperLink',None),
        "alternatePaperLinks": data.get('alternatePaperLinks',None),
        "entities": [author['name'] for author in data.get('entities',[{'name':None}])],
        "entityRelations": data.get('entityRelations',None),
        "blogs":data.get('blogs',None),
        "videos":data.get('videos',None),
        "githubReferences": data.get('githubReferences',None),
        "scorecardStats": data.get('scorecardStats',None),
        "fieldsOfStudy":data.get('fieldsOfStudy',None),
        "pubDate":data.get('pubDate',None),
        "pubUpdateDate":data.get('pubUpdateDate',None),
        "badges":data.get('badges',None),
        "tldr":data.get('tldr',None)
        }
    return p

  def _query(self, page):
    """POST the search for one ``page``.

    Returns:
      ``[response, page, status_code]``. When the HTTP request itself
      fails, ``response`` is None and the status code is reported as 400.
    """
    url = "https://www.semanticscholar.org/api/1/search"
    post = self.post.copy()
    post["page"] = page
    try:
      res = requests.post(url, json=post, timeout=15)
    except requests.RequestException:
      return [None, page, 400]
    res.encoding = 'utf-8'
    return [res, page, res.status_code]

  def _json(self, res):
    """Decode a response body into a dict."""
    return json.loads(res.text).copy()

  def save(self, name, data):
    """Dump ``data`` to ``./<name>.json``; failures are reported, not raised."""
    try:
      with open(f'./{name}.json', 'w',encoding='UTF-8') as fp:
          json.dump(data, fp)
    except Exception:  # best-effort save
      print("[Save]>> Error to save the data.")

  def load_json(self, path):
    """Load and return the JSON file at ``path``.

    Returns None (after printing a message) when the file cannot be read
    or parsed.
    """
    try:
      with open(f'{path}', 'r', encoding='UTF-8') as fp:
          return json.load(fp)
    except (OSError, ValueError):
      print("[load_json]>> Error to load json file.")

  def _write_header(self):
    """Create or truncate the checkpoint file so it holds only the list opening."""
    with open(f'./{self.saveName}.text', 'w',encoding='UTF-8') as fp:
      fp.write("{\"Results\": [")
    self._needs_separator = False

  def _startFile(self, find):
    """Prepare ``<saveName>.text`` for appending checkpoints.

    Args:
      find: when ``<saveName>.json`` already exists, True reloads its
        results into a fresh checkpoint file; False leaves everything
        untouched.

    Returns:
      False when the JSON file exists and ``find`` is False, True otherwise.
    """
    jsonFile = Path(f"./{self.saveName}.json")

    if jsonFile.is_file():
      if not find:
        return False
      try:
        print(f"[_startFile] >> Loading ./{self.saveName}.json")
        with open(f'./{self.saveName}.json', 'r',encoding='UTF-8') as f:
          data = json.load(f)
        print(f"[Create] >> Creating a ./{self.saveName}.text file to save data.")
        self._write_header()
        self._save(data['Results'])
      except Exception:
        print(f"[_startFile] >> Fail to load ./{self.saveName}.json")
        try:
          self._write_header()
        except OSError:
          print("[Create] >> Fail")
    else:
      try:
        print(f"[Create] >> Creating a ./{self.saveName}.text file to save data.")
        self._write_header()
      except OSError:
        print("[Create] >> Fail")
    return True

  def _save(self, check_point):
    """Append the items of ``check_point`` to the checkpoint file.

    The items are written as the Python repr of the list without its outer
    brackets; ``_endFile`` parses the file back with ``ast.literal_eval``.
    """
    if not check_point:
      return

    text = str(check_point)
    text = re.sub(r'^\[', '', text)
    text = re.sub(r'\]$', '', text)

    with open(f'./{self.saveName}.text', 'a', encoding='utf-8') as fp:
      # Successive checkpoints are elements of one list, so every append
      # after the first must be preceded by a comma.
      if getattr(self, '_needs_separator', False):
        fp.write(', ')
      fp.write(text)
    self._needs_separator = True
    print(f"[Save] >> Saving check_point at current directory, ./{self.saveName}.text")

  def _endFile(self):
    """Close the list in the checkpoint file and convert it to ``<saveName>.json``."""
    try:
      with open(f'./{self.saveName}.text', 'a', encoding='UTF-8') as fp:
        fp.write(']}')

      try:
        with open(f'./{self.saveName}.text', 'r', encoding='UTF-8') as fp:
          text = fp.read()
          # The checkpoint holds Python literals (see _save), not JSON.
          text_dict = ast.literal_eval(text)

        with open(f'./{self.saveName}.json', 'w', encoding='UTF-8') as fp:
          json.dump(text_dict, fp)
        print(f"[Close] >> Closed and save in ./{self.saveName}.json file the data.")
      except Exception:
        print(f"[Close] >> Fail to save the data ./{self.saveName}.json file.")

    except Exception:
      print('[Close] >> Fail')

  @timer
  def _extract(self, pool, data):
    """Decode the successful responses in ``data`` and store their papers.

    Args:
      pool: the multiprocessing pool used for decoding and extraction.
      data: DataFrame with ``Response``, ``Page`` and ``Code`` columns.

    On failure the decoded pages obtained so far are appended to
    ``self.badcall``.
    """
    papers_text = []
    try:
      papers_text = pool.map(self._json, data['Response'].tolist())
      self.papers_text = papers_text

      if self._start and papers_text:
        print('\n ---')
        print(f"Total Results: {papers_text[0]['totalResults']}")
        print(f"Total Pages: {papers_text[0]['totalPages']}")
        print(f"Query Suggestions: {papers_text[0]['querySuggestions']}")
        print('--- \n')
        self.totalPages = papers_text[0]['totalPages']
        self.totalResults = papers_text[0]['totalResults']
        self._start = False

      print("[_extract] >> extracting relevant data.")
      check_point= [{"Page": {"N_Page":page['query']['page'],
                                   "N_Papers":len(page['results']),
                                   "Papers": pool.map(self._paperExtract,
                                                      page['results'])}} for page in papers_text]

      if self.saveFile:
        try:
          self._save(check_point)
        except Exception:
          print("_save >> [Fail] to save.")
          print("_extract>> [Fail], see .badcall to reextract content.")
          self.badcall.append(papers_text)
      else:
        self.all["Results"].extend(check_point)

    except Exception:
      print("_extract>> [Fail], see .badcall to reextract content.")
      self.badcall.append(papers_text)
      print(self.badcall)

  @timer
  def _runtime(self, pool, pages):
    """Fetch every page in ``pages``, retrying the failures until none remain.

    Pages whose status code is not 200 are written to ``./BadCalls.text``
    and requested again after ``self.sleeptry`` seconds. There is no retry
    limit.
    """
    # NOTE(review): totalPages is reset on every call, so once totalResults
    # is known to be smaller than n the clamp below yields an empty page
    # list on the first pass. Intent unclear, behaviour kept as found.
    self.totalPages = 0

    find = False
    close = False

    while True:
      if self.saveFile:
        close = self._startFile(find)

      print('\n')
      print('[_runtime]>> Start searching...')

      if self.totalResults < self.n:
        self.n = self.totalResults
        pages = list(range(self._page, self.totalPages))

      try:
        res = pool.map(self._query, pages)
        resultData = pd.DataFrame(res, columns=["Response", "Page", "Code"])
        succeeded = resultData.query("Code ==200")
        failed = resultData.query("Code !=200")

        if failed.size == 0:
          self._extract(pool, succeeded)
          if self.saveFile and close:
            self._endFile()
            find = True
          break

        find = False

        if succeeded.size != 0:
          self._extract(pool, succeeded)
          if self.saveFile and close:
            find = True
            self._endFile()

        print("Bad call of pages:")
        pages = failed["Page"].values.tolist()
        print(pages)
        try:
          with open("./BadCalls.text", 'w', encoding='UTF-8') as fp:
            fp.write(str(pages))
        except OSError:
          print("Fail to save badcalls")
        print(f"Tentando de novo daqui a {self.sleeptry/60} min...")

        sleep(self.sleeptry)
      except Exception as exc:
        # Wait before retrying so an unexpected error cannot spin the loop.
        # KeyboardInterrupt is not an Exception, so it still stops the run.
        print(f"[_runtime]>> Unexpected error, retrying in {self.sleeptry}s: {exc}")
        sleep(self.sleeptry)
      print("---")

  @timer
  def get(self, n = 10, page = 1, pages = None):
    """Fetch ``n`` papers starting at result page ``page``.

    Args:
      n: number of papers wanted; requested in pages of 10.
      page: first page to request.
      pages: optional explicit list of page numbers to request instead of
        the computed range (only used when ``n`` is greater than 10).
    """
    pages = [] if pages is None else list(pages)
    self._pages = pages
    self.n = n
    self._page = page
    # Sentinel larger than any real result count until the first response
    # reports the actual total.
    self.totalResults = 1000000000000000000000
    self.post["pageSize"] = 10
    self.post["page"] = page
    self.all = {"Results": []}
    print('.post >>')
    print(self.post)
    print("\n")
    print("Searching...")
    print(self.all)

    with mp.Pool(self.poolCPU) as pool:
      if self.n > 10:

        if len(pages) != 0:
          self._pages = pages
        else:
          self._pages = list(range(self._page, (self.n//10)+self._page))
        self._runtime(pool, self._pages)

        if n%10>0:
          # NOTE(review): the remainder is requested with a smaller pageSize
          # but a page number computed for pageSize 10; presumably the
          # server then returns a different slice than intended. Confirm
          # against the endpoint before relying on the last n % 10 papers.
          self._pages = [self.n//10+self._page]
          self.post["pageSize"] = self.n%10

          self._runtime(pool, self._pages)

      else:
        self._pages = [self._page]
        self.post["page"] = self._page
        self.post["pageSize"] = self.n

        self._runtime(pool, self._pages)
      # self._extract(pool, self.datasource)


##### Description #####
# ex. {"params": value} 
#  
##### Params that can pass in SearchWeb().get(params = value): #####
#     {
#     "n": 1000 (how many papers to fetch)
#     "page": 1, (where start search)
#      }
##### Params that can pass in SearchWeb(params = value): #####
# data = '''{
#     "Savename": 'Data'
#     "sleeptry": 3 (seconds)
#     "poolCPU": 4 (number of worker processes in the pool)
#     "save": False
#     "search": "Machine Learning+Deep Learning",  (sent to the API as "queryString")
#     "sort": "total-citations", #influence #"pub-date" #relevance
#     "authors": [],
#     "coAuthors": [],
#     "venues": [
#         "PloS one",
#         "AAAI",
#         "Scientific reports",
#         "IEEE Access",
#         "ArXiv",
#         "Expert Syst. Appl.",
#         "ICML",
#         "Neurocomputing",
#         "Sensors",
#         "Remote. Sens."
#     ],
#     "yearFilter": {
#         "min": 2008,
#         "max": 2021
#     },
#     "requireViewablePdf": True,
#     "publicationTypes": [
#         "ClinicalTrial",
#         "CaseReport",
#         "Editorial",
#         "Study",
#         "Book",
#         "News",
#         "Review",
#         "Conference",
#         "LettersAndComments",
#         "JournalArticle"
#     ],
#     "externalContentTypes": [],
#     "fieldsOfStudy": [
#         "biology",
#         "art",
#         "business",
#         "computer-science",
#         "chemistry",
#         "economics",
#         "engineering",
#         "environmental-science",
#         "geography",
#         "geology",
#         "history",
#         "materials-science",
#         "mathematics",
#         "medicine",
#         "philosophy",
#         "physics",
#         "political-science",
#         "psychology",
#         "sociology"
#     ],
#     "useFallbackRankerService": False,
#     "useFallbackSearchCluster": False,
#     "hydrateWithDdb": True,
#     "includeTldrs": True,
#     "performTitleMatch": True,
#     "includeBadges": True,
#     "tldrModelVersion": "v2.0.0",
#     "getQuerySuggestions": False
# }
# '''
# '''
# Obs. Params that have a list can be a empty list
# Ex. {"venues": []}

### Uncomment the block below to run this file as a script
# if __name__ == '__main__':
#   SearchWeb(
#     search= "decision making+optimization+artificial intelligence",
#     sort= "influence",
#     Savename = "influence_data",
#     save=True,
#     poolCPU = 4,
#     sleeptry = 3.5*60,
#     venues = [],
#     publicationTypes = ['JournalArticle'],
#     fieldsOfStudy = [],
#     getQuerySuggestions = True
#     ).get(20000, page = 1)


In [None]:
# The output-file keyword read by SearchWeb is spelled `Savename`; the
# previous `saveName=` spelling was not the key the constructor looks up.
from_Webpage = SearchWeb(
    search="Machine Learning+Deep Learning",
    sort="total-citations",
    save=True,
    Savename="Data",
)

In [None]:
# Fetches 100 papers starting at page 2, using the parameters that were
# passed to SearchWeb() and stored in its `.post` dict.
from_Webpage.get(100, page = 2)

.post >>
{'page': 2, 'pageSize': 10, 'queryString': 'Machine Learning+Deep Learning', 'sort': 'total-citations', 'authors': [], 'coAuthors': [], 'venues': ['PloS one', 'AAAI', 'Scientific reports', 'IEEE Access', 'ArXiv', 'Expert Syst. Appl.', 'ICML', 'Neurocomputing', 'Sensors', 'Remote. Sens.'], 'yearFilter': None, 'requireViewablePdf': False, 'publicationTypes': ['ClinicalTrial', 'CaseReport', 'Editorial', 'Study', 'Book', 'News', 'Review', 'Conference', 'LettersAndComments', 'JournalArticle'], 'externalContentTypes': [], 'fieldsOfStudy': ['biology', 'art', 'business', 'computer-science', 'chemistry', 'economics', 'engineering', 'environmental-science', 'geography', 'geology', 'history', 'materials-science', 'mathematics', 'medicine', 'philosophy', 'physics', 'political-science', 'psychology', 'sociology'], 'useFallbackRankerService': False, 'useFallbackSearchCluster': False, 'hydrateWithDdb': True, 'includeTldrs': True, 'performTitleMatch': True, 'includeBadges': True, 'tldrModelVe

In [None]:
# Everything that is stored for a single paper (a dict).

json_file = SearchWeb().load_json("Data.json")

json_file["Results"][0]['Page']["Papers"][0]

{'alternatePaperLinks': [],
 'authors': ['E. Samaniego',
  'C. Anitescu',
  'S. Goswami',
  'Vien Minh Nguyen-Thanh',
  'Hongwei Guo',
  'Khader M. Hamdia',
  'T. Rabczuk',
  'X. Zhuang'],
 'badges': [{'id': 'OPEN_ACCESS'}],
 'blogs': [],
 'citationContexts': [],
 'citationStats': {'citationAcceleration': 0.703125,
  'citationVelocity': 58.333333333333336,
  'citedByBuckets': [{'count': 2, 'endKey': 2019, 'startKey': 2019},
   {'count': 64, 'endKey': 2020, 'startKey': 2020},
   {'count': 109, 'endKey': 2021, 'startKey': 2021}],
  'estNumCitations': 84.08133939843164,
  'firstCitationVelocityYear': 2019,
  'keyCitationRate': 0.0,
  'keyCitedByBuckets': [],
  'lastCitationVelocityYear': 2021,
  'numCitations': 175,
  'numKeyCitations': 0,
  'numKeyReferences': 2,
  'numReferences': 48,
  'numViewableReferences': 48},
 'entities': ['Machine learning',
  'Computational mechanics',
  'Finite element method',
  'Isogeometric analysis',
  'Computation',
  'Loss function',
  'Collocation',
  '