## Importing Required Packages

In [2]:
import os

import dimcli
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dimcli.shortcuts import dslquery_json as dslquery

In [5]:
# Credentials are read from the environment so they never have to be typed
# into (and saved with) the notebook. When the variables are not set, the
# redacted placeholders are kept so the cell still runs; replace them or
# export DIMENSIONS_USERNAME / DIMENSIONS_PASSWORD before logging in.
USERNAME = os.environ.get('DIMENSIONS_USERNAME', '*****')
PASSWORD = os.environ.get('DIMENSIONS_PASSWORD', '*****')
ENDPOINT = 'app.dimensions.ai'

In [6]:
dimcli.login(username= USERNAME, password=PASSWORD)

DimCli v0.5.8.1 - Succesfully connected to <https://app.dimensions.ai> (method: manual login)


In [7]:
dsl = dimcli.Dsl()

## Exploring Dimensions API

### All Spanish publications
##### Using Magic Commands available for easy exploration

In [6]:
%%dsl
search publications 
where  research_orgs.country_name="Spain"
and type = "article"
return publications[id] 

Returned Publications: 20 (total = 1019271)




There are a total of 1,019,271 articles in the Dimensions database linked to at least one research organization in Spain.

### All Spanish publications in the last 10 years

In [7]:
%%dsl
search publications 
where  research_orgs.country_name="Spain"
and type = "article"
and year in [2009:2019] 
return publications[id] 

Returned Publications: 20 (total = 640669)




Of those, 640,669 articles were published from 2009 to 2019.

### Getting Top 25 Spanish Institutions by number of publications in the last 10 years

In [15]:
%%dsl
search publications
where type="article"
and research_orgs.country_name = "Spain"
and year in [2009:2019] 
return research_orgs limit 25

Returned Research_orgs: 25




#### Converting the results of the previous query into a pandas dataframe

In [16]:
# Run the "top organizations" query explicitly instead of reading IPython's
# `_` (the previous cell's output): `_` only holds the right value when this
# cell is executed immediately after the %%dsl cell, so it breaks on
# out-of-order runs and on Restart & Run All.
TOP_ORGS_QUERY = """search publications
where type="article"
and research_orgs.country_name = "Spain"
and year in [2009:2019]
return research_orgs limit 25"""

data = dslquery(TOP_ORGS_QUERY).get('research_orgs', [])
df_toporgs = pd.DataFrame.from_dict(data)
df_toporgs

Unnamed: 0,acronym,count,country_name,id,name
0,UB,36864,Spain,grid.5841.8,University of Barcelona
1,UAB,35016,Spain,grid.7080.f,Autonomous University of Barcelona
2,,31684,Spain,grid.4795.f,Complutense University of Madrid
3,UV,26951,Spain,grid.5338.d,University of Valencia
4,UAM,23576,Spain,grid.5515.4,Autonomous University of Madrid
5,UGR,23042,Spain,grid.4489.1,University of Granada
6,UPV,21731,Spain,grid.11480.3c,University of the Basque Country
7,,21104,Spain,grid.413448.e,Institute of Health Carlos III
8,,18441,Spain,grid.9224.d,University of Seville
9,,16976,Spain,grid.11205.37,University of Zaragoza


#### Getting the Ids of the Top organizations and creating a list with them

In [10]:
list_orgs = df_toporgs['id'].tolist()

#### Create functions to get the ids of all the publications from the last 10 years for the Top 25 Spanish organizations

We pass the list of organization ids created above into the query and page through the results to get all the publications from 2009 onwards.

In [11]:
def getpubs(year,list_orgs,limit=1000, skip=0):
    """Build the DSL query text for one page of article ids.

    Args:
        year: Value placed inside the ``year in [...]`` filter.
        list_orgs: Iterable of organization ids; each one is wrapped in
            double quotes and the ids are comma-joined inside
            ``research_orgs.id in [...]``.
        limit: Page size written after ``limit``.
        skip: Offset written after ``skip``.

    Returns:
        The query as a multi-line string (returns ``id`` and ``year``).
    """
    quoted_ids = ",".join('"{}"'.format(org_id) for org_id in list_orgs)
    # Spelled out line by line so the whitespace of the query is explicit.
    template = "\n".join([
        'search publications',
        '    where  year in [{}] ',
        '    and type="article" ',
        '    and research_orgs.id  in [{}]',
        '    return publications[id+year]',
        '    limit {} skip {}',
        '    ',
    ])
    return template.format(year, quoted_ids, limit, skip)

In [12]:
def getallpubsfrominst(year,gridid):
    """Download every article record for the given organizations and year.

    Pages through the results 1000 records at a time and stops at the first
    page that is not full.

    Args:
        year: Publication year passed to ``getpubs``.
        gridid: A single organization id (string) or an iterable of ids.
            Previously this argument was ignored and the global ``list_orgs``
            was queried instead.

    Returns:
        A list with the ``publications`` entries of all pages, in order.
    """
    page_size = 1000
    # Accept one id as well as a collection; a bare string would otherwise be
    # iterated character by character when the query is built.
    org_ids = [gridid] if isinstance(gridid, str) else list(gridid)

    total_pubs = []
    skip = 0
    while True:
        query = getpubs(year, org_ids, limit=page_size, skip=skip)
        pubs = dslquery(query).get('publications', [])
        total_pubs += pubs
        # Anything other than a full page means there is nothing left to fetch.
        if len(pubs) != page_size:
            break
        skip += page_size

    return total_pubs

The Dimensions API does not allow skipping past 50,000 records in a single query. To stay under that limit we call the function once per year and store each year's results in its own dataframe.

In [17]:
# One download per year, driven by a loop instead of eleven copy-pasted lines.
PUB_YEARS = range(2009, 2020)
pubs_by_year = {
    year: pd.DataFrame(getallpubsfrominst(year, list_orgs))
    for year in PUB_YEARS
}

# Keep the per-year names that the following cells refer to.
(pubs2009df, pubs2010df, pubs2011df, pubs2012df, pubs2013df, pubs2014df,
 pubs2015df, pubs2016df, pubs2017df, pubs2018df, pubs2019df) = [
    pubs_by_year[year] for year in PUB_YEARS
]

Next, we remove the leading characters (`pub.`) from each publication id so that we can loop through the ids later when scraping the Dimensions website.

In [18]:
df_list = [pubs2009df,pubs2010df,pubs2011df,pubs2012df,pubs2013df,pubs2014df,pubs2015df,pubs2016df,pubs2017df,pubs2018df,pubs2019df]

In [19]:
# Remove the leading "pub." from every publication id. The pattern is
# anchored, so re-running this cell is a no-op; a positional slice would
# chop four more characters off the ids on every execution.
for pubs_df in df_list:
    pubs_df['id'] = pubs_df['id'].str.replace(r'^pub\.', '', regex=True)

In [20]:
pubs2015df.head()

Unnamed: 0,id,year
0,1072986379,2015
1,1058239462,2015
2,1055110640,2015
3,1055109576,2015
4,1042186661,2015


Now we create a list of the publication ids and convert them to int, since that is what our web scraper takes as input. We wrap this in a function so the same operation can be applied to every dataframe.

In [8]:
def prepare_list_of_pubid (df):
    """Return the values of ``df['id']`` as a list of Python ints.

    Args:
        df: DataFrame with an ``id`` column whose values ``int()`` accepts.

    Returns:
        A new list with one int per row, in row order.
    """
    return [int(raw_id) for raw_id in df['id'].tolist()]

In [22]:
test_function = prepare_list_of_pubid(pubs2009df)

Let's check the first element of the list and its length.

In [23]:
test_function[0]

1005992287

In [24]:
len(test_function)

22308

## Exploring Dimension's web

In [9]:
## Connecting test to dimensions web
url = "https://app.dimensions.ai/details/publication/pub.1111348256"
response = requests.get(url)
print(url)

https://app.dimensions.ai/details/publication/pub.1111348256


In [64]:
# Name the parser explicitly: without it BeautifulSoup emits a warning and
# picks whichever parser happens to be installed.
soup = BeautifulSoup(response.text, "html.parser")

### Getting publications abstract from url

In [65]:
abstract = soup.find_all('meta', property='og:description')
print(abstract)

[<meta content="Permafrost warming has the potential to amplify global climate change, because when frozen sediments thaw it unlocks soil organic carbon. Yet to date, no globally consistent assessment of permafrost temperature change has been compiled. Here we use a global data set of permafrost temperature time series from the Global Terrestrial Network for Permafrost to evaluate temperature change across permafrost regions for the period since the International Polar Year (2007-2009). During the reference decade between 2007 and 2016, ground temperature near the depth of zero annual amplitude in the continuous permafrost zone increased by 0.39 ± 0.15 °C. Over the same period, discontinuous permafrost warmed by 0.20 ± 0.10 °C. Permafrost in mountains warmed by 0.19 ± 0.05 °C and in Antarctica by 0.37 ± 0.10 °C. Globally, permafrost temperature increased by 0.29 ± 0.12 °C. The observed trend follows the Arctic amplification of air temperature increase in the Northern Hemisphere. In the

### Wrapping our web scraper in a function that loops through our publication ids and gets their abstracts

In [10]:
def get_abstract (publications_list, timeout=30):
    """Fetch the ``og:description`` meta tag of each publication page.

    Args:
        publications_list: Iterable of publication ids; each one is inserted
            after ``pub.`` in the details-page URL.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``[publication_id, tags]`` pairs in input order, where
        ``tags`` is the result of ``find_all('meta', property='og:description')``
        for that page. If a request fails, ``tags`` is an empty list so that
        one network error does not discard a long-running scrape.
    """
    pub_list = []
    # A single session reuses the connection across the many requests.
    with requests.Session() as session:
        for pub_id in publications_list:
            url = "https://app.dimensions.ai/details/publication/pub.{}".format(pub_id)
            try:
                r = session.get(url, timeout=timeout)
            except requests.RequestException as err:
                # Report the failure and keep going with the remaining ids.
                print("Could not fetch {}: {}".format(url, err))
                pub_list.append([pub_id, []])
                continue
            # Explicit parser: avoids the bs4 warning and environment-dependent parsing.
            soup = BeautifulSoup(r.content, "html.parser")
            abstract = soup.find_all('meta', property='og:description')
            pub_list.append([pub_id, abstract])
    return pub_list

#### Test our function

In [11]:
test_list = [1111348256, 1111569301, 1111917329, 1113874979, 1112214268]

In [12]:
test_result = get_abstract(test_list)

In [13]:
test_result

[[1111348256,
  [<meta content="Permafrost warming has the potential to amplify global climate change, because when frozen sediments thaw it unlocks soil organic carbon. Yet to date, no globally consistent assessment of permafrost temperature change has been compiled. Here we use a global data set of permafrost temperature time series from the Global Terrestrial Network for Permafrost to evaluate temperature change across permafrost regions for the period since the International Polar Year (2007-2009). During the reference decade between 2007 and 2016, ground temperature near the depth of zero annual amplitude in the continuous permafrost zone increased by 0.39 ± 0.15 °C. Over the same period, discontinuous permafrost warmed by 0.20 ± 0.10 °C. Permafrost in mountains warmed by 0.19 ± 0.05 °C and in Antarctica by 0.37 ± 0.10 °C. Globally, permafrost temperature increased by 0.29 ± 0.12 °C. The observed trend follows the Arctic amplification of air temperature increase in the Northern He

#### Run it for the whole list of 2019 publication ids and store the results in a dataframe

In [31]:
# Build the 2019 id list here: no earlier cell in this notebook defines
# `publs_list_2019`, so the scrape would fail on a fresh kernel.
publs_list_2019 = prepare_list_of_pubid(pubs2019df)
publs_abstracts_2019 = get_abstract(publs_list_2019)

#### Creating pandas dataframe from function results and saving it to a csv file

In [32]:
df_publ_abstracts_2019 = pd.DataFrame(publs_abstracts_2019) 

In [33]:
df_publ_abstracts_2019.shape

(42520, 2)

In [34]:
df_publ_abstracts_2019.head(10)

Unnamed: 0,0,1
0,1117506301,"[<meta content=""Four types of calcined MCM-41 ..."
1,1117603937,"[<meta content=""First magnetic characterizatio..."
2,1117505850,"[<meta content=""Metal oxide nanoparticles of d..."
3,1120933764,"[<meta content=""The objective of this study is..."
4,1117193292,"[<meta content=""A long-term multi-parameter sk..."
5,1121801826,"[<meta content=""The objective of this study wa..."
6,1121017012,"[<meta content=""Research has shown that athlet..."
7,1117155867,"[<meta content=""Due to the anatomical continui..."
8,1120908992,"[<meta content=""Focused electron beam induced ..."
9,1113949992,"[<meta content=""This paper aims at describing ..."


In [35]:
## naming data frame columns
df_publ_abstracts_2019.columns = ['Publication Id','Publication Abstract']

In [36]:
df_publ_abstracts_2019.head()

Unnamed: 0,Publication Id,Publication Abstract
0,1117506301,"[<meta content=""Four types of calcined MCM-41 ..."
1,1117603937,"[<meta content=""First magnetic characterizatio..."
2,1117505850,"[<meta content=""Metal oxide nanoparticles of d..."
3,1120933764,"[<meta content=""The objective of this study is..."
4,1117193292,"[<meta content=""A long-term multi-parameter sk..."


In [44]:
df_publ_abstracts_2019.to_csv("./papers_abstracts.csv", sep=',',index=False)

#### Create a function to easily run this for the rest of the years and store the results

We will use the two functions created previously (`prepare_list_of_pubid` and `get_abstract`) to get the needed data and carry out the required transformations.

In [18]:
def publ_abstracts_to_df (df, csv_path="./papers_abstracts.csv"):
    """Scrape the abstracts for the ids in ``df`` and append them to a CSV.

    Args:
        df: DataFrame with an ``id`` column of publication ids.
        csv_path: File the rows are appended to. Defaults to the file the
            notebook uses for all years.

    Returns:
        None. The rows are written to ``csv_path``.
    """
    pubid_list = prepare_list_of_pubid(df)
    publs_abstracts = get_abstract(pubid_list)
    # Passing the column names here also works when nothing was scraped.
    df_abstracts = pd.DataFrame(
        publs_abstracts, columns=['Publication Id','Publication Abstract']
    )
    # Write the header only when starting a new file; repeating it on every
    # append would leave header lines in the middle of the data.
    write_header = (not os.path.exists(csv_path)) or os.path.getsize(csv_path) == 0
    df_abstracts.to_csv(csv_path, sep=',', mode='a', header=write_header, index=False)

We run the function for every year (the cell below runs it for the 2009 papers). This process takes several hours to complete.

In [None]:
publ_abstracts_to_df(pubs2009df)

### Final preparation and cleansing of our dataset

First we need to merge the resulting csv file containing the abstracts with the per-year dataframes, so that each paper gets its publication year and we can build a timeline of how the topics evolve.

We first concatenate all dataframes containing the paper id and the year of publication.

In [None]:
alldfs = [pubs2009df,pubs2010df,pubs2011df,pubs2012df,pubs2013df,pubs2014df,pubs2015df
          ,pubs2016df,pubs2017df,pubs2018df,pubs2019df] 
new_df = pd.concat(alldfs)

Now we load our csv file containing the abstracts into a pandas dataframe and merge it to the previous dataframe created.

In [None]:
df_abstracts = pd.read_csv("./papers_abstracts.csv")

In [None]:
df_abstract_merged = pd.merge(df_abstracts.astype(str), new_df.astype(str)
                          , left_on = 'Publication Id', right_on = 'id', how = 'left')

Dropping id column created.

In [None]:
df_abstracts_years = df_abstract_merged.drop(columns=['id'])

Removing characters at beginning and end of abstract element coming with html tag ( i.e brackets, '<meta content='..)

In [None]:
# Each stored abstract is the text of a one-element tag list; it appears to
# have the form '[<meta content="..." property="og:description"/>]'
# (NOTE(review): closing part inferred from the former [16:-30] slice, confirm
# against the CSV). Unwrap it with an anchored pattern instead of fixed
# offsets so that re-running the cell, or a value that is not wrapped, does
# not lose characters. (?s) lets the abstract span several lines.
META_WRAPPER = r'(?s)^\[<meta content="(.*)" property="og:description"\s*/?>\]$'
df_abstracts_years['Publication Abstract'] = (
    df_abstracts_years['Publication Abstract']
    .str.replace(META_WRAPPER, r'\1', regex=True)
)

Now we are going to detect the language of each abstract so that we keep only the ones written in English for our model. We will do this with the `langdetect` package.
Use the command line to install the package like this:

*pip install langdetect*

In [None]:
from langdetect import detect

This package will help us identify the language of each abstract. We will create a new column with the detected language and then filter the dataframe to keep only the papers whose abstract is written in English.

In [None]:
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed makes the
# detected languages reproducible between runs.
DetectorFactory.seed = 0


def detect_language(text):
    """Return the language code detected for ``text``, or None.

    None is returned for missing or blank values and for text in which
    langdetect finds nothing to work with, instead of raising and aborting
    the whole column.
    """
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        return detect(text)
    except LangDetectException:
        return None


df_abstracts_years['language'] = [
    detect_language(abstract_text)
    for abstract_text in df_abstracts_years['Publication Abstract']
]

In [None]:
english = df_abstracts_years[df_abstracts_years['language']=='en']

In [None]:
df_abstr_year_paper = english.drop(columns=['language'])

Our dataset is ready, so we save it to a csv file.

In [None]:
df_abstr_year_paper.to_csv("./papers_abstracts_published_year.csv", sep=',', index=False )