# Importing Required Packages

In [23]:
import requests
import pandas as pd
import dimcli
from dimcli.shortcuts import dslquery_json as dslquery
from dimcli.shortcuts import dslquery
from bs4 import BeautifulSoup

In [2]:
USERNAME = 'alberto.corbella@frontiersin.org'
PASSWORD = 'Messias10'
ENDPOINT = 'app.dimensions.ai'  # eg no need to include 'https://'

In [3]:
dimcli.login(username= USERNAME, password=PASSWORD)

DimCli v0.5.8.1 - Succesfully connected to <https://app.dimensions.ai> (method: manual login)


In [6]:
dsl = dimcli.Dsl()

# Exploring Dimensions API 

### All Spanish publications in Spain
##### Using Magic Commands available for easy exploration

In [5]:
%%dsl
search publications 
where  research_orgs.country_name="Spain"
and type = "article"
return publications[id] 

Returned Publications: 20 (total = 1010312)




### Getting no of Publications in last 10 years

In [7]:
%%dsl
search publications 
where  research_orgs.country_name="Spain"
and type = "article"
and year in [2009:2019] 
return publications[id] 

Returned Publications: 20 (total = 633502)




### Getting Top Spanish Institutions by No of Publications in last 10 years

In [10]:
%%dsl
search publications
where type="article"
and research_orgs.country_name = "Spain"
and year in [2009:2019] 
return research_orgs limit 5

Returned Research_orgs: 5




##### Converting results from previous query in a pandas data frame

In [11]:
data = _['research_orgs']
df_orgs = pd.DataFrame.from_dict(data)
print(df_orgs)

  acronym  count country_name           id                                name
0      UB  36510        Spain  grid.5841.8             University of Barcelona
1     UAB  34676        Spain  grid.7080.f  Autonomous University of Barcelona
2     NaN  31426        Spain  grid.4795.f    Complutense University of Madrid
3      UV  26708        Spain  grid.5338.d              University of Valencia
4     UAM  23378        Spain  grid.5515.4     Autonomous University of Madrid


### Getting sample of data out of the Top Spanish Organizations by No of Publications

In this step, due to the high number of publications we would need to loop thru when carrying out the web scrapping, we will get a sample of the data based on the Top Institutions by the number of publications in the last year. Also we are retrieving the year of publication in order to be able to see the evolution later on of the different topics in this last 10 years.

We know from previous steps that 633,502 publications are registered in database to be related to Spanish Institutions. We wil be looking at the top 3 Institutions to sample the data and we will filter further to get those publications that have been cited more than 5 times so we can identify those with certain relevance in the period.

In [17]:
%%dslloop
search publications
where 
    (
        research_orgs = "grid.5841.8" 
        or
        research_orgs = "grid.7080.f" 
        or
        research_orgs = "grid.4795.f" 
    )
    and year in [2009:2019] 
    and type="article" 
    and research_orgs.country_name = "Spain"
    and times_cited > 5
    
return publications[id+year] 

1000 / 48139
2000 / 48139
3000 / 48139
4000 / 48139
5000 / 48139
6000 / 48139
7000 / 48139
8000 / 48139
9000 / 48139
10000 / 48139
11000 / 48139
12000 / 48139
13000 / 48139
14000 / 48139
15000 / 48139
16000 / 48139
17000 / 48139
18000 / 48139
19000 / 48139
20000 / 48139
21000 / 48139
22000 / 48139
23000 / 48139
24000 / 48139
25000 / 48139
26000 / 48139
27000 / 48139
28000 / 48139
29000 / 48139
30000 / 48139
31000 / 48139
32000 / 48139
33000 / 48139
34000 / 48139
35000 / 48139
36000 / 48139
37000 / 48139
38000 / 48139
39000 / 48139
40000 / 48139
41000 / 48139
42000 / 48139
43000 / 48139
44000 / 48139
45000 / 48139
46000 / 48139
47000 / 48139
48000 / 48139
48139 / 48139


<dimcli.Result object #139639995605400. Dict keys: '_stats', 'publications'>

We get 48,139 publications. Now we convert this to a pandas dataframe

In [18]:
results = dsl_last_results.as_dataframe()

In [19]:
results.shape

(48139, 2)

In [20]:
results.head()

Unnamed: 0,id,year
0,pub.1111348256,2019
1,pub.1111569301,2019
2,pub.1111917329,2019
3,pub.1113874979,2019
4,pub.1112214268,2019


Now we get rid of the first characters in the id of the publication so we can further loop thru this Id's when scrapping Dimension's web

In [21]:
results['id'] = results['id'].str[4:]

In [22]:
results.head()

Unnamed: 0,id,year
0,1111348256,2019
1,1111569301,2019
2,1111917329,2019
3,1113874979,2019
4,1112214268,2019


Now we store our results in a csv file

In [88]:
results.to_csv("./publications_list.csv", sep=',',index=False)

In [35]:
df_pub_year = pd.read_csv("./publications_list.csv")

# Exploring Dimension's web

In [25]:
## Connecting test to dimensions web
url = "https://app.dimensions.ai/details/publication/pub.1111348256"
response = requests.get(url)
print(url)

https://app.dimensions.ai/details/publication/pub.1111348256


In [26]:
soup = BeautifulSoup(response.text)
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="IE=EDGE" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="10.1038/s41467-018-08240-4" name="citation_doi"/>
<meta content="30651568" name="citation_pmid"/>
<meta content="Permafrost is warming at a global scale" name="citation_title"/>
<title> Permafrost is warming at a global scale - Dimensions </title>
<link href="https://app.dimensions.ai/static/img/favicon/apple-touch-icon.png?_cbt=3725547" rel="apple-touch-icon" sizes="180x180"/>
<link href="https://app.dimensions.ai/static/img/favicon/favicon-32x32.png?_cbt=3725547" rel="icon" sizes="32x32" type="image/png"/>
<link href="https://app.dimensions.ai/static/img/favicon/favicon-16x16.png?_cbt=3725547" rel="icon" sizes="16x16" type="image/png"/>
<link href="https://app.dimensions.ai/static/img/favicon/manifest.json?_cbt=3725547" rel="manifest"/>
<link color="#5bbad5" href="https://app.dimensi

### Getting publications abstract from url

In [27]:
abstract = soup.find_all('meta', property='og:description')
print(abstract)

[<meta content="Permafrost warming has the potential to amplify global climate change, because when frozen sediments thaw it unlocks soil organic carbon. Yet to date, no globally consistent assessment of permafrost temperature change has been compiled. Here we use a global data set of permafrost temperature time series from the Global Terrestrial Network for Permafrost to evaluate temperature change across permafrost regions for the period since the International Polar Year (2007-2009). During the reference decade between 2007 and 2016, ground temperature near the depth of zero annual amplitude in the continuous permafrost zone increased by 0.39 ± 0.15 °C. Over the same period, discontinuous permafrost warmed by 0.20 ± 0.10 °C. Permafrost in mountains warmed by 0.19 ± 0.05 °C and in Antarctica by 0.37 ± 0.10 °C. Globally, permafrost temperature increased by 0.29 ± 0.12 °C. The observed trend follows the Arctic amplification of air temperature increase in the Northern Hemisphere. In the

### Preparing list of publications to loop thru in the web scrapper

In [45]:
df_Pub_Ids_Year = pd.read_csv("./publications_list.csv")
mat = df_Pub_Ids_Year[df_Pub_Ids_Year.columns[0]].as_matrix()
publ_list = mat.tolist()
print(publ_list)

[1111348256, 1111569301, 1111917329, 1113874979, 1112214268, 1112602632, 1111775082, 1111767790, 1063973185, 1063973259, 1103467249, 1103925405, 1090396983, 1110452722, 1120524430, 1103762517, 1113303739, 1116135094, 1112847699, 1115511841, 1104351224, 1118009774, 1118040996, 1090915095, 1063973285, 1117410421, 1107744411, 1114081462, 1110483060, 1113600536, 1111372703, 1110580080, 1110712497, 1093058201, 1105440938, 1101148600, 1101718699, 1113308043, 1112312775, 1112901595, 1116663132, 1112590850, 1100233019, 1014806058, 1104096569, 1090927280, 1114413217, 1106149849, 1103241605, 1111594265, 1110858207, 1116675496, 1112394197, 1112542942, 1111267229, 1113879887, 1104249871, 1104412512, 1107185750, 1101780840, 1114025611, 1112851447, 1112248335, 1114505633, 1101754309, 1113811354, 1113858742, 1114204269, 1104327250, 1112203438, 1109828919, 1113860296, 1113878351, 1112605107, 1112939004, 1111626819, 1116018387, 1110097955, 1100481613, 1099641295, 1111903486, 1113301496, 1107453522, 111

  


### Writing our web scrapper in a function to loop thru our Publications Ids and get their abstracts

In [46]:
def get_abstract (publications_list):
    pub_list=[]
    for i in publications_list:
        url = ("https://app.dimensions.ai/details/publication/pub.{}".format(i))
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        abstract = soup.find_all('meta', property='og:description')
        pub_list.append([i,abstract])
    return pub_list 
        

#### Test our function

In [47]:
test_list = [1111348256, 1111569301, 1111917329, 1113874979, 1112214268]
test_result = get_abstract(test_list)
test_result[3]

[1113874979,
 [<meta content="Polyglutamine (polyQ) tracts are regions of low sequence complexity frequently found in transcription factors. Tract length often correlates with transcriptional activity and expansion beyond specific thresholds in certain human proteins is the cause of polyQ disorders. To study the structural basis of the association between tract length, transcriptional activity and disease, we addressed how the conformation of the polyQ tract of the androgen receptor, associated with spinobulbar muscular atrophy (SBMA), depends on its length. Here we report that this sequence folds into a helical structure stabilized by unconventional hydrogen bonds between glutamine side chains and main chain carbonyl groups, and that its helicity directly correlates with tract length. These unusual hydrogen bonds are bifurcate with the conventional hydrogen bonds stabilizing α-helices. Our findings suggest a plausible rationale for the association between polyQ tract length and androg

#### Run for all our list of Publications Ids

In [50]:
import time

start = time.time()
pubs_abstracts = get_abstract(publ_list)
end = time.time()

elapsed = end - start

In [51]:
elapsed

34790.74991989136

#### Creating pandas dataframe from function results and saving it to a csv file

In [54]:
df_publ_abstracts = pd.DataFrame(pubs_abstracts) 

In [55]:
df_publ_abstracts.shape

(43727, 2)

In [57]:
df_publ_abstracts.head(10)

Unnamed: 0,0,1
0,1111348256,"[<meta content=""Permafrost warming has the pot..."
1,1111569301,"[<meta content=""Choroidal neovascularization (..."
2,1111917329,"[<meta content=""BACKGROUND: Age has been tradi..."
3,1113874979,"[<meta content=""Polyglutamine (polyQ) tracts a..."
4,1112214268,"[<meta content=""Background: The aim of this tr..."
5,1112602632,"[<meta content=""In Parkinson's disease (PD) th..."
6,1111775082,"[<meta content=""Group 15 elements in zero oxid..."
7,1111767790,"[<meta content=""Rapid progress in the developm..."
8,1063973185,"[<meta content=""Objective: To estimate the pre..."
9,1063973259,"[<meta content=""Objective: To examine ADHD sym..."


In [58]:
## naming data frame columns
df_publ_abstracts.columns = ['Publication Id','Publication Abstract']

In [59]:
df_publ_abstracts.head()

Unnamed: 0,Publication Id,Publication Abstract
0,1111348256,"[<meta content=""Permafrost warming has the pot..."
1,1111569301,"[<meta content=""Choroidal neovascularization (..."
2,1111917329,"[<meta content=""BACKGROUND: Age has been tradi..."
3,1113874979,"[<meta content=""Polyglutamine (polyQ) tracts a..."
4,1112214268,"[<meta content=""Background: The aim of this tr..."


In [None]:
df_pub_year = pd.read_csv("./publications_list.csv")