# Import libraries and dependencies

In [1]:
from selenium import webdriver
import time
import pandas as pd
import json  
from pathlib import Path
import requests
import numpy as np
import time
# import langid

In [2]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.service import Service

# Download all researchers from Research Portal

This step was executed in the terminal to download all the resources locally, which avoids requesting the same pages repeatedly and being denied access to the Research Portal.

*Further information about this procedure can be found in the README.txt.*

In [3]:
# Prefix of every researcher profile URL; the numeric researcher ID is appended to it.
baseURL = "https://researchportal.uc3m.es/display/inv"
# Text of the portal's "not found" page.
# NOTE(review): not referenced by any of the cells visible here - confirm it is still needed.
not_found_txt = "Individual not found"
# IDs (as strings) of the researcher pages that were downloaded successfully.
valid_ids = list()

In [4]:
# Download every researcher page in the ID range and keep a local HTML copy.
# `baseURL` and `valid_ids` come from the configuration cell above.
for idx in range(49040, 49070):
    idx_str = str(idx)
    try:
        # A timeout is required: without it a stalled connection blocks the crawl forever.
        portal_page = requests.get(baseURL + idx_str, timeout=30)
    except requests.RequestException as err:
        # One unreachable page must not abort the whole crawl; report it and move on.
        print('Request failed for ' + idx_str + ': ' + str(err))
        time.sleep(1)
        continue
    if portal_page.ok:
        valid_ids.append(idx_str)
        # Explicit UTF-8 so accented characters are written the same way on every platform.
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/display/inv' + idx_str + '.html', 'w', encoding='utf-8') as fout:
            fout.write(portal_page.text)
    # '\r' with end='' keeps the progress counter on a single, overwritten line.
    print('Researchers so far: ' + str(len(valid_ids)) + ' (' + idx_str + ')\r', end='')
    # Throttle to one request per second to avoid being blocked by the portal.
    time.sleep(1)

Researchers so far: 11 (49069)

In [5]:
# Persist the downloaded researcher IDs, one per line.
Path('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/all_inv.txt').write_text('\n'.join(valid_ids))

In [6]:
#valid_ids

# Loading the data

In [7]:
# Read the researcher IDs back from disk, one ID per line.
inv_ids = Path('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/all_inv.txt').read_text().splitlines()

In [8]:
# inv_ids

# Loading the driver and browser

In [9]:
# Chrome configuration shared by both browser sessions.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# `driver` loads the researcher pages; `driver2` loads the individual resource
# pages, so the researcher page stays open while its resources are visited.
driver = webdriver.Chrome(options=options)
driver2 = webdriver.Chrome(options=options)

# Data scraping

In [10]:
# One accumulator list per output file; the scraping loop appends one record
# per researcher to each of them.
authors_data, publications_data, projects_data, others_data = [], [], [], []

In [11]:
# Path.cwd()

In [12]:
# Scrape every locally stored researcher page and fill the four accumulator
# lists (authors_data, publications_data, projects_data, others_data).
# Every lookup is best-effort: a missing element yields an empty value instead
# of stopping the run.

# Directory that receives the JSON backups.
_BACKUP_DIR = Path('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es')


def _page_uri(page_name):
    """Return the file:// URI of <cwd>/researchportal.uc3m.es/display/<page_name>.html."""
    # as_uri() produces a well-formed, percent-encoded URI for the absolute path.
    return Path.cwd().joinpath(f'researchportal.uc3m.es/display/{page_name}.html').as_uri()


def _safe(getter, default=''):
    """Call the zero-argument `getter`; return `default` if it raises.

    Catches Exception (not a bare except) so that KeyboardInterrupt can still
    stop a long scraping run.
    """
    try:
        return getter()
    except Exception:
        return default


def _fetch_abstract(res_id):
    """Return the abstract text found on the local page of resource `res_id`, or ''.

    The page is opened in `driver2`. The first 'property' element containing
    an element with id 'abstract-noRangeClass-List' provides the text; the
    search stops there so later elements cannot overwrite a found abstract.
    """
    try:
        driver2.get(_page_uri(f'act{res_id}'))
        for prop in driver2.find_elements(By.CLASS_NAME, 'property'):
            matches = prop.find_elements(By.ID, 'abstract-noRangeClass-List')
            if matches:
                return matches[0].text
    except Exception:
        pass
    return ''


def _normalise_doi(raw):
    """Return `raw` as a https://doi.org/ URL, or '' when no DOI is available."""
    if not raw:
        return ''
    if raw.startswith('https://doi.org/'):
        return raw
    return f'https://doi.org/{raw}'


def _save_backups():
    """Dump the four accumulator lists to their JSON files."""
    for filename, payload in (
        ('researchers.json', authors_data),
        ('publications.json', publications_data),
        ('projects.json', projects_data),
        ('others.json', others_data),
    ):
        with open(_BACKUP_DIR / filename, 'w') as f:
            json.dump(payload, f)


# Access each url given the ID
for n, inv in enumerate(inv_ids):
    print(f'Researcher {n+1} out of {len(inv_ids)}')
    driver.get(_page_uri(f'inv{inv}'))

    # SCRAPING RESEARCHERS' INFORMATION
    # The name is stored as text right away, so the fallback '' is a valid value too.
    name = _safe(lambda: driver.find_element(By.XPATH, '//span[@itemprop="name"]').text)
    cat = _safe(lambda: driver.find_element(By.CLASS_NAME, 'categoriainv').text.split(': ')[1])
    orcid = _safe(lambda: driver.find_element(By.CLASS_NAME, 'individual-orcid')
                  .find_element(By.TAG_NAME, 'a').get_attribute('href'))
    scopus = _safe(lambda: driver.find_element(By.ID, 'scopusId-noRangeClass-List')
                   .find_element(By.TAG_NAME, 'a').text)
    email = _safe(lambda: driver.find_element(By.CLASS_NAME, 'individual-emails')
                  .find_element(By.TAG_NAME, 'a').text)
    subject = _safe(lambda: driver.find_element(By.ID, 'individual-hasResearchArea').text)
    pos = _safe(lambda: driver.find_elements(By.CLASS_NAME, 'currentPosition'), [])

    # Position: Academic department, Research group, Institute, ...
    # Each entry reads '<type> : <name>'; the name may be absent.
    positions = {}
    for p in pos:
        group = p.text.split(' : ')
        positions[group[0]] = group[1] if len(group) > 1 else ''

    # One list element per research subject (one subject per line on the page).
    subjects = subject.splitlines()

    authors_data.append({'invID': inv, 'name': name, 'orcid': orcid, 'scopus': scopus,
                         'category': cat, 'email': email, 'positions': positions,
                         'subjects': subjects})

    # SCRAPING PUBLICATIONS' INFORMATION
    publications = {}
    try:
        # Element that holds the 'sample of publications' sections.
        pub_group = driver.find_element(By.ID, 'publicationsGroup')
        for section in pub_group.find_elements(By.CLASS_NAME, 'property'):
            # section title: sample of publications
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = {}

            # pub_type: articles, book chapters, books, conference contributions, ...
            for pub in section.find_elements(By.CLASS_NAME, 'subclass'):
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text
                articles = []

                for p in pub.find_elements(By.TAG_NAME, 'li'):
                    element = p.find_element(By.TAG_NAME, 'a')
                    # resID: resource ID = last URL segment without its 3-character prefix
                    resID = element.get_attribute('href').split('/')[-1][3:]
                    title = element.text
                    year = p.find_element(By.TAG_NAME, 'span').text
                    # DOIs are converted to a single URL format; missing DOI -> ''
                    doi = _normalise_doi(_safe(
                        lambda: p.find_element(By.CLASS_NAME, 'altmetric-embed').get_attribute('data-doi')))
                    abstract = _fetch_abstract(resID)

                    articles.append({'resID': resID, 'doi': doi, 'title': title,
                                     'year': year, 'abstract': abstract})
                sections[pub_type] = articles
            publications[section_title] = sections
    except Exception:
        # Best-effort: researchers without a publications group, or with an
        # unexpected layout, keep whatever sections were completed so far.
        pass

    publications_data.append({'author': inv, 'publications': publications})

    # SCRAPING PROJECTS' INFORMATION
    projects = {}
    try:
        proj_group = driver.find_element(By.ID, 'projectsGroup')
        for section in proj_group.find_elements(By.CLASS_NAME, 'property'):
            # section_title: 'principal researcher on', 'researcher on'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = []
            for p in section.find_elements(By.TAG_NAME, 'li'):
                element = p.find_element(By.TAG_NAME, 'a')
                # resource ID
                resID = element.get_attribute('href').split('/')[-1][3:]
                title = element.text
                year = p.find_element(By.TAG_NAME, 'span').text
                funding_entity = _safe(
                    lambda: p.find_element(By.XPATH, './/a[@title="awarded by"]').text)
                abstract = _fetch_abstract(resID)

                sections.append({'resID': resID, 'title': title, 'year': year,
                                 'funding_entity': funding_entity, 'abstract': abstract})
            projects[section_title] = sections
    except Exception:
        # Best-effort, same policy as for publications.
        pass
    projects_data.append({'author': inv, 'projects': projects})

    # SCRAPING OTHERS' INFORMATION
    others = {}
    try:
        other_group = driver.find_element(By.ID, 'otherGroup')
        for section in other_group.find_elements(By.CLASS_NAME, 'property'):
            # section_title: 'other activities'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = {}

            for pub in section.find_elements(By.CLASS_NAME, 'subclass'):
                # pub_type: 'mobility', 'supervised thesis', ...
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text
                articles = []

                for p in pub.find_elements(By.TAG_NAME, 'li'):
                    element = p.find_element(By.TAG_NAME, 'a')
                    resID = element.get_attribute('href').split('/')[-1][3:]
                    title = element.text

                    articles.append({'resID': resID, 'title': title})
                sections[pub_type] = articles
            others[section_title] = sections
    except Exception:
        # Best-effort, same policy as for publications.
        pass
    others_data.append({'author': inv, 'others': others})

    # Periodic backup: every 100 researchers (including the first one).
    if n % 100 == 0:
        _save_backups()

# Final save once every researcher has been processed.
_save_backups()
    

    

Researcher 1 out of 11
Researcher 2 out of 11
Researcher 3 out of 11
Researcher 4 out of 11
Researcher 5 out of 11
Researcher 6 out of 11
Researcher 7 out of 11
Researcher 8 out of 11
Researcher 9 out of 11
Researcher 10 out of 11
Researcher 11 out of 11


In [13]:
# Sanity check: prints the whole publications_data list as plain text.
print(publications_data)

[{'author': '49042', 'publications': {}}, {'author': '49043', 'publications': {'sample of publications': {'articles ': [{'resID': '555859', 'doi': 'https://doi.org/10.1016/j.jmva.2021.104871', 'title': 'Variable selection in functional regression models: A review', 'year': '2022', 'abstract': ''}, {'resID': '555861', 'doi': 'https://doi.org/10.1111/anzs.12355', 'title': 'Fast and efficient algorithms for sparse semiparametric bifunctional regression', 'year': '2021', 'abstract': ''}, {'resID': '555858', 'doi': 'https://doi.org/10.1007/s11749-020-00728-w', 'title': 'Sparse semiparametric regression when predictors are mixture of functional and high&-dimensional variables', 'year': '2021', 'abstract': ''}, {'resID': '555860', 'doi': 'https://doi.org/10.1016/j.spl.2020.109028', 'title': 'A kNN procedure in semiparametric functional data analysis', 'year': '2021', 'abstract': ''}, {'resID': '555857', 'doi': 'https://doi.org/10.1080/10485252.2019.1567726', 'title': 'Automatic and location-a

# Process JSON files

In [14]:
def process_json(obj):
    '''
    Flatten nested scraping output into flat resource records.

    The structure is walked recursively. Every dict that has a 'title' key is
    treated as a resource: a copy of it, extended with a 'type' entry, is
    collected, together with an (author, resID) pair.

    Parameters
    ----------
    obj : dict or list
        Nested combination of dicts and lists, e.g. the content of
        publications.json, projects.json or others.json.

    Returns
    -------
    resources : list of dict
        One dict per resource. 'type' is the stripped key under which the
        resource was nested; the remaining entries are the resource's own.
    auth_res : list of tuple
        (author, resID) pairs, in the same order as `resources`.

    Raises
    ------
    KeyError
        If a dict has a 'title' key but no 'resID' key.
    '''
    resources = []
    auth_res = []

    def process(node, objType='', author=''):
        if isinstance(node, dict):
            # Resolve the author before walking the items, so attribution does
            # not depend on 'author' preceding the nested payload in key order.
            author = node.get('author', author)
            for key, value in node.items():
                if key == 'title':
                    record = {'type': objType.strip()}
                    record.update(node)
                    resources.append(record)
                    auth_res.append((author, node['resID']))
                elif isinstance(value, (dict, list)):
                    # The key becomes the 'type' of the resources nested below it.
                    process(value, key, author)
        elif isinstance(node, list):
            for element in node:
                process(element, objType, author)

    process(obj)
    return resources, auth_res

In [15]:
# Researchers table: nested fields (e.g. the positions dict) are flattened
# into one column each by json_normalize.
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/researchers.json', 'r') as f:
    df_researchers = pd.json_normalize(json.load(f))

# Keep 'invID' as the first column name; every other column keeps only the
# part after the last dot, lower-cased and with spaces turned into underscores.
normalized_names = [
    cname.split('.')[-1].replace(' ', '_').lower()
    for cname in df_researchers.columns[1:]
]
df_researchers.columns = ['invID', *normalized_names]

# Resources and author-resource pairs, gathered from publications, projects
# and other activities, in that order.
resources, auth_res = [], []
for json_path in (
    '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/publications.json',
    '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/projects.json',
    '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/others.json',
):
    with open(json_path, 'r') as f:
        data = json.load(f)
    res, a_r = process_json(data)
    resources += res
    auth_res += a_r

# One row per distinct resource; the first column is moved to the end.
df_resources = pd.DataFrame(resources).drop_duplicates(subset='resID')
leading_column = df_resources.columns[0]
df_resources = df_resources[[*df_resources.columns[1:], leading_column]]

# Link table between researchers and resources.
df_auth_res = pd.DataFrame(auth_res, columns=['invID', 'resID'])


In [16]:
# Only the last expression of a cell is rendered, so just df_auth_res is shown;
# the two lines above it have no visible effect.
df_researchers
df_resources
df_auth_res

Unnamed: 0,invID,resID
0,49043,555859
1,49043,555861
2,49043,555858
3,49043,555860
4,49043,555857
5,49050,557822
6,49050,557823
7,49050,556484
8,49050,557829
9,49050,557826


# Save as a CSV file

In [17]:

# Write each table to its CSV file, without the index column.
for frame, csv_path in (
    (df_researchers, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.csv'),
    (df_resources, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.csv'),
    (df_auth_res, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.csv'),
):
    frame.to_csv(csv_path, index=False)


#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.csv')
#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.csv')
#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.csv')

#df


# Save as a parquet file

In [18]:
# Write each table to its parquet file, without the index.
for frame, parquet_path in (
    (df_researchers, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.parquet'),
    (df_resources, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.parquet'),
    (df_auth_res, '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.parquet'),
):
    frame.to_parquet(parquet_path, index=False)


#df = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.parquet')
#df = pd.read_parquet('resources.parquet')
#df = pd.read_parquet('auth_res.parquet')

#df

# Completing the abstracts

In [19]:
# Load the two Scopus tables from disk.
# NOTE(review): query_results is not used by any cell visible here - confirm it is needed.
query_results = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/query_results.parquet')
# `data` is rebound here: it previously held the parsed JSON, from now on the Scopus DataFrame.
data = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')

In [20]:
# List the columns available in the Scopus table.
print(data.columns)

# Keep only the DOI and abstract columns.
filtered_data = data[['doi', 'abstract']]
# print(filtered_data.head())

Index(['eid', 'doi', 'pii', 'pubmed_id', 'title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'publicationName', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'abstract'],
      dtype='object')


In [21]:
# Read the exported resources table back from disk and display it; the result is not assigned.
pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.parquet')

Unnamed: 0,resID,doi,title,year,abstract,funding_entity,type
0,555859,https://doi.org/10.1016/j.jmva.2021.104871,Variable selection in functional regression mo...,2022,,,articles
1,555861,https://doi.org/10.1111/anzs.12355,Fast and efficient algorithms for sparse semip...,2021,,,articles
2,555858,https://doi.org/10.1007/s11749-020-00728-w,Sparse semiparametric regression when predicto...,2021,,,articles
3,555860,https://doi.org/10.1016/j.spl.2020.109028,A kNN procedure in semiparametric functional d...,2021,,,articles
4,555857,https://doi.org/10.1080/10485252.2019.1567726,Automatic and location-adaptive estimation in ...,2019,,,articles
5,557822,https://doi.org/10.15366/riejs2022.11.1,"Alfabetización ecosocial: fundamentos, experie...",2022,,,articles
6,557823,,Entrevista a Jaime Vindel en torno a su último...,2022,,,articles
7,556484,https://doi.org/10.3390/su132111867,Assessing Energy Descent Scenarios for the Eco...,2021,,,articles
8,557829,,¿Verde y digital?,2021,,,articles
9,557826,https://doi.org/10.12795/Argumentos/2021.i24.02,La filosofía de la tecnología de Cornelius Cas...,2021,,,articles
