# Import libraries and dependencies

In [1]:
from selenium import webdriver
import time
import pandas as pd
import json  
from pathlib import Path
import requests
import numpy as np
import time
# import langid

In [2]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.service import Service

# Download all researchers from Research Portal

This step was executed in the terminal to download all the resources locally, avoiding looping over the same resources repeteadly and getting the access denied to the Research Portal.

*Further information about this procedure can be found in the README.txt.*

In [11]:
baseURL = 'https://researchportal.uc3m.es/display/inv'
not_found_txt = 'Individual not found'
valid_ids = []

In [12]:
for idx in np.arange(49050, 49065): 
    idx_str = str(idx)
    portal_page = requests.get(baseURL+idx_str)
    if portal_page.ok:
        valid_ids.append(idx_str)
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/display/inv' + idx_str + '.html', 'w') as fout:
            fout.write(portal_page.text)
    print('Researchers so far: ' + str(len(valid_ids)) + ' (' + idx_str + ')\r', end='')
    time.sleep(1)

Researchers so far: 4 (49064)

In [13]:
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/all_inv.txt', 'w') as fout:
    fout.write('\n'.join(valid_ids))

In [18]:
valid_ids

['49050', '49052', '49053', '49062']

# Loading the data

In [19]:
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/all_inv.txt', 'r') as f:
    inv_ids = f.read().splitlines()

In [20]:
# inv_ids

['49050', '49052', '49053', '49062']

# Loading the driver and browser

In [21]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open it, go to a website, and get results
driver = webdriver.Chrome(options=options)
driver2 = webdriver.Chrome(options=options)

# Data scraping

In [22]:
authors_data = []
publications_data = []
projects_data = []
others_data = []

In [None]:
# Path.cwd()

In [23]:
# Access each url given the ID
for n, inv in enumerate(inv_ids):
    print(f'Researcher {n+1} out of {len(inv_ids)}')
    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/inv{inv}.html')
    auth_url = f'file:///{inv_file}'
    driver.get(auth_url)
    
# SCRAPING RESEARCHERS' INFORMATION
    try:
        name = driver.find_element(By.XPATH,'//span[@itemprop="name"]')
    except:
        name = ''

    try:
        cat = driver.find_element(By.CLASS_NAME, 'categoriainv').text.split(': ')[1]
    except:
        cat = ''
        
    try:
        orcid = driver.find_element(By.CLASS_NAME, 'individual-orcid').find_element(By.TAG_NAME, 'a').get_attribute('href')
    except:
        orcid = ''

    try:
        scopus = driver.find_element(By.ID, 'scopusId-noRangeClass-List').find_element(By.TAG_NAME, 'a').text
    except:
        scopus = ''    
    
    try:
        pos = driver.find_elements(By.CLASS_NAME, 'currentPosition')
    except:
        pos = []
    
    # Position: Academic department, Research group, Institute, ...
    positions = []
    for p in pos:
            group = p.text.split(' : ')
            pos_type = group[0]
            if len(group) > 1:
                pos_name = group[1]
            else:
                pos_name = ''
            # link = p.find_element_by_tag_name('a').get_attribute('href')

            positions.append((pos_type, pos_name))

    try:
        subject = driver.find_element(By.ID, 'individual-hasResearchArea').text
    except:
        subject = ''
    
    # create an array for which each subject is an element
    subjects = subject.splitlines()

            
    try:
        email = driver.find_element(By.CLASS_NAME, 'individual-emails').find_element(By.TAG_NAME,'a').text
    except:
        email = ''
       
    
    authors_data.append({'invID':inv, 'name' : name.text, 'orcid' : orcid, 'scopus': scopus, 'category':cat, 'email':email, 'positions':dict(positions), 'subjects': subjects})
    
    
# SCRAPING PUBLICATIONS' INFORMATION
    publications = []
    try:
        # element that references the 'sample of publications'
        pub_list = driver.find_element(By.ID, 'publicationsGroup').find_elements(By.CLASS_NAME, 'property')
        
        for section in pub_list:
            # section title: sample of publications
            section_title = section.find_element(By.TAG_NAME,'h3').text
            sections = []
            
            # pub_type: articles, book chapters, books, conference contributions, ...
            for pub in section.find_elements(By.CLASS_NAME, 'subclass'):
                articles = []
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text
            
                for p in pub.find_elements(By.TAG_NAME, 'li'):
                    element = p.find_element(By.TAG_NAME,'a')
                
                    title = element.text
                    year = p.find_element(By.TAG_NAME,'span').text
                    
                    try:
                        doi_element = p.find_element(By.CLASS_NAME, "altmetric-embed")
                        doi = doi_element.get_attribute("data-doi")
                        
                        # convert DOIs to the same format
                        if doi.startswith("https://doi.org/"):
                            doi = doi
                        else:
                            doi = f"https://doi.org/{doi}"
                        
                    except:
                        doi = ""
                    
                    
                    #resID: resource ID
                    resID = element.get_attribute('href').split('/')[-1][3:]

                    try:
                        inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{resID}.html')
                        auth_url = f'file:///{inv_file}'
                        driver2.get(auth_url)
                        
        
                        property_list = driver2.find_elements(By.CLASS_NAME, 'property')
                        # Iterar a través de los elementos 'property' para encontrar el abstract
                        for article_element in property_list:
                            abstract = ''
                            # Verificar si el ID del elemento contiene 'abstract'
                            abstract_elements = article_element.find_elements(By.ID, "abstract-noRangeClass-List")
                            if abstract_elements:
                                # Within the 'ul' element, find the 'li' element to extract the abstract text
                                li_elements = abstract_elements[0].find_elements(By.TAG_NAME, "li")
                                if li_elements:
                                    abstract = li_elements[0].text
                                    print("Este abstract pertenece a: ", title)
                                    print(abstract)
                            else:
                                abstract = ''
                                print("Esta obra no tiene abstract: ", title)
                                print(abstract)
                                
                            print("Así sale el abstract de esta obra impreso: ", title)
                            print(abstract)
                                
                    except: 
                        abstract = ''
                        print("Esta obra pasa por la excepción: ", title)
                        print(abstract)


                    articles.append({'resID': resID, 'doi': doi,'title': title, 'year': year, 'abstract': abstract})
                sections.append((pub_type, articles))
            publications.append((section_title, dict(sections)))

    except:
        pass
    
    publications_data.append({'author':inv, 'publications':dict(publications)})
    
    
    # SCRAPING PROJECTS' INFORMATION
    projects = []
    try:
        pub_list = driver.find_element(By.ID,'projectsGroup').find_elements(By.CLASS_NAME, 'property')
        for section in pub_list:
            # section_title: 'principal researcher on', 'researcher on'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = []
            for p in section.find_elements(By.TAG_NAME,'li'):
                element = p.find_element(By.TAG_NAME, 'a')
                # resource ID
                resID = element.get_attribute('href').split('/')[-1][3:]
                
                try:
                    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{resID}.html')
                    auth_url = f'file:///{inv_file}'
                    driver2.get(auth_url)

                    
                    property_list = driver2.find_elements(By.CLASS_NAME, 'property')

                    # Iterar a través de los elementos 'property' para encontrar el abstract
                    for article in property_list:
                        abstract = ""

                        # Verificar si el ID del elemento contiene 'abstract'
                        abstract_elements = article.find_elements(By.ID, "abstract-noRangeClass-List")
                        if abstract_elements:
                            # Extraer el texto del elemento
                            abstract = abstract_elements[0].text
                except:
                    abstract = ''
                    
                title = element.text
                year = p.find_element(By.TAG_NAME, 'span').text
                
                                
                try:
                    funding_entity = p.find_element(By.XPATH, './/a[@title="awarded by"]').text
                
                except:
                    funding_entity = ""
                    
                sections.append({'resID':resID, 'title':title, 'year':year, 'funding_entity': funding_entity, 'abstract': abstract})
            projects.append((section_title, sections))
    except:
        pass
    projects_data.append({'author':inv, 'projects':dict(projects)})

     
# SCRAPING OTHERS' INFORMATION
    others = []
    try:
        pub_list = driver.find_element(By.ID, 'otherGroup').find_elements(By.CLASS_NAME, 'property')
        for section in pub_list:
            # section_title: 'other activities'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = []
            
            for pub in section.find_elements(By.CLASS_NAME,'subclass'):
                articles = []
                # pub_type: 'mobility', 'supervised thesis', ...
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text
                
                for p in pub.find_elements(By.TAG_NAME, 'li'):
                    element = p.find_element(By.TAG_NAME, 'a')
                    resID = element.get_attribute('href').split('/')[-1][3:]
                    title = element.text
                    
                    articles.append({'resID':resID, 'title':title})
                sections.append((pub_type, articles))
            others.append((section_title, dict(sections)))
            
    except:
        pass
    others_data.append({'author':inv, 'others':dict(others)})
    
# Conditional check and savings (Periodic backups)
    if not n%100:
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.json', 'w') as f:
            json.dump(authors_data, f)
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/publications.json', 'w') as f:
            json.dump(publications_data, f)
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/projects.json', 'w') as f:
            json.dump(projects_data, f)
        with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/others.json', 'w') as f:
            json.dump(others_data, f)

# Always saving (Periodic backups)

with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.json', 'w') as f:
    json.dump(authors_data, f)
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/publications.json', 'w') as f:
    json.dump(publications_data, f)
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/projects.json', 'w') as f:
    json.dump(projects_data, f)
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/others.json', 'w') as f:
    json.dump(others_data, f)      

Researcher 1 out of 4
Esta obra no tiene abstract:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Así sale el abstract de esta obra impreso:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Esta obra no tiene abstract:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Así sale el abstract de esta obra impreso:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Esta obra no tiene abstract:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Así sale el abstract de esta obra impreso:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Esta obra no tiene abstract:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Así sale el abstract de esta obra impreso:  Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación

Esta obra no tiene abstract:  Alfabetización ecosocial: fundamentos, experiencias 

In [10]:
print(publications_data)

[{'author': '49050', 'publications': {'sample of publications': {'articles ': [{'resID': '557822', 'doi': 'https://doi.org/10.15366/riejs2022.11.1', 'title': 'Alfabetización ecosocial: fundamentos, experiencias y retos. Presentación', 'year': '2022', 'abstract': ''}, {'resID': '557823', 'doi': '', 'title': 'Entrevista a Jaime Vindel en torno a su último libro Estética fósil. Imaginarios de la energía y crisis ecosocial', 'year': '2022', 'abstract': ''}, {'resID': '556484', 'doi': 'https://doi.org/10.3390/su132111867', 'title': 'Assessing Energy Descent Scenarios for the Ecological Transition in Spain 2020-2030', 'year': '2021', 'abstract': ''}, {'resID': '557829', 'doi': '', 'title': '¿Verde y digital?', 'year': '2021', 'abstract': ''}, {'resID': '557826', 'doi': 'https://doi.org/10.12795/Argumentos/2021.i24.02', 'title': 'La filosofía de la tecnología de Cornelius Castoriadis', 'year': '2021', 'abstract': ''}, {'resID': '557825', 'doi': 'https://doi.org/http://dx.doi.org/10.5209/rced.

# Process JSON files

In [24]:
def process_json(obj):
    '''
    Function to process json recursively
    '''
    resources = []
    auth_res = []
    author = ''
    def process(obj, objType='', author=''):
        if isinstance(obj, dict):
            for k, v in obj.items():     
                if k == 'author':
                    author = v
                if k == 'title':
                    d = {'type':objType.strip()}
                    d.update(obj)
                    resources.append(d)
                    auth_res.append((author, obj['resID']))
                else:
                    if isinstance(v, (dict, list)):
                        process(v, k, author)
        elif isinstance(obj, list):
            for el in obj:
                process(el, objType, author)
    process(obj)
    return resources, auth_res

In [25]:
# DF researchers
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.json', 'r') as f:
    data = json.load(f)
    df_researchers = pd.json_normalize(data)

df_researchers.columns = ['invID']+[cname.split('.')[-1].replace(' ', '_').lower() for cname in df_researchers.columns[1:]]

# DF resources, DF auth-res
resources = []
auth_res = []
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/publications.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)

# DF projects
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/projects.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)

# DF Others
with open('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/others.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)

df_resources = pd.DataFrame(resources)
df_resources = df_resources.drop_duplicates(subset='resID')
df_resources = df_resources[list(df_resources.columns[1:]) + [df_resources.columns[0]]]
df_auth_res = pd.DataFrame(auth_res, columns=['invID', 'resID'])


In [None]:
#df_researchers
df_resources
#df_auth_res

# Save as a CSV file

In [26]:

df_researchers.to_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.csv', index=False)
df_resources.to_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.csv', index=False)
df_auth_res.to_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.csv', index=False)


#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.csv')
#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.csv')
#df = pd.read_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.csv')

#df


# Save as a parquet file

In [27]:
df_researchers.to_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/researchers.parquet', index=False)
df_resources.to_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.parquet', index=False)
df_auth_res.to_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.parquet', index=False)


#df = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/auth_res.parquet')
df_resources = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/resources.parquet')
#df = pd.read_parquet('auth_res.parquet')

#df_resources['abstract'] = df_resources['abstract'].apply(lambda x: None if x == '' else x)

# Regular expression to extract DOI from the URL
doi_pattern = r'https://doi\.org/(?:http://dx\.doi\.org/)?(.+)'

# Extract DOI values from URLs in the 'doi' column
df_resources['doi'] = df_resources['doi'].str.extract(doi_pattern)

"""
filtered_df = df_resources[(df_resources['doi'].notna()) & (df_resources['abstract'].notna())]
filtered_df.reset_index(drop=True, inplace=True)
filtered_df
"""

"\nfiltered_df = df_resources[(df_resources['doi'].notna()) & (df_resources['abstract'].notna())]\nfiltered_df.reset_index(drop=True, inplace=True)\nfiltered_df\n"

In [28]:
df_resources

Unnamed: 0,resID,doi,title,year,abstract,funding_entity,type
0,557822,10.15366/riejs2022.11.1,"Alfabetización ecosocial: fundamentos, experie...",2022,,,articles
1,557823,,Entrevista a Jaime Vindel en torno a su último...,2022,,,articles
2,556484,10.3390/su132111867,Assessing Energy Descent Scenarios for the Eco...,2021,,,articles
3,557829,,¿Verde y digital?,2021,,,articles
4,557826,10.12795/Argumentos/2021.i24.02,La filosofía de la tecnología de Cornelius Cas...,2021,,,articles
5,557825,10.5209/rced.68068,La elección de centro educativo en un sistema ...,2021,,,articles
6,557828,,Praxis y compromiso,2020,,,articles
7,557827,,La non-neutralité de la technologie. Une ontol...,2020,,,articles
8,557824,10.15366/riejs2020.9.3,Covid-19: ¿Punto sin retorno de la digitalizac...,2020,,,articles
9,557821,,La ontología del mundo sociohistórico de Corne...,2020,,,articles


# Formating our dataset and completing abstracts with 'scopus' database

### Formating output database

In [29]:
# Homogenizing the NAs to the same format
# Lista de nombres de las columnas en las que deseas reemplazar los valores nulos o vacíos por NaN
columns_to_process = ['doi', 'title', 'year', 'abstract', 'funding_entity', 'type']

# Iterar sobre las columnas y reemplazar los valores nulos o vacíos por NaN
for column in columns_to_process:
    df_resources[column] = df_resources[column].apply(lambda x: np.nan if x in [None, ''] else x)

In [None]:
# df_resources

In [30]:
# Filter the dataset by observations that have at least doi or at least abstract (that can be tracked)
filtered_df_resources = df_resources[(df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & df_resources['abstract'].isna())]

In [None]:
filtered_df_resources

### Formating scopus database

In [31]:
data = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')

In [32]:
filtered_data = data[['doi', 'abstract']]
#filtered_data.head()

### Joining databases

In [None]:
# merged_df.loc[merged_df['resID'] == '560391']

In [33]:
# Perform a left join on 'doi' column
merged_df = pd.merge(filtered_df_resources, filtered_data, on='doi', how='left', suffixes=('_df_resources', '_filtered_data'))
#merged_df

# Llenar NaN en la columna 'abstract_df_resources' con el valor de  si ambas columnas están vacías
merged_df['abstract_df_resources'] = merged_df['abstract_df_resources'].combine_first(merged_df['abstract_filtered_data'])

In [34]:
# Eliminamos los duplicados
merged_df.drop_duplicates(subset='resID', keep='first', inplace=True)
merged_df.head(100)

Unnamed: 0,resID,doi,title,year,abstract_df_resources,funding_entity,type,abstract_filtered_data
0,557822,10.15366/riejs2022.11.1,"Alfabetización ecosocial: fundamentos, experie...",2022,,,articles,
1,556484,10.3390/su132111867,Assessing Energy Descent Scenarios for the Eco...,2021,"© 2021 by the authors. Licensee MDPI, Basel, S...",,articles,"© 2021 by the authors. Licensee MDPI, Basel, S..."
2,557826,10.12795/Argumentos/2021.i24.02,La filosofía de la tecnología de Cornelius Cas...,2021,,,articles,
3,557825,10.5209/rced.68068,La elección de centro educativo en un sistema ...,2021,,,articles,
4,557824,10.15366/riejs2020.9.3,Covid-19: ¿Punto sin retorno de la digitalizac...,2020,,,articles,
5,557835,10.15366/bp2017.15.0013,La aproximación de Manuel Sacristán a la cuest...,2017,,,articles,
6,557836,10.33064/20euph1362,Los modos de vida y la paradoja de la ética co...,2017,,,articles,
7,557833,10.18848/2471-8238/CGP,De la catástrofe a la transformación social: l...,2017,,,articles,
8,557839,10.1002/adma.201405632,TiS3 transistors with tailored morphology and ...,2015,,,articles,
9,560391,,Speak for Nature: Interdisciplinary Approaches...,2023 - 2027,The complexity of current ecological challenge...,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,principal researcher on,


In [None]:
# Estas observaciones no salen puesto que hemos eliminado ya las observaciones sin doi ni abstract (no podemos hacer nada con ellas)
# filtered_df_resources.loc[filtered_df_resources['resID'] == '557858']

#Esta sí tiene doi pero no abstract, y sale igual en el filtrado y en el original
# df_resources.loc[df_resources['resID'] == '555859']
# filtered_df_resources.loc[filtered_df_resources['resID'] == '555859']

In [None]:
# Sale bien el merge de la observación que hemos comprobado antes
#merged_df.loc[merged_df['resID'] == '555859']

In [None]:
# Comprobación rápida de que los valores de abstract vacíos
# de la nueva database están vacíos en las bases de datos originales (o no están)
#data.loc[data['doi'] == '10.15366/bp2017.15.0013']
#df_resources.loc[df_resources['doi'] == '10.15366/bp2017.15.0013']

In [None]:
merged_df

In [36]:
# Save the merged table as a CSV file
merged_df.to_csv('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/output/merged_table.csv', index=False)