# Import libraries and dependencies

In [1]:
from selenium import webdriver
import time
import pandas as pd
import json  
from pathlib import Path
import requests
import numpy as np
import time
# import langid

In [2]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.service import Service

# Download all researchers from Research Portal

This step was executed in the terminal to download all the resources locally, which avoids requesting the same pages repeatedly and being denied access by the Research Portal.

*Further information about this procedure can be found in the README.txt.*

In [3]:
# Portal endpoints. The numeric ID is appended directly to these prefixes,
# which is why neither ends with a separator.
baseURL, baseURL_activity = (
    'https://researchportal.uc3m.es/display/inv',
    'https://researchportal.uc3m.es/display/act',
)

# Root of the local mirror; downloaded pages, ID listings and outputs live below it.
relative_path = '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/'

# Marker text of the portal's "unknown individual" page.
not_found_txt = 'Individual not found'

# IDs that resolved to a page, filled by the download cells (researchers / activities).
valid_ids, valid_acts_ids = [], []

In [None]:
# Mirror every researcher page locally so later cells parse files instead of
# hitting the portal again. Candidate IDs are probed one by one.
REQUEST_TIMEOUT_S = 30  # fail fast on a hung connection instead of blocking forever

for idx in range(10000, 50000):
    idx_str = str(idx)
    try:
        portal_page = requests.get(baseURL + idx_str, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as err:
        # A transient network error must not abort a crawl of 40,000 requests.
        print(f'\nRequest failed for {idx_str}: {err}')
        time.sleep(1)
        continue
    # NOTE(review): only the HTTP status is checked; `not_found_txt` is defined in the
    # configuration cell but never consulted -- confirm the portal answers with an
    # error status for unknown IDs.
    if portal_page.ok:
        valid_ids.append(idx_str)
        # Explicit UTF-8 so non-ASCII text does not depend on the platform default encoding.
        with open(relative_path + 'display/inv' + idx_str + '.html', 'w', encoding='utf-8') as fout:
            fout.write(portal_page.text)
    # '\r' keeps the progress report on a single, continuously rewritten line.
    print('Researchers so far: ' + str(len(valid_ids)) + ' (' + idx_str + ')\r', end='')
    time.sleep(1)  # throttle: at most one request per second

In [None]:
# Persist the researcher IDs that resolved to a page, one ID per line.
inv_listing_path = relative_path + 'all_inv.txt'
inv_listing = '\n'.join(valid_ids)
with open(inv_listing_path, mode='w') as listing_file:
    listing_file.write(inv_listing)

In [4]:
# Mirror activity pages locally, probing candidate IDs one by one.
REQUEST_TIMEOUT_S = 30  # fail fast on a hung connection instead of blocking forever

for idx in range(371443, 371445):
    idx_str = str(idx)
    try:
        portal_page = requests.get(baseURL_activity + idx_str, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as err:
        # A transient network error must not abort the whole download run.
        print(f'\nRequest failed for {idx_str}: {err}')
        time.sleep(1)
        continue
    # NOTE(review): only the HTTP status is checked; `not_found_txt` is defined in the
    # configuration cell but never consulted -- confirm the portal answers with an
    # error status for unknown IDs.
    if portal_page.ok:
        valid_acts_ids.append(idx_str)
        # Explicit UTF-8 so non-ASCII text does not depend on the platform default encoding.
        with open(relative_path + 'display/act' + idx_str + '.html', 'w', encoding='utf-8') as fout:
            fout.write(portal_page.text)
    # '\r' keeps the progress report on a single, continuously rewritten line.
    print('Activities so far: ' + str(len(valid_acts_ids)) + ' (' + idx_str + ')\r', end='')
    time.sleep(1)  # throttle: at most one request per second

Activities so far: 2 (371444)

In [5]:
# Persist the activity IDs that resolved to a page, one ID per line.
act_listing_path = relative_path + 'all_act.txt'
act_listing = '\n'.join(valid_acts_ids)
with open(act_listing_path, mode='w') as listing_file:
    listing_file.write(act_listing)

# Loading the data

In [6]:
# Reload the ID listings written by the download step (one ID per line).
inv_ids = Path(relative_path + 'all_inv.txt').read_text().splitlines()
inv_act = Path(relative_path + 'all_act.txt').read_text().splitlines()

In [7]:
# Display the activity IDs read from all_act.txt as a quick sanity check.
inv_act

['371443', '371444']

# Loading the driver and browser

In [8]:
# Headless Chrome configuration shared by both browser sessions.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# Two independent browser sessions created from the same options object.
driver = webdriver.Chrome(options=options)
driver2 = webdriver.Chrome(options=options)

# Crawling

In [9]:
# Record accumulators: one list per kind of entity.
researchers, publications, projects = [], [], []

# Link rows: researcherID - publicationID
inv_pub = list()

In [None]:
# Access each url given the ID
for n, inv in enumerate(inv_ids):
    print(f'Researcher {n+1} out of {len(inv_ids)}')
    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/inv{inv}.html')
    auth_url = f'file:///{inv_file}'
    driver.get(auth_url)
    
# SCRAPING RESEARCHERS' INFORMATION
    try:
        name = driver.find_element(By.XPATH,'//span[@itemprop="name"]')
    except:
        name = ''

    try:
        cat = driver.find_element(By.CLASS_NAME, 'categoriainv').text.split(': ')[1]
    except:
        cat = ''
        
    try:
        orcid = driver.find_element(By.CLASS_NAME, 'individual-orcid').find_element(By.TAG_NAME, 'a').get_attribute('href')
    except:
        orcid = ''

    try:
        scopus = driver.find_element(By.ID, 'scopusId-noRangeClass-List').find_element(By.TAG_NAME, 'a').text
    except:
        scopus = ''    
    
    try:
        pos = driver.find_elements(By.CLASS_NAME, 'currentPosition')
    except:
        pos = []
    
    # Position: Academic department, Research group, Institute, ...
    positions = []
    for p in pos:
            group = p.text.split(' : ')
            pos_type = group[0]
            if len(group) > 1:
                pos_name = group[1]
            else:
                pos_name = ''
            # link = p.find_element_by_tag_name('a').get_attribute('href')

            positions.append((pos_type, pos_name))

    try:
        subject = driver.find_element(By.ID, 'individual-hasResearchArea').text
    except:
        subject = ''
    
    # Create an array for which each subject is an element
    subjects = subject.splitlines()

            
    try:
        email = driver.find_element(By.CLASS_NAME, 'individual-emails').find_element(By.TAG_NAME,'a').text
    except:
        email = ''
       
    
    researchers.append({'invID':inv, 'name' : name.text, 'orcid' : orcid, 'scopus': scopus, 'category':cat, 'email':email, 'positions':dict(positions), 'subjects': subjects})
    
    # Conditional check and savings (Periodic backups)
    if not n%100:
        with open(relative_path + 'outputs/researchers.json', 'w') as f:
            json.dump(researchers, f, indent=4)

In [10]:
# Access each url given the ID
for n, inv in enumerate(inv_act):
    
    print(f'\nActivity {n+1} out of {len(inv_act)}')
    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{inv}.html')
    auth_url = f'file:///{inv_file}'
    driver.get(auth_url)
    
    # section: articles, book chapters, conference contributions, working papers, projects, ...
    section = driver.find_element(By.CLASS_NAME, 'display-title').text
    valid_sections = ["Articles", "Book Chapters", "Conference Contributions", "Working Papers"]
            
    try:
        if section in valid_sections:
            resID = inv
            print("Resource ID: ", resID)
            
            # Title of the activity
            title = driver.find_element(By.CLASS_NAME, 'fn').text
            for section_name in valid_sections:
                if title.endswith(section_name):
                    title = title.rsplit(section_name, 1)[0].strip()
                    print("Title: ", title)
                    break  
                    
            # Publication date
            try:         
                publication_date = driver.find_element(By.XPATH, '//h3[@id="dateTimeValue"]/following-sibling::ul/li').text.strip()
                print("Publication Date: ", publication_date)
            except:
                publication_date = ""
                print("Publication Date: ", publication_date)

                
            # Publisher/magazine
            try: 
                if section == "Articles":
                    publication_venue = driver.find_element(By.XPATH, '//h3[@id="hasPublicationVenue"]/following-sibling::ul/li/a').text.strip()
                    print("Publication Venue: ", publication_venue)
                    
                elif section in ["Book Chapters", "Conference Contributions", "Working Papers"]:
                    publication_venue = driver.find_element(By.XPATH, '//h3[@id="publisher"]/following-sibling::ul/li/a').text.strip()
                    print("Publisher:", publication_venue)
                    
            except:
                publication_venue = ""
                print("Publisher:", publication_venue)

                    
            # DOI number
            try: 
                doi = driver.find_element(By.XPATH, '//h3[@id="doi"]/following-sibling::ul/li/a').text.strip()
                print("DOI:", doi)
            except: 
                doi = ""
                print("DOI:", doi)

                
            # Abstract
            try:
                abstract = driver.find_element(By.XPATH, '//h3[@id="abstract"]/following-sibling::ul/li').text.strip()
                print("Abstract: ", abstract)
            except:
                abstract = ""
                print("Abstract: ", abstract)


                
            # Keywords
            try:
                keywords_list = []
                keywords = driver.find_element(By.XPATH, '//h3[@id="freetextKeyword"]/following-sibling::ul/li').text.strip()
                # Split the string using commas
                keywords_split_by_comma = keywords.split(',')

                # Split each resulting keyword using semicolons
                keywords_list = [keyword.strip() for keyword_with_semicolon in keywords_split_by_comma for keyword in keyword_with_semicolon.split(';')]
                print("Keywords List: ", keywords_list)
            except:
                keywords_list = []
                print("Keywords List: ", keywords_list)

                
            # Research Areas
            try: 
                research_areas = driver.find_elements(By.XPATH, '//h3[@id="hasResearchArea"]/following-sibling::ul/li')
                research_areas = [element.text.strip() for element in research_areas]
                print("Research Areas: ", research_areas)
            except:
                research_areas = []
                print("Research Areas: ", research_areas)

            
            # Authors IDs (if there is any ID)
            try:
                # Locate the parent <article> element
                article_element = driver.find_element(By.XPATH, '//article[@class="property" and @role="article"]')
                # Locate the <ul> element within the article for authors
                authors_list = article_element.find_element(By.XPATH, '//ul[@role="list" and @id="relatedBy-Authorship-List"]')

                # Get all <li> elements within the authors list
                author_items = authors_list.find_elements(By.XPATH, 'li')

                # Extract the author IDs
                author_ids = []

                for author_order, author_item in enumerate(author_items, start=1):
                    author_name = author_item.text.strip()
                    print("Author: ", author_name, "with order:", author_order)
                    href_attribute = author_item.find_elements(By.XPATH, 'a')
                    
                    for invID in href_attribute:
                        invID = invID.get_attribute('href').split("inv")[1]
                        author_ids.append(invID)

                        inv_pub.append({'invID': invID, 'pubID': resID, 'orderID': author_order})
        
                print("Valid Researchers IDs: ", author_ids)

            except:
                author_ids = []
    
            publications.append({'resID': resID, 'section': section, 'title': title, 'doi': doi, 'publication_date': publication_date, 'publisher': publication_venue, 'abstract': abstract, 'keywords': keywords_list, 'research_areas': research_areas})

    except:
        pass
    
    if not n%100:
        with open(relative_path + 'outputs/publications.json', 'w') as f:
            json.dump(publications, f, indent=4)
        with open(relative_path + 'outputs/inv_pub.json', 'w') as f:
            json.dump(inv_pub, f, indent=4)
            
with open(relative_path + 'outputs/publications.json', 'w') as f:
    json.dump(publications, f, indent=4)
with open(relative_path + 'outputs/inv_pub.json', 'w') as f:
    json.dump(inv_pub, f, indent=4)


Activity 1 out of 2
Resource ID:  371443
Title:  Extending the concurrency model of the real-time specification for Java
Publication Date:  September 2011
Publication Venue:  CONCURRENCY AND COMPUTATION-PRACTICE & EXPERIENCE
DOI: https://doi.org/10.1002/cpe.1675
Abstract:  The current RTSJ (Real-Time Specification for Java) threading model is dualized: a programmer has to decide between the high predictability offered by the region-based model and the flexibility offered by the garbage collection. So far, there is no unique type of thread which offers both the high predictability of a non-heap thread and the flexibility of a real-time thread in a single entity. Furthermore, this lack has a serious impact on the programmer who has to deal with new and sometimes non-trivial to use mechanisms, such as specific queues of objects or new types of threads, in order to avoid the priority inversion caused by the garbage collector. In order to tackle the concern properly and provide an improved

In [11]:
# Display the researcher-publication link rows collected by the activity crawl.
inv_pub

[{'invID': '21578', 'pubID': '371443', 'orderID': 3},
 {'invID': '35641', 'pubID': '371444', 'orderID': 2},
 {'invID': '18070', 'pubID': '371444', 'orderID': 4}]

In [None]:
"""
    # SCRAPING PROJECTS' INFORMATION
    projects = []
    try:
        pub_list = driver.find_element(By.ID,'projectsGroup').find_elements(By.CLASS_NAME, 'property')
        for section in pub_list:
            # section_title: 'principal researcher on', 'researcher on'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = []
            for p in section.find_elements(By.TAG_NAME,'li'):
                element = p.find_element(By.TAG_NAME, 'a')
                # resource ID
                resID = element.get_attribute('href').split('/')[-1][3:]
                
                try:
                    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{resID}.html')
                    auth_url = f'file:///{inv_file}'
                    driver2.get(auth_url)

                    
                    property_list = driver2.find_elements(By.CLASS_NAME, 'property')

                    # Iterar a través de los elementos 'property' para encontrar el abstract
                    for article in property_list:
                        abstract = ""

                        # Verificar si el ID del elemento contiene 'abstract'
                        abstract_elements = article.find_elements(By.ID, "abstract-noRangeClass-List")
                        if abstract_elements:
                            # Extraer el texto del elemento
                            abstract = abstract_elements[0].text
                except:
                    abstract = ''
                    
                title = element.text
                year = p.find_element(By.TAG_NAME, 'span').text
                
                                
                try:
                    funding_entity = p.find_element(By.XPATH, './/a[@title="awarded by"]').text
                
                except:
                    funding_entity = ""
                    
                sections.append({'resID':resID, 'title':title, 'year':year, 'funding_entity': funding_entity, 'abstract': abstract})
            projects.append((section_title, sections))
    except:
        pass
    projects.append({'author':inv, 'projects':dict(projects)})
  
    
    # Conditional check and savings (Periodic backups)
    if not n%100:
        with open(relative_path + 'outputs/researchers.json', 'w') as f:
            json.dump(researchers, f, indent=4)
        with open(relative_path + 'outputs/publications.json', 'w') as f:
            json.dump(publications_data, f, indent=4)
        with open(relative_path + 'outputs/projects.json', 'w') as f:
            json.dump(projects, f, indent=4)

# Always saving (Periodic backups)

with open(relative_path + 'outputs/researchers.json', 'w') as f:
    json.dump(researchers, f, indent=4)
with open(relative_path + 'outputs/publications.json', 'w') as f:
    json.dump(publications, f, indent=4)
with open(relative_path + 'outputs/projects.json', 'w') as f:
    json.dump(projects, f, indent=4) 

"""

# Process JSON files

In [None]:
def process_json(obj):
    '''
    Walk a nested JSON structure and collect every resource found in it.

    A resource is any dict that has a 'title' key. Each resource is copied into
    a flat record whose 'type' is the (stripped) key under which the resource was
    nested, and is paired with the closest enclosing 'author' value.

    Parameters:
        obj: nested combination of dicts and lists, as returned by json.load.

    Returns:
        (resources, auth_res): the list of flat resource dicts and the list of
        (author, resID) tuples, both in traversal order.

    Raises:
        KeyError: if a dict has a 'title' but no 'resID'.
    '''
    resources = []
    auth_res = []

    def process(node, objType='', author=''):
        if isinstance(node, dict):
            # Resolve the author before visiting the children, so attribution does
            # not depend on 'author' appearing before the nested containers.
            author = node.get('author', author)
            for k, v in node.items():
                if k == 'title':
                    # 'type' goes first; keys of the resource itself are merged after
                    # it and override 'type' if the resource already defines one.
                    record = {'type': objType.strip()}
                    record.update(node)
                    resources.append(record)
                    auth_res.append((author, node['resID']))
                elif isinstance(v, (dict, list)):
                    # The key naming a nested container becomes the type of its resources.
                    process(v, k, author)
        elif isinstance(node, list):
            for el in node:
                process(el, objType, author)

    process(obj)
    return resources, auth_res

### Create tables to match author ID (invID) to resource ID (resID)

In [None]:
# DF researchers
with open(relative_path + 'outputs/researchers.json', 'r') as f:
    data = json.load(f)
    df_researchers = pd.json_normalize(data)

df_researchers.columns = ['invID']+[cname.split('.')[-1].replace(' ', '_').lower() for cname in df_researchers.columns[1:]]

# DF resources, DF auth-res
resources = []
auth_res = []
auth_res_publications = []
auth_res_projects = []
auth_res_others = []

with open(relative_path + 'outputs/publications.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)
auth_res_publications.extend(a_r)

# DF projects
with open(relative_path + 'outputs/projects.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)
auth_res_projects.extend(a_r)

# DF Others
with open(relative_path + 'outputs/others.json', 'r') as f:
    data = json.load(f)
res, a_r = process_json(data)
resources.extend(res)
auth_res.extend(a_r)
auth_res_others.extend(a_r)

df_resources = pd.DataFrame(resources)
df_resources = df_resources.drop_duplicates(subset='resID')
df_resources = df_resources[list(df_resources.columns[1:]) + [df_resources.columns[0]]]

In [None]:
# Every link table has the same two columns: researcher ID and resource ID.
link_columns = ['invID', 'resID']

df_auth_res = pd.DataFrame(data=auth_res, columns=link_columns)
df_auth_res_publications = pd.DataFrame(data=auth_res_publications, columns=link_columns)
df_auth_res_projects = pd.DataFrame(data=auth_res_projects, columns=link_columns)
df_auth_res_others = pd.DataFrame(data=auth_res_others, columns=link_columns)

# Save as a CSV file

In [None]:
# Each table is written to its CSV file under outputs/, without the index column.
csv_exports = [
    (df_researchers, 'outputs/researchers.csv'),
    (df_resources, 'outputs/resources.csv'),
    (df_auth_res_publications, 'outputs/auth_res_publications.csv'),
    (df_auth_res_projects, 'outputs/auth_res_projects.csv'),
    (df_auth_res_others, 'outputs/auth_res_others.csv'),
]
for table, csv_target in csv_exports:
    table.to_csv(relative_path + csv_target, index=False)

# Save as a parquet file

In [None]:
# Write every table to parquet under outputs/, without the index.
df_researchers.to_parquet(relative_path + 'outputs/researchers.parquet', index=False)
df_resources.to_parquet(relative_path + 'outputs/resources.parquet', index=False)
# `relative_path` already ends with '/', so the target must not start with one
# (a leading slash produced '...//outputs/...', unlike every other output path).
df_auth_res.to_parquet(relative_path + 'outputs/auth_res.parquet', index=False)

df_auth_res_publications.to_parquet(relative_path + 'outputs/auth_res_publications.parquet', index=False)
df_auth_res_projects.to_parquet(relative_path + 'outputs/auth_res_projects.parquet', index=False)
df_auth_res_others.to_parquet(relative_path + 'outputs/auth_res_others.parquet', index=False)

# Formatting our dataset and completing abstracts with the Scopus database

In [None]:
"""
# Regular expression to extract DOI from the URL
doi_pattern = r'https://doi\.org/(?:http://dx\.doi\.org/)?(.+)'

# Extract DOI values from URLs in the 'doi' column
df_resources['doi'] = df_resources['doi'].str.extract(doi_pattern)
"""

### Formatting the 'df_resources' table: homogenizing NAs and filtering observations

In [None]:
"""
# Homogenizing the NAs to the same format
# Lista de nombres de las columnas en las que deseas reemplazar los valores nulos o vacíos por NaN
columns_to_process = ['doi', 'title', 'year', 'abstract', 'funding_entity', 'type']

# Iterar sobre las columnas y reemplazar los valores nulos o vacíos por NaN
for column in columns_to_process:
    df_resources[column] = df_resources[column].apply(lambda x: np.nan if x in [None, ''] else x)
    
"""

In [None]:
"""
# Filter the dataset by observations that have at least doi or at least abstract (so, an observation that can be completed if it has no abstract)
filtered_df_resources = df_resources[(df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & df_resources['abstract'].isna())]
filtered_df_resources.reset_index(drop=True, inplace=True)
"""

### Formatting the Scopus database

In [None]:
"""
data = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')

filtered_data = data[['doi', 'description']]
"""

### Joining databases

In [None]:
"""
# Perform a left join on 'doi' column
merged_df = pd.merge(filtered_df_resources, filtered_data, on='doi', how='left')
#merged_df

# Llenar NaN en la columna 'abstract' con el valor del abstract de SCOPUS ('description' column) si 'abstract' está vacía
merged_df['abstract'] = merged_df['abstract'].combine_first(merged_df['description'])

# Eliminamos los duplicados
merged_df.drop_duplicates(subset='resID', keep='first', inplace=True)

merged_df = merged_df.drop(columns=['description'])

# Save the merged table as a CSV file
merged_df.to_csv(relative_path + 'outputs/merged_table.csv', index=False)

"""