# 1. Import libraries and dependencies

In [1]:
import time
import pandas as pd
import json  
from pathlib import Path
import numpy as np
import time
import pyarrow.parquet as pq
from bs4 import BeautifulSoup
from tqdm import tqdm

# 2. Define input and output directories

In [2]:
raw_data = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/rawdata/researchportal.uc3m.es/display")
parquet_data = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/parquet")

parquet_data.mkdir(parents=True, exist_ok=True)

# 3. Table `researchers`

In [3]:
all_researchers_files = list(raw_data.glob("inv*"))
all_researchers = []

for file_path in tqdm(all_researchers_files):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")

            element = soup.find(class_="categoriainv")

            # Nombre
            try:
                element = soup.find('span', {'itemprop': 'name'})
                name = element.get_text().strip().title()
            except:
                name = None
            
            # Categoría del Investigador
            try:
                element = soup.find(class_="categoriainv")
                cat = element.get_text().split(': ')[1].strip()
            except:
                cat = None

            # ORCID
            try:
                element = soup.find(class_='individual-orcid')
                # Buscar dentro de este elemento el primer enlace y obtener su 'href'
                enlace = element.find('a')
                if enlace and 'href' in enlace.attrs:
                    orcid = enlace['href'].split('/')[-1].strip()
            except:
                orcid = None

            # Scopus
            try:
                element = soup.find(id='scopusId-noRangeClass-List')
                if element:
                    enlace = element.find('a')
                    if enlace:
                        scopus = enlace.get_text().strip()
            except:
                scopus = None

            # Positions
            try:
                element = soup.find_all(class_='currentPosition')
                for el in element:
                    if "Academic Department" in el.get_text():
                        dep = el.find('span', itemprop='name').get_text().strip()
                    if "Research Group" in el.get_text():
                        res_group = el.find('span', itemprop='name').get_text().strip()
            except:
                dep = None
                res_group = None

            # Research Areas
            try:
                element = soup.find('ul', {'id': 'individual-hasResearchArea'})
                subject = []
                # Si se encuentra el elemento ul, extraer los textos de los elementos li
                list_items = element.find_all('li', role="listitem")
                for item in list_items:
                    subject.append(item.get_text(strip=True))
            except:
                subject = None

            # email
            try:
                element = soup.find(class_='individual-emails')
                email = element.find('a').get_text().strip()
            except:
                email = None

            # Number of publications
            try:
                element = soup.find(id='publicationsGroup').find_all('a')
                element = [el for el in element if el.get('title') == "resource name"]
                n_publis = len(element)
            except:
                n_publis = 0

            # Projects_IP
            try:
                element = soup.find(id='RO_0000053-PrincipalInvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_IP = len(element)
            except:
                n_IP = 0

            # Projects_noIP
            try:
                element = soup.find(id='RO_0000053-InvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_noIP = len(element)
            except:
                n_noIP = 0

        all_researchers.append([file_path.stem, name, email, cat, orcid, dep, res_group, subject, n_publis, n_IP, n_noIP])

columns = ['invID', 'Name', 'Email', 'Category', 'ORCID', 'Department', 'Research Group', 'Subjects', 'no Publis', 'Projects IP', 'Projects no IP']
df = pd.DataFrame(all_researchers, columns=columns)

100%|██████████| 1393/1393 [00:38<00:00, 35.88it/s]


In [4]:
print("Number of Researchers", len(df))
df.head()

Number of Researchers 1393


Unnamed: 0,invID,Name,Email,Category,ORCID,Department,Research Group,Subjects,no Publis,Projects IP,Projects no IP
0,inv17981,"Gonzalez-Cuellar Serrano, Maria Luisa",mlgonzal@der-pu.uc3m.es,Full Professor,0000-0002-0999-7711,Public State Law,Research Group on Financial and Tax Law,[Law],41,7,5
1,inv35577,"Quintana Montero, David",dquintan@inf.uc3m.es,Associate Professor,0000-0003-0320-1695,Computer Science and Engineering,Evolutionary Computation and Neural Networks (...,"[Computer Science, Economics, Robotics and Ind...",37,7,8
2,inv48535,"Varela Garcia, Nicolas",nvarela@pa.uc3m.es,PhD Candidate,0000-0002-9135-5338,Social Sciences,Histories of Global Latin Capitalisms (H-GLACIAL),,0,0,0
3,inv40978,"Gutierrez Fernandez, Eric",egutie1@ing.uc3m.es,Assistant Professor,0000-0002-2901-441X,Electronic Technology,Histories of Global Latin Capitalisms (H-GLACIAL),"[Education, Electronics, Telecommunications]",27,4,7
4,inv35940,"Gonzalez Rodriguez, Pedro",pgonzale@ing.uc3m.es,Associate Professor,0000-0002-1378-273X,Mathematics,Grupo de Métodos Numéricos y Aplicaciones,"[Biology and Biomedicine, Fission, Mathematics...",26,1,6


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1393 entries, 0 to 1392
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   invID           1393 non-null   object
 1   Name            1393 non-null   object
 2   Email           1278 non-null   object
 3   Category        1380 non-null   object
 4   ORCID           1150 non-null   object
 5   Department      1393 non-null   object
 6   Research Group  1393 non-null   object
 7   Subjects        1113 non-null   object
 8   no Publis       1393 non-null   int64 
 9   Projects IP     1393 non-null   int64 
 10  Projects no IP  1393 non-null   int64 
dtypes: int64(3), object(8)
memory usage: 119.8+ KB


In [6]:
df.to_parquet(parquet_data.joinpath('researchers.parquet'))

# 4. Tables `publications` and `researchers_publications`

In [7]:
all_activities = list(raw_data.glob("act*"))

all_publis = []
res_publis = []

nact = 0

for file_path in tqdm(all_activities):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")
            activity_type = soup.find(class_="display-title").get_text().strip()
            if activity_type in ['Articles', 'Book Chapters', 'Conference Contributions']:
                
                # Título
                try: 
                    title = soup.title.get_text()
                except:
                    title = None
                
                # DOI
                try:
                    doi = soup.find('a', title="Digital Object Identifier (DOI)").get_text()
                except:
                    doi = None

                # Year
                try:
                    year = int(soup.find(id='dateTimeValue-DateTimeValue-List').get_text().strip().split()[-1])
                except:
                    year = np.nan

                # Abstract
                try:
                    abstract = soup.find(id='abstract-noRangeClass-List').get_text().strip()
                except:
                    abstract = None

                # Abstract
                try:
                    keywords = soup.find(id='freetextKeyword-noRangeClass-List').get_text().strip()
                except:
                    keywords = None

                # Autores
                try:
                    authors = soup.find('ul', id='relatedBy-Authorship-List').find_all('li', role="listitem")
                    for idx,el in enumerate(authors):
                        if el.find('a'):
                            res_publis.append([file_path.stem, el.find('a')['href'].strip().split('.html')[0], idx+1])
                except:
                    pass

                all_publis.append([file_path.stem, activity_type, title, abstract, keywords, doi, year])

columns = ['actID', 'ActivityType', 'Title', 'Abstract', 'Keywords', 'DOI', 'Year']
df_publis = pd.DataFrame(all_publis, columns=columns)

columns = ['actID', 'invID', 'Order']
df_res_publis = pd.DataFrame(res_publis, columns=columns)

100%|██████████| 53063/53063 [09:15<00:00, 95.58it/s] 


In [8]:
print("Number of Publis", len(df_publis))
df_publis.head()

Number of Publis 40243


Unnamed: 0,actID,ActivityType,Title,Abstract,Keywords,DOI,Year
0,act398486,Conference Contributions,Implementation of 2D Domain Decomposition in t...,,,,2012
1,act501456,Conference Contributions,Accessibility Guidelines for Tactile Displays ...,,,https://doi.org/10.1007/978-3-319-94274-2_29,2018
2,act523614,Book Chapters,En la víspera del esplendor: el Washington Cit...,,,,2019
3,act324958,Articles,Practical SPI Planning,This paper presents a practical procedure name...,,https://doi.org/10.1007/978-3-540-85936-9_8,2008
4,act492251,Articles,Fractional-order PID control of a chopper-fed ...,"In this paper, a dynamic control mechanism is ...",chopper-fed dc motor drive; fractional-order p...,https://doi.org/10.1007/s00500-017-2677-5,2017


In [9]:
df_publis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40243 entries, 0 to 40242
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   actID         40243 non-null  object
 1   ActivityType  40243 non-null  object
 2   Title         40243 non-null  object
 3   Abstract      15625 non-null  object
 4   Keywords      15489 non-null  object
 5   DOI           16772 non-null  object
 6   Year          40243 non-null  int64 
dtypes: int64(1), object(6)
memory usage: 2.1+ MB


In [10]:
def clean_surrogates(text):
    if text:
        try:
            # Reemplazar los caracteres de sustitución, o simplemente eliminarlos
            return text.encode('utf-16', 'surrogatepass').decode('utf-16')
            return text
        except:
            return np.nan

# Aplicar esta función a todas las columnas de texto en tu DataFrame
df_publis['Abstract'] = df_publis['Abstract'].apply(clean_surrogates)
df_publis.to_parquet(parquet_data.joinpath('publications.parquet'))

In [11]:
print("Number of Signatures", len(df_res_publis))
df_res_publis.head()

Number of Signatures 48977


Unnamed: 0,actID,invID,Order
0,act398486,inv17843,4
1,act501456,inv15183,2
2,act523614,inv18754,1
3,act324958,inv16849,2
4,act324958,inv15355,3


In [12]:
df_res_publis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48977 entries, 0 to 48976
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actID   48977 non-null  object
 1   invID   48977 non-null  object
 2   Order   48977 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [13]:
df_res_publis.to_parquet(parquet_data.joinpath('researchers_publications.parquet'))

In [None]:
# Access each url given the ID
for n, inv in enumerate(inv_act):
    
    print(f'\nActivity {n+1} out of {len(inv_act)}')
    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{inv}.html')
    auth_url = f'file:///{inv_file}'
    driver.get(auth_url)
    
    # section: articles, book chapters, conference contributions, working papers, projects, ...
    section = driver.find_element(By.CLASS_NAME, 'display-title').text
    valid_sections = ["Articles", "Book Chapters", "Conference Contributions", "Working Papers"]
            
    try:
        if section in valid_sections:
            resID = inv
            # print("Resource ID: ", resID)
            
            # Title of the activity
            title = driver.find_element(By.CLASS_NAME, 'fn').text
            for section_name in valid_sections:
                if title.endswith(section_name):
                    title = title.rsplit(section_name, 1)[0].strip()
                    # print("Title: ", title)
                    break  
                    
            # Publication date
            try:         
                publication_date = driver.find_element(By.XPATH, '//h3[@id="dateTimeValue"]/following-sibling::ul/li').text.strip()
                # print("Publication Date: ", publication_date)
            except:
                publication_date = ""
                # print("Publication Date: ", publication_date)

                
            # Publisher/magazine
            try: 
                if section == "Articles":
                    publication_venue = driver.find_element(By.XPATH, '//h3[@id="hasPublicationVenue"]/following-sibling::ul/li/a').text.strip()
                    # print("Publication Venue: ", publication_venue)
                    
                elif section in ["Book Chapters", "Conference Contributions", "Working Papers"]:
                    publication_venue = driver.find_element(By.XPATH, '//h3[@id="publisher"]/following-sibling::ul/li/a').text.strip()
                    # print("Publisher:", publication_venue)
                    
            except:
                publication_venue = ""
                # print("Publisher:", publication_venue)

                    
            # DOI number
            try: 
                doi = driver.find_element(By.XPATH, '//h3[@id="doi"]/following-sibling::ul/li/a').text.strip()
                # print("DOI:", doi)
            except: 
                doi = ""
                # print("DOI:", doi)

                
            # Abstract
            try:
                abstract = driver.find_element(By.XPATH, '//h3[@id="abstract"]/following-sibling::ul/li').text.strip()
                # print("Abstract: ", abstract)
            except:
                abstract = ""
                # print("Abstract: ", abstract)


                
            # Keywords
            try:
                keywords_list = []
                keywords = driver.find_element(By.XPATH, '//h3[@id="freetextKeyword"]/following-sibling::ul/li').text.strip()
                # Split the string using commas
                keywords_split_by_comma = keywords.split(',')

                # Split each resulting keyword using semicolons
                keywords_list = [keyword.strip() for keyword_with_semicolon in keywords_split_by_comma for keyword in keyword_with_semicolon.split(';')]
                # print("Keywords List: ", keywords_list)
            except:
                keywords_list = []
                # print("Keywords List: ", keywords_list)

                
            # Research Areas
            try: 
                research_areas = driver.find_elements(By.XPATH, '//h3[@id="hasResearchArea"]/following-sibling::ul/li')
                research_areas = [element.text.strip() for element in research_areas]
                # print("Research Areas: ", research_areas)
            except:
                research_areas = []
                # print("Research Areas: ", research_areas)

            
            # Authors IDs (if there is any ID)
            try:
                # Locate the parent <article> element
                article_element = driver.find_element(By.XPATH, '//article[@class="property" and @role="article"]')
                # Locate the <ul> element within the article for authors
                authors_list = article_element.find_element(By.XPATH, '//ul[@role="list" and @id="relatedBy-Authorship-List"]')

                # Get all <li> elements within the authors list
                author_items = authors_list.find_elements(By.XPATH, 'li')

                # Extract the author IDs
                author_ids = []

                for author_order, author_item in enumerate(author_items, start=1):
                    author_name = author_item.text.strip()
                    # print("Author: ", author_name, "with order:", author_order)
                    href_attribute = author_item.find_elements(By.XPATH, 'a')
                    
                    for invID in href_attribute:
                        invID = invID.get_attribute('href').split("inv")[1]
                        author_ids.append(invID)

                        inv_pub.append({'invID': invID, 'pubID': resID, 'orderID': author_order})
        
                # print("Valid Researchers IDs: ", author_ids)

            except:
                author_ids = []
    
            publications.append({'resID': resID, 'section': section, 'title': title, 'doi': doi, 'publication_date': publication_date, 'publisher': publication_venue, 'abstract': abstract, 'keywords': keywords_list, 'research_areas': research_areas})

    except:
        pass
    
    if not n%100:
        with open(relative_path + 'outputs/publications.json', 'w') as f:
            json.dump(publications, f, indent=4)
        with open(relative_path + 'outputs/inv_pub.json', 'w') as f:
            json.dump(inv_pub, f, indent=4)
            
with open(relative_path + 'outputs/publications.json', 'w') as f:
    json.dump(publications, f, indent=4)
with open(relative_path + 'outputs/inv_pub.json', 'w') as f:
    json.dump(inv_pub, f, indent=4)

In [None]:
"""
    # SCRAPING PROJECTS' INFORMATION
    projects = []
    try:
        pub_list = driver.find_element(By.ID,'projectsGroup').find_elements(By.CLASS_NAME, 'property')
        for section in pub_list:
            # section_title: 'principal researcher on', 'researcher on'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            sections = []
            for p in section.find_elements(By.TAG_NAME,'li'):
                element = p.find_element(By.TAG_NAME, 'a')
                # resource ID
                resID = element.get_attribute('href').split('/')[-1][3:]
                
                try:
                    inv_file = Path.cwd().joinpath(f'researchportal.uc3m.es/display/act{resID}.html')
                    auth_url = f'file:///{inv_file}'
                    driver2.get(auth_url)

                    
                    property_list = driver2.find_elements(By.CLASS_NAME, 'property')

                    # Iterar a través de los elementos 'property' para encontrar el abstract
                    for article in property_list:
                        abstract = ""

                        # Verificar si el ID del elemento contiene 'abstract'
                        abstract_elements = article.find_elements(By.ID, "abstract-noRangeClass-List")
                        if abstract_elements:
                            # Extraer el texto del elemento
                            abstract = abstract_elements[0].text
                except:
                    abstract = ''
                    
                title = element.text
                year = p.find_element(By.TAG_NAME, 'span').text
                
                                
                try:
                    funding_entity = p.find_element(By.XPATH, './/a[@title="awarded by"]').text
                
                except:
                    funding_entity = ""
                    
                sections.append({'resID':resID, 'title':title, 'year':year, 'funding_entity': funding_entity, 'abstract': abstract})
            projects.append((section_title, sections))
    except:
        pass
    projects.append({'author':inv, 'projects':dict(projects)})
  
    
    # Conditional check and savings (Periodic backups)
    if not n%100:
        with open(relative_path + 'outputs/researchers.json', 'w') as f:
            json.dump(researchers, f, indent=4)
        with open(relative_path + 'outputs/publications.json', 'w') as f:
            json.dump(publications_data, f, indent=4)
        with open(relative_path + 'outputs/projects.json', 'w') as f:
            json.dump(projects, f, indent=4)

# Always saving (Periodic backups)

with open(relative_path + 'outputs/researchers.json', 'w') as f:
    json.dump(researchers, f, indent=4)
with open(relative_path + 'outputs/publications.json', 'w') as f:
    json.dump(publications, f, indent=4)
with open(relative_path + 'outputs/projects.json', 'w') as f:
    json.dump(projects, f, indent=4) 

"""

# Process JSON files

In [None]:
def process_json(obj):
    '''
    Function to process json recursively
    '''
    resources = []
    auth_res = []
    author = ''
    def process(obj, objType='', author=''):
        if isinstance(obj, dict):
            for k, v in obj.items():     
                if k == 'author':
                    author = v
                if k == 'title':
                    d = {'type':objType.strip()}
                    d.update(obj)
                    resources.append(d)
                    auth_res.append((author, obj['resID']))
                else:
                    if isinstance(v, (dict, list)):
                        process(v, k, author)
        elif isinstance(obj, list):
            for el in obj:
                process(el, objType, author)
    process(obj)
    return resources, auth_res

# Formating our dataset and completing abstracts with 'scopus' database

In [None]:
"""
# Regular expression to extract DOI from the URL
doi_pattern = r'https://doi\.org/(?:http://dx\.doi\.org/)?(.+)'

# Extract DOI values from URLs in the 'doi' column
df_resources['doi'] = df_resources['doi'].str.extract(doi_pattern)
"""

### Formating 'df_resources' database: homogenizing NAs and filtering observations

In [None]:
"""
# Homogenizing the NAs to the same format
# Lista de nombres de las columnas en las que deseas reemplazar los valores nulos o vacíos por NaN
columns_to_process = ['doi', 'title', 'year', 'abstract', 'funding_entity', 'type']

# Iterar sobre las columnas y reemplazar los valores nulos o vacíos por NaN
for column in columns_to_process:
    df_resources[column] = df_resources[column].apply(lambda x: np.nan if x in [None, ''] else x)
    
"""

In [None]:
"""
# Filter the dataset by observations that have at least doi or at least abstract (so, an observation that can be completed if it has no abstract)
filtered_df_resources = df_resources[(df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & df_resources['abstract'].isna())]
filtered_df_resources.reset_index(drop=True, inplace=True)
"""

### Formating Scopus database

In [None]:
"""
data = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')

filtered_data = data[['doi', 'description']]
"""

### Joining databases

In [None]:
"""
# Perform a left join on 'doi' column
merged_df = pd.merge(filtered_df_resources, filtered_data, on='doi', how='left')
#merged_df

# Llenar NaN en la columna 'abstract' con el valor del abstract de SCOPUS ('description' column) si 'abstract' está vacía
merged_df['abstract'] = merged_df['abstract'].combine_first(merged_df['description'])

# Eliminamos los duplicados
merged_df.drop_duplicates(subset='resID', keep='first', inplace=True)

merged_df = merged_df.drop(columns=['description'])

# Save the merged table as a CSV file
merged_df.to_csv(relative_path + 'outputs/merged_table.csv', index=False)

"""