# Import libraries and dependencies

In [1]:
from selenium import webdriver
import time
import pandas as pd
import json  
from pathlib import Path
import requests
import numpy as np
import time
# import langid

In [2]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.chrome.service import Service

# Download all researchers from Research Portal

This step was executed in the terminal to download all the resources locally, which avoids looping over the same resources repeatedly and being denied access to the Research Portal.

*Further information about this procedure can be found in the README.txt.*

In [3]:
# Crawl configuration shared by the download and processing cells.
# NOTE(review): not_found_txt is not referenced anywhere in the visible notebook.
not_found_txt = 'Individual not found'
# Local mirror root; written as a string prefix, so the trailing slash matters.
relative_path = '/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/'
# Researcher page URL prefix; the numeric ID is appended directly to it.
baseURL = 'https://researchportal.uc3m.es/display/inv'
# IDs whose page request succeeded (filled by the download loop).
valid_ids = list()

In [4]:
# Probe every candidate researcher ID and mirror the pages that exist.
# One request per second keeps the load on the portal low.
for idx in range(10000, 50000):
    idx_str = str(idx)
    try:
        # A timeout prevents a single stalled connection from hanging the crawl.
        portal_page = requests.get(baseURL + idx_str, timeout=30)
    except requests.RequestException as exc:
        # Best effort: report the failure and move on instead of losing the whole run.
        print(f'\nRequest failed for {idx_str}: {exc}')
        time.sleep(1)
        continue
    if portal_page.ok:
        valid_ids.append(idx_str)
        # Explicit encoding so the write does not depend on the platform default.
        with open(relative_path + 'display/inv' + idx_str + '.html', 'w', encoding='utf-8') as fout:
            fout.write(portal_page.text)
    # '\r' with end='' rewrites the same output line as a progress indicator.
    print('Researchers so far: ' + str(len(valid_ids)) + ' (' + idx_str + ')\r', end='')
    time.sleep(1)

Researchers so far: 1394 (49999)

In [5]:
# Persist the valid researcher IDs, one per line (no trailing newline).
Path(relative_path + 'all_inv.txt').write_text('\n'.join(valid_ids))

# Loading the data

In [6]:
# Read back the researcher IDs saved by the download step, one ID per line.
inv_ids = Path(relative_path + 'all_inv.txt').read_text().splitlines()

In [7]:
# Display the loaded researcher IDs (each entry is a string read from all_inv.txt).
inv_ids

['14905',
 '14923',
 '14927',
 '14954',
 '14975',
 '14995',
 '14999',
 '15001',
 '15013',
 '15021',
 '15024',
 '15026',
 '15089',
 '15106',
 '15149',
 '15160',
 '15168',
 '15183',
 '15186',
 '15194',
 '15206',
 '15213',
 '15217',
 '15225',
 '15233',
 '15236',
 '15238',
 '15262',
 '15284',
 '15290',
 '15291',
 '15292',
 '15317',
 '15319',
 '15341',
 '15355',
 '15356',
 '15360',
 '15363',
 '15367',
 '15371',
 '15373',
 '15374',
 '15376',
 '15380',
 '15385',
 '15388',
 '15399',
 '15401',
 '15445',
 '15456',
 '15457',
 '15460',
 '15461',
 '15463',
 '15466',
 '15468',
 '15525',
 '15543',
 '15548',
 '15553',
 '15609',
 '15616',
 '15621',
 '15622',
 '15623',
 '15625',
 '15630',
 '15631',
 '15633',
 '15636',
 '15639',
 '15641',
 '15645',
 '15654',
 '15663',
 '15681',
 '15693',
 '15704',
 '15720',
 '15755',
 '15762',
 '15765',
 '15770',
 '15783',
 '15831',
 '15871',
 '15872',
 '15873',
 '15875',
 '15877',
 '15878',
 '15904',
 '15905',
 '15939',
 '15959',
 '15962',
 '15966',
 '15989',
 '16012',


# Loading the driver and browser

In [8]:
# Headless Chrome configuration shared by both browser sessions.
options = webdriver.ChromeOptions()
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(flag)

# Two separate sessions are started with the same options: `driver` first, then `driver2`.
driver, driver2 = (webdriver.Chrome(options=options) for _ in range(2))

# Data scraping

In [9]:
# One independent accumulator list per scraped entity; each gets one record per researcher.
authors_data, publications_data, projects_data, others_data = ([] for _ in range(4))

In [10]:
# Scrape every locally mirrored researcher page (one per ID in inv_ids) and
# append one record per researcher to authors_data, publications_data,
# projects_data and others_data. `driver` holds the researcher page while
# `driver2` loads the linked resource pages, so elements located on the
# researcher page stay valid during the nested lookups.


def _try(getter, default=''):
    """Return ``getter()``, or ``default`` if it raises.

    Used for optional page fields: Selenium raises when an element is absent,
    and a missing field is stored as an empty value rather than aborting the
    run. ``Exception`` is caught (not a bare ``except``) so Ctrl-C still works.
    """
    try:
        return getter()
    except Exception:
        return default


def _local_page_uri(prefix, page_id):
    """Return the file:// URI of a mirrored page under the current working directory.

    ``prefix`` is 'inv' for researcher pages and 'act' for resource pages.
    ``Path.as_uri`` yields a well-formed, percent-encoded URI.
    """
    page = Path.cwd().joinpath(f'researchportal.uc3m.es/display/{prefix}{page_id}.html')
    return page.as_uri()


def _scrape_abstract(res_id, first_item_only=True):
    """Load the mirrored page of resource ``res_id`` in ``driver2`` and return its abstract.

    The first 'property' element containing the abstract list is used.
    With ``first_item_only`` the text of the list's first ``li`` is returned
    (publications); otherwise the text of the whole list element (projects).
    Returns '' when no abstract is found or when any lookup fails.
    """
    try:
        driver2.get(_local_page_uri('act', res_id))
        for prop in driver2.find_elements(By.CLASS_NAME, 'property'):
            found = prop.find_elements(By.ID, 'abstract-noRangeClass-List')
            if not found:
                continue
            if not first_item_only:
                return found[0].text
            items = found[0].find_elements(By.TAG_NAME, 'li')
            if items:
                return items[0].text
    except Exception:
        return ''
    return ''


def _dump_json(data, filename):
    """Write ``data`` as indented JSON to ``outputs/<filename>`` under relative_path."""
    with open(relative_path + 'outputs/' + filename, 'w') as f:
        json.dump(data, f, indent=4)


for n, inv in enumerate(inv_ids):
    print(f'Researcher {n+1} out of {len(inv_ids)}')
    driver.get(_local_page_uri('inv', inv))

    # SCRAPING RESEARCHERS' INFORMATION
    # The text is read inside the guarded lookup, so a missing name becomes ''
    # instead of failing later on ''.text.
    name = _try(lambda: driver.find_element(By.XPATH, '//span[@itemprop="name"]').text)
    cat = _try(lambda: driver.find_element(By.CLASS_NAME, 'categoriainv').text.split(': ')[1])
    orcid = _try(
        lambda: driver.find_element(By.CLASS_NAME, 'individual-orcid')
        .find_element(By.TAG_NAME, 'a')
        .get_attribute('href')
    )
    scopus = _try(
        lambda: driver.find_element(By.ID, 'scopusId-noRangeClass-List')
        .find_element(By.TAG_NAME, 'a')
        .text
    )
    email = _try(
        lambda: driver.find_element(By.CLASS_NAME, 'individual-emails')
        .find_element(By.TAG_NAME, 'a')
        .text
    )

    # Position: Academic department, Research group, Institute, ...
    # Each entry reads "<type> : <name>"; a repeated type keeps only its last name.
    positions = {}
    for p in _try(lambda: driver.find_elements(By.CLASS_NAME, 'currentPosition'), default=[]):
        group = p.text.split(' : ')
        positions[group[0]] = group[1] if len(group) > 1 else ''

    # One research area per line of the element's text.
    subject = _try(lambda: driver.find_element(By.ID, 'individual-hasResearchArea').text)
    subjects = subject.splitlines()

    authors_data.append({'invID': inv, 'name': name, 'orcid': orcid, 'scopus': scopus,
                         'category': cat, 'email': email, 'positions': positions,
                         'subjects': subjects})

    # SCRAPING PUBLICATIONS' INFORMATION
    # Result shape: {section title: {publication type: [article records]}}.
    publications = {}
    try:
        pub_group = driver.find_element(By.ID, 'publicationsGroup')
        for section in pub_group.find_elements(By.CLASS_NAME, 'property'):
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            by_type = {}

            # pub_type: articles, book chapters, books, conference contributions, ...
            for pub in section.find_elements(By.CLASS_NAME, 'subclass'):
                articles = []
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text

                for item in pub.find_elements(By.TAG_NAME, 'li'):
                    link = item.find_element(By.TAG_NAME, 'a')
                    title = link.text
                    year = item.find_element(By.TAG_NAME, 'span').text

                    try:
                        doi = item.find_element(By.CLASS_NAME, "altmetric-embed").get_attribute("data-doi")
                        # Normalize every DOI to the full https://doi.org/ URL form.
                        if not doi.startswith("https://doi.org/"):
                            doi = f"https://doi.org/{doi}"
                    except Exception:
                        doi = ""

                    # resID: last URL segment without its first three characters
                    # (presumably a type prefix such as 'act' - confirm against the portal URLs).
                    resID = link.get_attribute('href').split('/')[-1][3:]
                    abstract = _scrape_abstract(resID)

                    articles.append({'resID': resID, 'doi': doi, 'title': title, 'year': year, 'abstract': abstract})
                by_type[pub_type] = articles
            publications[section_title] = by_type
    except Exception:
        # Best effort: researchers without this group (or with a malformed entry)
        # keep whatever sections were completed before the failure.
        pass

    publications_data.append({'author': inv, 'publications': publications})

    # SCRAPING PROJECTS' INFORMATION
    # Result shape: {section title: [project records]}.
    projects = {}
    try:
        proj_group = driver.find_element(By.ID, 'projectsGroup')
        for section in proj_group.find_elements(By.CLASS_NAME, 'property'):
            # section_title: 'principal researcher on', 'researcher on'
            section_title = section.find_element(By.TAG_NAME, 'h3').text
            entries = []
            for item in section.find_elements(By.TAG_NAME, 'li'):
                link = item.find_element(By.TAG_NAME, 'a')
                resID = link.get_attribute('href').split('/')[-1][3:]

                # The abstract always starts from '' for each project, so a value
                # from a previously scraped resource can never leak into this record.
                abstract = _scrape_abstract(resID, first_item_only=False)

                title = link.text
                year = item.find_element(By.TAG_NAME, 'span').text
                funding_entity = _try(lambda: item.find_element(By.XPATH, './/a[@title="awarded by"]').text)

                entries.append({'resID': resID, 'title': title, 'year': year, 'funding_entity': funding_entity, 'abstract': abstract})
            projects[section_title] = entries
    except Exception:
        pass
    projects_data.append({'author': inv, 'projects': projects})

    # SCRAPING OTHERS' INFORMATION
    # Result shape: {activity type: [records]}; the section heading is not part of the output.
    others = {}
    try:
        other_group = driver.find_element(By.ID, 'otherGroup')
        for section in other_group.find_elements(By.CLASS_NAME, 'property'):
            for pub in section.find_elements(By.CLASS_NAME, 'subclass'):
                articles = []
                # pub_type: 'mobility', 'supervised thesis', ...
                pub_type = pub.find_element(By.TAG_NAME, 'h3').text

                for item in pub.find_elements(By.TAG_NAME, 'li'):
                    link = item.find_element(By.TAG_NAME, 'a')
                    resID = link.get_attribute('href').split('/')[-1][3:]
                    articles.append({'resID': resID, 'title': link.text})
                others[pub_type] = articles
    except Exception:
        pass
    others_data.append({'author': inv, 'others': others})

    # Periodic backup every 100 researchers (including the very first one).
    # NOTE(review): three of these backup names carry a '_data' suffix, unlike the
    # publications.json / projects.json / others.json files the processing cells read.
    if n % 100 == 0:
        _dump_json(authors_data, 'researchers.json')
        _dump_json(publications_data, 'publications_data.json')
        _dump_json(projects_data, 'projects_data.json')
        _dump_json(others_data, 'others_data.json')

# Final save, run unconditionally after the scraping loop: one JSON file per
# accumulator list, written under <relative_path>outputs/.
for payload, json_name in (
    (authors_data, 'outputs/researchers.json'),
    (publications_data, 'outputs/publications.json'),
    (projects_data, 'outputs/projects.json'),
    (others_data, 'outputs/others.json'),
):
    with open(relative_path + json_name, 'w') as f:
        json.dump(payload, f, indent=4)

Researcher 1 out of 1394
Researcher 2 out of 1394
Researcher 3 out of 1394
Researcher 4 out of 1394
Researcher 5 out of 1394
Researcher 6 out of 1394
Researcher 7 out of 1394
Researcher 8 out of 1394
Researcher 9 out of 1394
Researcher 10 out of 1394
Researcher 11 out of 1394
Researcher 12 out of 1394
Researcher 13 out of 1394
Researcher 14 out of 1394
Researcher 15 out of 1394
Researcher 16 out of 1394
Researcher 17 out of 1394
Researcher 18 out of 1394
Researcher 19 out of 1394
Researcher 20 out of 1394
Researcher 21 out of 1394
Researcher 22 out of 1394
Researcher 23 out of 1394
Researcher 24 out of 1394
Researcher 25 out of 1394
Researcher 26 out of 1394
Researcher 27 out of 1394
Researcher 28 out of 1394
Researcher 29 out of 1394
Researcher 30 out of 1394
Researcher 31 out of 1394
Researcher 32 out of 1394
Researcher 33 out of 1394
Researcher 34 out of 1394
Researcher 35 out of 1394
Researcher 36 out of 1394
Researcher 37 out of 1394
Researcher 38 out of 1394
Researcher 39 out of 

Researcher 309 out of 1394
Researcher 310 out of 1394
Researcher 311 out of 1394
Researcher 312 out of 1394
Researcher 313 out of 1394
Researcher 314 out of 1394
Researcher 315 out of 1394
Researcher 316 out of 1394
Researcher 317 out of 1394
Researcher 318 out of 1394
Researcher 319 out of 1394
Researcher 320 out of 1394
Researcher 321 out of 1394
Researcher 322 out of 1394
Researcher 323 out of 1394
Researcher 324 out of 1394
Researcher 325 out of 1394
Researcher 326 out of 1394
Researcher 327 out of 1394
Researcher 328 out of 1394
Researcher 329 out of 1394
Researcher 330 out of 1394
Researcher 331 out of 1394
Researcher 332 out of 1394
Researcher 333 out of 1394
Researcher 334 out of 1394
Researcher 335 out of 1394
Researcher 336 out of 1394
Researcher 337 out of 1394
Researcher 338 out of 1394
Researcher 339 out of 1394
Researcher 340 out of 1394
Researcher 341 out of 1394
Researcher 342 out of 1394
Researcher 343 out of 1394
Researcher 344 out of 1394
Researcher 345 out of 1394
R

Researcher 613 out of 1394
Researcher 614 out of 1394
Researcher 615 out of 1394
Researcher 616 out of 1394
Researcher 617 out of 1394
Researcher 618 out of 1394
Researcher 619 out of 1394
Researcher 620 out of 1394
Researcher 621 out of 1394
Researcher 622 out of 1394
Researcher 623 out of 1394
Researcher 624 out of 1394
Researcher 625 out of 1394
Researcher 626 out of 1394
Researcher 627 out of 1394
Researcher 628 out of 1394
Researcher 629 out of 1394
Researcher 630 out of 1394
Researcher 631 out of 1394
Researcher 632 out of 1394
Researcher 633 out of 1394
Researcher 634 out of 1394
Researcher 635 out of 1394
Researcher 636 out of 1394
Researcher 637 out of 1394
Researcher 638 out of 1394
Researcher 639 out of 1394
Researcher 640 out of 1394
Researcher 641 out of 1394
Researcher 642 out of 1394
Researcher 643 out of 1394
Researcher 644 out of 1394
Researcher 645 out of 1394
Researcher 646 out of 1394
Researcher 647 out of 1394
Researcher 648 out of 1394
Researcher 649 out of 1394
R

Researcher 917 out of 1394
Researcher 918 out of 1394
Researcher 919 out of 1394
Researcher 920 out of 1394
Researcher 921 out of 1394
Researcher 922 out of 1394
Researcher 923 out of 1394
Researcher 924 out of 1394
Researcher 925 out of 1394
Researcher 926 out of 1394
Researcher 927 out of 1394
Researcher 928 out of 1394
Researcher 929 out of 1394
Researcher 930 out of 1394
Researcher 931 out of 1394
Researcher 932 out of 1394
Researcher 933 out of 1394
Researcher 934 out of 1394
Researcher 935 out of 1394
Researcher 936 out of 1394
Researcher 937 out of 1394
Researcher 938 out of 1394
Researcher 939 out of 1394
Researcher 940 out of 1394
Researcher 941 out of 1394
Researcher 942 out of 1394
Researcher 943 out of 1394
Researcher 944 out of 1394
Researcher 945 out of 1394
Researcher 946 out of 1394
Researcher 947 out of 1394
Researcher 948 out of 1394
Researcher 949 out of 1394
Researcher 950 out of 1394
Researcher 951 out of 1394
Researcher 952 out of 1394
Researcher 953 out of 1394
R

Researcher 1213 out of 1394
Researcher 1214 out of 1394
Researcher 1215 out of 1394
Researcher 1216 out of 1394
Researcher 1217 out of 1394
Researcher 1218 out of 1394
Researcher 1219 out of 1394
Researcher 1220 out of 1394
Researcher 1221 out of 1394
Researcher 1222 out of 1394
Researcher 1223 out of 1394
Researcher 1224 out of 1394
Researcher 1225 out of 1394
Researcher 1226 out of 1394
Researcher 1227 out of 1394
Researcher 1228 out of 1394
Researcher 1229 out of 1394
Researcher 1230 out of 1394
Researcher 1231 out of 1394
Researcher 1232 out of 1394
Researcher 1233 out of 1394
Researcher 1234 out of 1394
Researcher 1235 out of 1394
Researcher 1236 out of 1394
Researcher 1237 out of 1394
Researcher 1238 out of 1394
Researcher 1239 out of 1394
Researcher 1240 out of 1394
Researcher 1241 out of 1394
Researcher 1242 out of 1394
Researcher 1243 out of 1394
Researcher 1244 out of 1394
Researcher 1245 out of 1394
Researcher 1246 out of 1394
Researcher 1247 out of 1394
Researcher 1248 out 

# Process JSON files

In [11]:
def process_json(obj):
    '''
    Flatten a nested JSON structure into resource records and author links.

    The structure is walked recursively. Every dict that has a 'title' key is
    treated as a resource: a copy of it, with an extra 'type' entry set to the
    (stripped) key under which it was nested, is appended to the resources
    list, and an (author, resID) pair is appended to the links list. The
    author is the value of the nearest enclosing 'author' key, or '' if none.

    Parameters
    ----------
    obj : dict or list
        Parsed JSON data (nested dicts and lists).

    Returns
    -------
    tuple (resources, auth_res)
        resources : list of dict, one per resource found.
        auth_res : list of (author, resID) tuples, in the same order.

    Raises
    ------
    KeyError
        If a dict with a 'title' key has no 'resID' key.
    '''
    resources = []
    auth_res = []

    def process(node, objType='', author=''):
        if isinstance(node, dict):
            # Resolve the author before visiting any value, so attribution does
            # not depend on 'author' preceding the nested payload in key order.
            if 'author' in node:
                author = node['author']
            for k, v in node.items():
                if k == 'title':
                    record = {'type': objType.strip()}
                    record.update(node)
                    resources.append(record)
                    auth_res.append((author, node['resID']))
                elif isinstance(v, (dict, list)):
                    # The key becomes the 'type' of any resource found below it.
                    process(v, k, author)
        elif isinstance(node, list):
            for el in node:
                process(el, objType, author)

    process(obj)
    return resources, auth_res

### Create tables to match author ID (invID) to resource ID (resID)

In [12]:
# Researchers table: flatten the nested JSON records into columns.
with open(relative_path + 'outputs/researchers.json', 'r') as f:
    data = json.load(f)
    df_researchers = pd.json_normalize(data)

# Keep the first column as 'invID'; for the others keep only the part after the
# last dot of the flattened name, lower-cased with spaces turned into underscores.
flat_names = [col.split('.')[-1].replace(' ', '_').lower() for col in df_researchers.columns[1:]]
df_researchers.columns = ['invID'] + flat_names

# Resource records and author-resource links: `auth_res` collects all links,
# the three auth_res_* lists collect them per source file.
resources = []
auth_res = []
auth_res_publications = []
auth_res_projects = []
auth_res_others = []

for json_name, links in (
    ('outputs/publications.json', auth_res_publications),
    ('outputs/projects.json', auth_res_projects),
    ('outputs/others.json', auth_res_others),
):
    with open(relative_path + json_name, 'r') as f:
        data = json.load(f)
    res, a_r = process_json(data)
    resources.extend(res)
    auth_res.extend(a_r)
    links.extend(a_r)

# One row per resource ID, with the first column moved to the last position.
df_resources = pd.DataFrame(resources).drop_duplicates(subset='resID')
ordered_cols = list(df_resources.columns)
df_resources = df_resources[ordered_cols[1:] + ordered_cols[:1]]

In [13]:
# Author-resource link tables: the combined one plus one per source,
# all with the same two columns.
link_columns = ['invID', 'resID']
df_auth_res, df_auth_res_publications, df_auth_res_projects, df_auth_res_others = (
    pd.DataFrame(pairs, columns=link_columns)
    for pairs in (auth_res, auth_res_publications, auth_res_projects, auth_res_others)
)

# Save as a CSV file

In [14]:
# Write each table as CSV (without the index) under <relative_path>outputs/.
csv_exports = {
    'outputs/researchers.csv': df_researchers,
    'outputs/resources.csv': df_resources,
    'outputs/auth_res_publications.csv': df_auth_res_publications,
    'outputs/auth_res_projects.csv': df_auth_res_projects,
    'outputs/auth_res_others.csv': df_auth_res_others,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(relative_path + csv_name, index=False)

# Save as a parquet file

In [15]:
# Write each table as parquet (without the index) under <relative_path>outputs/.
# relative_path already ends with '/', so every name is given without a leading
# slash; this avoids a '//' in the auth_res path.
parquet_exports = {
    'outputs/researchers.parquet': df_researchers,
    'outputs/resources.parquet': df_resources,
    'outputs/auth_res.parquet': df_auth_res,
    'outputs/auth_res_publications.parquet': df_auth_res_publications,
    'outputs/auth_res_projects.parquet': df_auth_res_projects,
    'outputs/auth_res_others.parquet': df_auth_res_others,
}
for parquet_name, frame in parquet_exports.items():
    frame.to_parquet(relative_path + parquet_name, index=False)

# Formatting our dataset and completing abstracts with the 'scopus' database

In [16]:
# Disabled step: the code below is wrapped in a string literal, so it is not executed.
"""
# Regular expression to extract DOI from the URL
doi_pattern = r'https://doi\.org/(?:http://dx\.doi\.org/)?(.+)'

# Extract DOI values from URLs in the 'doi' column
df_resources['doi'] = df_resources['doi'].str.extract(doi_pattern)
"""

"\n# Regular expression to extract DOI from the URL\ndoi_pattern = r'https://doi\\.org/(?:http://dx\\.doi\\.org/)?(.+)'\n\n# Extract DOI values from URLs in the 'doi' column\ndf_resources['doi'] = df_resources['doi'].str.extract(doi_pattern)\n"

### Formatting 'df_resources' database: homogenizing NAs and filtering observations

In [17]:
# Disabled step: the code below is wrapped in a string literal, so it is not executed.
"""
# Homogenizing the NAs to the same format
# Lista de nombres de las columnas en las que deseas reemplazar los valores nulos o vacíos por NaN
columns_to_process = ['doi', 'title', 'year', 'abstract', 'funding_entity', 'type']

# Iterar sobre las columnas y reemplazar los valores nulos o vacíos por NaN
for column in columns_to_process:
    df_resources[column] = df_resources[column].apply(lambda x: np.nan if x in [None, ''] else x)
    
"""

"\n# Homogenizing the NAs to the same format\n# Lista de nombres de las columnas en las que deseas reemplazar los valores nulos o vacíos por NaN\ncolumns_to_process = ['doi', 'title', 'year', 'abstract', 'funding_entity', 'type']\n\n# Iterar sobre las columnas y reemplazar los valores nulos o vacíos por NaN\nfor column in columns_to_process:\n    df_resources[column] = df_resources[column].apply(lambda x: np.nan if x in [None, ''] else x)\n    \n"

In [18]:
# Disabled step: the code below is wrapped in a string literal, so it is not executed.
"""
# Filter the dataset by observations that have at least doi or at least abstract (so, an observation that can be completed if it has no abstract)
filtered_df_resources = df_resources[(df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & df_resources['abstract'].isna())]
filtered_df_resources.reset_index(drop=True, inplace=True)
"""

"\n# Filter the dataset by observations that have at least doi or at least abstract (so, an observation that can be completed if it has no abstract)\nfiltered_df_resources = df_resources[(df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & ~df_resources['abstract'].isna()) | (~df_resources['doi'].isna() & df_resources['abstract'].isna())]\nfiltered_df_resources.reset_index(drop=True, inplace=True)\n"

### Formatting Scopus database

In [19]:
# Disabled step: the code below is wrapped in a string literal, so it is not executed.
"""
data = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')

filtered_data = data[['doi', 'description']]
"""

"\ndata = pd.read_parquet('/Users/lcsanchez/Desktop/Research/Scopus/scopus_data.parquet')\n\nfiltered_data = data[['doi', 'description']]\n"

### Joining databases

In [20]:
# Disabled step: the code below is wrapped in a string literal, so it is not executed.
"""
# Perform a left join on 'doi' column
merged_df = pd.merge(filtered_df_resources, filtered_data, on='doi', how='left')
#merged_df

# Llenar NaN en la columna 'abstract' con el valor del abstract de SCOPUS ('description' column) si 'abstract' está vacía
merged_df['abstract'] = merged_df['abstract'].combine_first(merged_df['description'])

# Eliminamos los duplicados
merged_df.drop_duplicates(subset='resID', keep='first', inplace=True)

merged_df = merged_df.drop(columns=['description'])

# Save the merged table as a CSV file
merged_df.to_csv(relative_path + 'outputs/merged_table.csv', index=False)

"""

"\n# Perform a left join on 'doi' column\nmerged_df = pd.merge(filtered_df_resources, filtered_data, on='doi', how='left')\n#merged_df\n\n# Llenar NaN en la columna 'abstract' con el valor del abstract de SCOPUS ('description' column) si 'abstract' está vacía\nmerged_df['abstract'] = merged_df['abstract'].combine_first(merged_df['description'])\n\n# Eliminamos los duplicados\nmerged_df.drop_duplicates(subset='resID', keep='first', inplace=True)\n\nmerged_df = merged_df.drop(columns=['description'])\n\n# Save the merged table as a CSV file\nmerged_df.to_csv(relative_path + 'outputs/merged_table.csv', index=False)\n\n"