# 1. Import libraries and dependencies

In [2]:
import time
import pandas as pd
import json  
from pathlib import Path
import numpy as np
import time
import pyarrow.parquet as pq
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

# 2. Define input and output directories

In [12]:
# raw_data2 = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/rawdata/researchportal.uc3m.es/display")
# parquet_data2 = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/parquet")

raw_data = Path("/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/display")
parquet_data = Path("/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet")

parquet_data.mkdir(parents=True, exist_ok=True)

# 3. Table `researchers`

In [4]:
all_researchers_files = list(raw_data.glob("inv*"))
all_researchers = []

for file_path in tqdm(all_researchers_files):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")

            element = soup.find(class_="categoriainv")

            # Nombre
            try:
                element = soup.find('span', {'itemprop': 'name'})
                name = element.get_text().strip().title()
            except:
                name = None
            
            # Categoría del Investigador
            try:
                element = soup.find(class_="categoriainv")
                cat = element.get_text().split(': ')[1].strip()
            except:
                cat = None

            # ORCID
            try:
                element = soup.find(class_='individual-orcid')
                # Buscar dentro de este elemento el primer enlace y obtener su 'href'
                enlace = element.find('a')
                if enlace and 'href' in enlace.attrs:
                    orcid = enlace['href'].split('/')[-1].strip()
            except:
                orcid = None

            # Scopus
            try:
                element = soup.find(id='scopusId-noRangeClass-List')
                scopus = element.find('a').get_text().strip()
            except:
                scopus = None
                

            # Positions
            try:
                element = soup.find_all(class_='currentPosition')
                for el in element:
                    if "Academic Department" in el.get_text():
                        dep = el.find('span', itemprop='name').get_text().strip()
                    if "Research Group" in el.get_text():
                        res_group = el.find('span', itemprop='name').get_text().strip()
            except:
                dep = None
                res_group = None

            # Research Areas
            try:
                element = soup.find('ul', {'id': 'individual-hasResearchArea'})
                subject = []
                # Si se encuentra el elemento ul, extraer los textos de los elementos li
                list_items = element.find_all('li', role="listitem")
                for item in list_items:
                    subject.append(item.get_text(strip=True))
            except:
                subject = None

            # email
            try:
                element = soup.find(class_='individual-emails')
                email = element.find('a').get_text().strip()
            except:
                email = None

            # Number of publications
            try:
                element = soup.find(id='publicationsGroup').find_all('a')
                element = [el for el in element if el.get('title') == "resource name"]
                n_publis = len(element)
            except:
                n_publis = 0

            # Projects_IP (investigador principal)
            try:
                element = soup.find(id='RO_0000053-PrincipalInvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_IP = len(element)
            except:
                n_IP = 0

            # Projects_noIP (no investigador principal)
            try:
                element = soup.find(id='RO_0000053-InvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_noIP = len(element)
            except:
                n_noIP = 0

        all_researchers.append([file_path.stem, name, email, cat, orcid, scopus, dep, res_group, subject, n_publis, n_IP, n_noIP])

columns = ['invID', 'Name', 'Email', 'Category','ORCID', 'Scopus', 'Department', 'Research Group', 'Subjects', 'no Publis', 'Projects IP', 'Projects no IP']
df = pd.DataFrame(all_researchers, columns=columns)

100%|███████████████████████████████████████| 1402/1402 [00:24<00:00, 56.26it/s]


In [5]:
print("Number of Researchers", len(df))
df.head()

Number of Researchers 1402


Unnamed: 0,invID,Name,Email,Category,ORCID,Scopus,Department,Research Group,Subjects,no Publis,Projects IP,Projects no IP
0,inv47957,"Boye, Celian",cboye@ing.uc3m.es,PhD Candidate,,,Aerospace Engineering,Plasmas and Space Propulsion Team (EP2),[Aeronautics],1,0,1
1,inv24080,"Ciller Tenreiro, Maria Carmen",mciller@hum.uc3m.es,Associate Professor,0000-0002-8156-2034,44461256900.0,Communication Studies,"Television-Cinema: memory, representation and ...",[Information Science],19,6,13
2,inv47054,"Alvarez San Roman, Mercedes",meralvar@hum.uc3m.es,Assistant Professor,0000-0001-9178-6835,,Communication Studies,"Television-Cinema: memory, representation and ...",[Information Science],17,0,2
3,inv47111,"Rey Lopez, Alejandro",alreyl@pa.uc3m.es,PhD Candidate,0000-0003-2239-6942,57226877780.0,Computer Science and Engineering,"Television-Cinema: memory, representation and ...","[Computer Science, Education, Information Scie...",2,0,0
4,inv18803,"Carrero Dominguez, Maria Carmen",ccarrero@der-pr.uc3m.es,Associate Professor,0000-0002-6320-505X,,Private Social and International Law,Research Group on Social Security and the Prev...,"[Education, Law]",36,0,7


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1402 entries, 0 to 1401
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   invID           1402 non-null   object
 1   Name            1402 non-null   object
 2   Email           1402 non-null   object
 3   Category        1389 non-null   object
 4   ORCID           1155 non-null   object
 5   Scopus          935 non-null    object
 6   Department      1402 non-null   object
 7   Research Group  1402 non-null   object
 8   Subjects        1125 non-null   object
 9   no Publis       1402 non-null   int64 
 10  Projects IP     1402 non-null   int64 
 11  Projects no IP  1402 non-null   int64 
dtypes: int64(3), object(9)
memory usage: 131.6+ KB


In [7]:
df.to_parquet(parquet_data.joinpath('researchers.parquet'))

In [8]:
file_p = "/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet/researchers.parquet"
df = pd.read_parquet(file_p)
df

Unnamed: 0,invID,Name,Email,Category,ORCID,Scopus,Department,Research Group,Subjects,no Publis,Projects IP,Projects no IP
0,inv47957,"Boye, Celian",cboye@ing.uc3m.es,PhD Candidate,,,Aerospace Engineering,Plasmas and Space Propulsion Team (EP2),[Aeronautics],1,0,1
1,inv24080,"Ciller Tenreiro, Maria Carmen",mciller@hum.uc3m.es,Associate Professor,0000-0002-8156-2034,44461256900,Communication Studies,"Television-Cinema: memory, representation and ...",[Information Science],19,6,13
2,inv47054,"Alvarez San Roman, Mercedes",meralvar@hum.uc3m.es,Assistant Professor,0000-0001-9178-6835,,Communication Studies,"Television-Cinema: memory, representation and ...",[Information Science],17,0,2
3,inv47111,"Rey Lopez, Alejandro",alreyl@pa.uc3m.es,PhD Candidate,0000-0003-2239-6942,57226877780,Computer Science and Engineering,"Television-Cinema: memory, representation and ...","[Computer Science, Education, Information Scie...",2,0,0
4,inv18803,"Carrero Dominguez, Maria Carmen",ccarrero@der-pr.uc3m.es,Associate Professor,0000-0002-6320-505X,,Private Social and International Law,Research Group on Social Security and the Prev...,"[Education, Law]",36,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...
1397,inv37512,"Fernandez Garcia, Eva",evfernan@clio.uc3m.es,Associate Professor,0000-0002-7729-7792,55990690100,Social Sciences,Histories of Global Latin Capitalisms (H-GLACIAL),"[Economics, History]",12,0,7
1398,inv40585,"Dias Dos Reis, Samira",sdias@emp.uc3m.es,Associate Professor,0000-0002-5910-5689,55635842100,Business Administration,"Group on Company Innovation, Organization and ...","[Business, Economics]",14,2,3
1399,inv15875,"Borrajo Millan, Daniel",dborrajo@ia.uc3m.es,Full Professor,0000-0001-5282-0463,6603908284,Computer Science and Engineering,Planning and Learning Group,[Computer Science],93,18,6
1400,inv18314,"Sanchez Segura, Maria Isabel",misanche@inf.uc3m.es,Full Professor,0000-0002-2339-7851,6506529394,Computer Science and Engineering,Knowledge Reusing,"[Computer Science, Education, Telecommunications]",83,7,13


# 4. Tables `publications` and `researchers_publications`

In [10]:
all_activities = list(raw_data.glob("act*"))

all_publis = []
res_publis = []

nact = 0

for file_path in tqdm(all_activities):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")
            activity_type = soup.find(class_="display-title").get_text().strip()
            if activity_type in ['Articles', 'Book Chapters', 'Conference Contributions']:
                
                # Título
                try: 
                    title = soup.title.get_text()
                except:
                    title = None
                
                # DOI
                try:
                    doi = soup.find('a', title="Digital Object Identifier (DOI)").get_text()
                except:
                    doi = None

                # Year
                try:
                    year = int(soup.find(id='dateTimeValue-DateTimeValue-List').get_text().strip().split()[-1])
                except:
                    year = np.nan

                # Abstract
                try:
                    abstract = soup.find(id='abstract-noRangeClass-List').get_text().strip()
                except:
                    abstract = None

                # Keywords
                try:
                    keywords = soup.find(id='freetextKeyword-noRangeClass-List').get_text().strip()
                except:
                    keywords = None
                
                # Research Areas
                try:
                    research_areas_element = soup.find('h3', {'id': 'hasResearchArea'})
                    research_areas_list = research_areas_element.find_next_sibling('ul').find_all('li')
                    research_areas = [area.get_text(strip=True) for area in research_areas_list]

                except:
                    research_areas = None

                # Publisher
                try:
                    if activity_type == "Articles":
                        publication_venue_element = soup.find('h3', {'id': 'hasPublicationVenue'}).find_next_sibling('ul').find('li').find('a')
                        publication_venue = publication_venue_element.get_text(strip=True) if publication_venue_element else None
                        # print("Publication Venue: ", publication_venue)

                    elif activity_type in ["Book Chapters", "Conference Contributions"]:
                        publication_venue_element = soup.find('h3', {'id': 'publisher'}).find_next_sibling('ul').find('li').find('a')
                        publication_venue = publication_venue_element.get_text(strip=True) if publication_venue_element else None
                        # print("Publisher:", publication_venue)

                except:
                    publication_venue = None
                    # print("Publisher:", publication_venue)

                # ISSN
                try:
                    issn_element = soup.find('h3', {'id': 'issn'}).find_next_sibling('ul').find('li')
                    issn_number = issn_element.get_text(strip=True) if issn_element else None
                except:
                    issn_number = None

                # EISSN
                try:
                    eissn_element = soup.find('h3', {'id': 'eissn'}).find_next_sibling('ul').find('li')
                    eissn_number = eissn_element.get_text(strip=True) if eissn_element else None
                except:
                    eissn_number = None
                    
                # ISBN
                try:
                    isbn_element = soup.find('h3', {'id': 'uc3mIsbn'}).find_next_sibling('ul').find('li')
                    isbn_number = isbn_element.get_text(strip=True) if isbn_element else None
                except:
                    isbn_number = None

                    
                # Autores
                try:
                    authors = soup.find('ul', id='relatedBy-Authorship-List').find_all('li', role="listitem")
                    for idx,el in enumerate(authors):
                        if el.find('a'):
                            res_publis.append([file_path.stem, el.find('a')['href'].strip().split('.html')[0], idx+1])
                except:
                    pass

                all_publis.append([file_path.stem, activity_type, title, abstract, keywords, research_areas, doi, year, publication_venue, issn_number, eissn_number, isbn_number])

columns = ['actID', 'ActivityType', 'Title', 'Abstract', 'Keywords', 'Research Areas', 'DOI', 'Year', 'Publisher', 'ISSN', 'EISSN', 'ISBN']
df_publis = pd.DataFrame(all_publis, columns=columns)

columns = ['actID', 'invID', 'Order']
df_res_publis = pd.DataFrame(res_publis, columns=columns)

100%|████████████████████████████████████| 53064/53064 [05:49<00:00, 151.91it/s]


In [11]:
print("Number of Publications", len(df_publis))
df_publis.head(20)

Number of Publications 40244


Unnamed: 0,actID,ActivityType,Title,Abstract,Keywords,Research Areas,DOI,Year,Publisher,ISSN,EISSN,ISBN
0,act391965,Book Chapters,A GPU Accelerated High Performance Cloud Compu...,,,,,2011,INTECH,,,978-953-307-301-9
1,act407775,Conference Contributions,"Hecho imponible, exenciones y sujeto pasivo de...",,,,,2011,,,,
2,act390677,Conference Contributions,PET and SPECT imaging,,,,,2011,,,,
3,act319500,Articles,"Inflation, Price Competition, and Consumer Sea...","This paper studies an (S,s) pricing model in t...",,,https://doi.org/10.1016/j.jedc.2008.03.008,2008,JOURNAL OF ECONOMIC DYNAMICS & CONTROL,0165-1889,1879-1743,
4,act492421,Articles,"Gracias, nodriza: la estima de la lactancia y ...",En la documentación de reyes y reinas medieval...,nodriza; ayo; nutrix; nutritius; crianza; lact...,,,2017,Dilemata. Revista Internacional de Éticas Apli...,1989-7022,,
5,act459525,Conference Contributions,Analysis of the INCOSE Rules for Writing Good ...,,,,https://doi.org/10.1007/978-3-319-26109-6_21,2016,SPRINGER,,,978-3-319-26107-2
6,act559647,Book Chapters,Desarrollo normativo (Comentario a la Disposic...,,,[Law],,2021,THOMSON REUTERS,,,978-84-1346-102-1
7,act524377,Articles,Geometry from divergence functions and complex...,,almost complex structure; divergence functions...,,https://doi.org/10.1142/s021974991941020x,2020,INTERNATIONAL JOURNAL OF QUANTUM INFORMATION,0219-7499,,
8,act421217,Articles,"Effect of the equivalence ratio, Damköhler num...",The effect of the equivalence ratio on the sta...,differential diffusion; microchannel combustio...,[Industrial Engineering],https://doi.org/10.1016/j.combustflame.2013.11...,2014,Combustion and Flame,0010-2180,1556-2921,
9,act496931,Articles,Construction of specialized terminology : an e...,"In this paper, the morphological series crimin...",morphological series; special language; crimin...,,https://doi.org/10.5209/dice.62144,2018,Dicenda. Cuadernos de Filologia Hispanica,0212-2952,,


In [12]:
df_publis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40244 entries, 0 to 40243
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   actID           40244 non-null  object
 1   ActivityType    40244 non-null  object
 2   Title           40244 non-null  object
 3   Abstract        15626 non-null  object
 4   Keywords        15490 non-null  object
 5   Research Areas  9855 non-null   object
 6   DOI             16773 non-null  object
 7   Year            40244 non-null  int64 
 8   Publisher       30504 non-null  object
 9   ISSN            17897 non-null  object
 10  EISSN           11865 non-null  object
 11  ISBN            12001 non-null  object
dtypes: int64(1), object(11)
memory usage: 3.7+ MB


In [13]:
def clean_surrogates(text):
    if text:
        try:
            # Reemplazar los caracteres de sustitución, o simplemente eliminarlos
            return text.encode('utf-16', 'surrogatepass').decode('utf-16')
            return text
        except:
            return np.nan

# Aplicar esta función a todas las columnas de texto en tu DataFrame
df_publis['Abstract'] = df_publis['Abstract'].apply(clean_surrogates)
df_publis.to_parquet(parquet_data.joinpath('publications.parquet'))

In [14]:
print("Number of Signatures", len(df_res_publis))
df_res_publis.head()

Number of Signatures 48979


Unnamed: 0,actID,invID,Order
0,act391965,inv36190,5
1,act407775,inv17981,1
2,act390677,inv41129,1
3,act459525,inv38441,1
4,act459525,inv35699,3


In [15]:
df_res_publis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48979 entries, 0 to 48978
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actID   48979 non-null  object
 1   invID   48979 non-null  object
 2   Order   48979 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


In [16]:
df_res_publis.to_parquet(parquet_data.joinpath('researchers_publications.parquet'))

# 4. Table `projects`

In [13]:
all_activities = list(raw_data.glob("act*"))

all_projects = []
res_projects = []

nact = 0

for file_path in tqdm(all_activities):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")
            activity_type = soup.find(class_="display-title").get_text().strip()
            if activity_type == "Projects":
                # Título
                try: 
                    title = soup.title.get_text()
                except:
                    title = None

                try:
                    datetime_interval_element = soup.find('h3', {'id': 'dateTimeInterval'}).find_next_sibling('ul').find('li')
                    datetime_interval_text = datetime_interval_element.get_text(strip=True) if datetime_interval_element else None

                    # Extracting years from the text with improved regular expression
                    start_year, end_year = None, None
                    
                    if datetime_interval_text:
                        # Utilizar una expresión regular para extraer los años de inicio y fin
                        year_matches = re.findall(r'\b(\d{4})\b', datetime_interval_text)
                        start_year, end_year = map(int, year_matches) if len(year_matches) == 2 else (None, None)

                except:
                    start_year, end_year = None, None

                # Funding Entity
                try:
                    funding_entity_element = soup.find('h3', {'id': 'assignedBy'})

                    if funding_entity_element:
                        funding_entity = funding_entity_element.find_next('ul').find('a').get_text(strip=True)

                except:
                    funding_entity = None
                
                # Reference
                try:
                    reference_element = soup.find('h3', {'id': 'uc3mProjectReference'}).find_next_sibling('ul').find('li')
                    reference = reference_element.get_text(strip=True) if reference_element else None

                except:
                    reference = None
                
                # Type
                try:
                    type_element = soup.find('h3', {'id': 'uc3mProjectType'}).find_next_sibling('ul').find('li')
                    project_type = type_element.get_text(strip=True) if type_element else None
                
                except:
                    project_type = None
                
                # Role: no funciona
                try:
                    researcher_role_elements = soup.find('h3', {'id': 'relates-ResearcherRole'}).find_next_sibling('ul').find_all('li')

                    for role_element in researcher_role_elements:
                        researcher_name_element = role_element.find('a', title='name')
                        researcher_role_text = role_element.get_text()

                        if researcher_name_element:
                            researcher_id_match = re.search(r'inv(\d+)', researcher_name_element['href'])
                            researcher_id = researcher_id_match.group(0) if researcher_id_match else None
                        else:
                            researcher_id = None

                        researcher_role_match = re.compile(r'(Principal Researcher|Researcher)').search(researcher_role_text)
                        researcher_role = researcher_role_match.group(1) if researcher_role_match else None

                        res_projects.append([file_path.stem, researcher_id, researcher_role])

                except:
                    researcher_role = None
                
                all_projects.append([file_path.stem, title, start_year, end_year, funding_entity, reference, project_type])

columns = ['actID', 'Title', 'StartYear', 'EndYear', 'Funding Entity', 'Reference', 'Type']
df_projects = pd.DataFrame(all_projects, columns=columns)
# Replace None with np.nan in 'StartYear' and 'EndYear' columns
df_projects['StartYear'] = df_projects['StartYear'].replace({None: np.nan})
df_projects['EndYear'] = df_projects['EndYear'].replace({None: np.nan})

# Convert 'StartYear' and 'EndYear' columns to integers
df_projects['StartYear'] = df_projects['StartYear'].astype('Int64')
df_projects['EndYear'] = df_projects['EndYear'].astype('Int64')


columns = ['actID', 'invID', 'Role']
df_res_projects = pd.DataFrame(res_projects, columns=columns)

100%|████████████████████████████████████| 53064/53064 [03:57<00:00, 223.61it/s]


In [14]:
df_projects 

Unnamed: 0,actID,Title,StartYear,EndYear,Funding Entity,Reference,Type
0,act540466,Asistencia para la gestión del Madrid Site Rep...,2017,2017,"BOMBARDIER EUROPEAN INVESTMENTS, S.L.U.",P147706 Rev. 01,Technical Assessment and Assistance Contract
1,act538354,Realización de un recorrido accesible por la l...,2020,2020,AYUNTAMIENTO DE MADRID,Expediente 151202000194,R+D Contract
2,act540036,Obtención empírica de parámetros para validar ...,2011,2011,AIRBUS DEFENCE AND SPACE S.A.U.,Pedido E 8219383 K,R+D Contract
3,act538704,Diseño y Modelado de Sistemas Electrónicos aer...,2006,2009,MINISTERIO DE EDUCACION Y CIENCIA DIR. GRAL. I...,DPI2006-14866-C02-02,National Research Project
4,act547209,SIMBAT: Solutions for Intelligent Monitoring b...,2021,2023,AGENCIA ESTATAL DE INVESTIGACION (AEI),PDC2021-121567-C22,National Research Project
...,...,...,...,...,...,...,...
5423,act539614,Asesorar y capacitar a los participantes en as...,2019,2019,FONDO DE POBLACIÓN DE LAS NACIONES UNIDAS UNFPA,,Technical Assessment and Assistance Contract
5424,act538906,Metodología para la generación de evasiones en...,2011,2011,COMUNIDAD DE MADRID-UC3M,CCG10-UC3M/TIC-5570,Regional Research Project
5425,act538455,Realización de un recorrido accesible con audi...,2021,2021,AYUNTAMIENTO DE MADRID,Contrato Menor Expediente 151202100140,R+D Contract
5426,act556319,Diseño y fabricación de protecciones avanzadas...,2020,2023,FABRICA ESPAÑOLA DE CONFECCIONES,,R+D Contract


In [15]:
print("Number of Projects", len(df_projects))
df_projects.head()

Number of Projects 5428


Unnamed: 0,actID,Title,StartYear,EndYear,Funding Entity,Reference,Type
0,act540466,Asistencia para la gestión del Madrid Site Rep...,2017,2017,"BOMBARDIER EUROPEAN INVESTMENTS, S.L.U.",P147706 Rev. 01,Technical Assessment and Assistance Contract
1,act538354,Realización de un recorrido accesible por la l...,2020,2020,AYUNTAMIENTO DE MADRID,Expediente 151202000194,R+D Contract
2,act540036,Obtención empírica de parámetros para validar ...,2011,2011,AIRBUS DEFENCE AND SPACE S.A.U.,Pedido E 8219383 K,R+D Contract
3,act538704,Diseño y Modelado de Sistemas Electrónicos aer...,2006,2009,MINISTERIO DE EDUCACION Y CIENCIA DIR. GRAL. I...,DPI2006-14866-C02-02,National Research Project
4,act547209,SIMBAT: Solutions for Intelligent Monitoring b...,2021,2023,AGENCIA ESTATAL DE INVESTIGACION (AEI),PDC2021-121567-C22,National Research Project


In [16]:
df_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5428 entries, 0 to 5427
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   actID           5428 non-null   object
 1   Title           5428 non-null   object
 2   StartYear       5418 non-null   Int64 
 3   EndYear         5418 non-null   Int64 
 4   Funding Entity  5428 non-null   object
 5   Reference       2634 non-null   object
 6   Type            5428 non-null   object
dtypes: Int64(2), object(5)
memory usage: 307.6+ KB


In [17]:
df_projects.to_parquet(parquet_data.joinpath('projects.parquet'))

In [18]:
df_res_projects

Unnamed: 0,actID,invID,Role
0,act540466,inv19840,Principal Researcher
1,act540466,inv37778,Researcher
2,act540466,,Researcher
3,act538354,inv16754,Principal Researcher
4,act538354,inv17929,Principal Researcher
...,...,...,...
29323,act538005,inv35623,Principal Researcher
29324,act538005,inv44226,Principal Researcher
29325,act538005,inv16622,Researcher
29326,act538005,inv40320,Researcher


In [19]:
print("Number of Signatures", len(df_res_projects))
df_res_projects.head()

Number of Signatures 29328


Unnamed: 0,actID,invID,Role
0,act540466,inv19840,Principal Researcher
1,act540466,inv37778,Researcher
2,act540466,,Researcher
3,act538354,inv16754,Principal Researcher
4,act538354,inv17929,Principal Researcher


In [20]:
df_res_projects.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29328 entries, 0 to 29327
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actID   29328 non-null  object
 1   invID   16107 non-null  object
 2   Role    29328 non-null  object
dtypes: object(3)
memory usage: 687.5+ KB


In [21]:
df_res_projects.to_parquet(parquet_data.joinpath('researchers_projects.parquet'))

In [28]:
#projects = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet/projects.parquet')
#publications = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet/publications.parquet')
#res_proj = projects = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet/researchers_projects.parquet')
#res_pub = projects = pd.read_parquet('/Users/lcsanchez/Desktop/Research/researchportal.uc3m.es/parquet/researchers_publications.parquet')
