# 1. Import libraries and dependencies

In [1]:
import time
import pandas as pd
import json  
from pathlib import Path
import numpy as np
import time
import pyarrow.parquet as pq
from bs4 import BeautifulSoup
from tqdm import tqdm

# 2. Define input and output directories

In [2]:
raw_data = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/rawdata/researchportal.uc3m.es/display")
parquet_data = Path("/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/parquet")

parquet_data.mkdir(parents=True, exist_ok=True)

In [12]:
all_researchers_files = list(raw_data.glob("inv*"))
all_researchers = []

for file_path in tqdm(all_researchers_files):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")

            element = soup.find(class_="categoriainv")

            # Nombre
            try:
                element = soup.find('span', {'itemprop': 'name'})
                name = element.get_text().strip().title()
            except:
                name = None
            
            # Categoría del Investigador
            try:
                element = soup.find(class_="categoriainv")
                cat = element.get_text().split(': ')[1].strip()
            except:
                cat = None

            # ORCID
            try:
                element = soup.find(class_='individual-orcid')
                # Buscar dentro de este elemento el primer enlace y obtener su 'href'
                enlace = element.find('a')
                if enlace and 'href' in enlace.attrs:
                    orcid = enlace['href'].split('/')[-1].strip()
            except:
                orcid = None

            # Scopus
            try:
                element = soup.find(id='scopusId-noRangeClass-List')
                if element:
                    enlace = element.find('a')
                    if enlace:
                        scopus = enlace.get_text().strip()
            except:
                scopus = None

            # Positions
            try:
                element = soup.find_all(class_='currentPosition')
                for el in element:
                    if "Academic Department" in el.get_text():
                        dep = el.find('span', itemprop='name').get_text().strip()
                    if "Research Group" in el.get_text():
                        res_group = el.find('span', itemprop='name').get_text().strip()
            except:
                dep = None
                res_group = None

            # Research Areas
            try:
                element = soup.find('ul', {'id': 'individual-hasResearchArea'})
                subject = []
                # Si se encuentra el elemento ul, extraer los textos de los elementos li
                list_items = element.find_all('li', role="listitem")
                for item in list_items:
                    subject.append(item.get_text(strip=True))
            except:
                subject = None

            # email
            try:
                element = soup.find(class_='individual-emails')
                email = element.find('a').get_text().strip()
            except:
                email = None

            # Number of publications
            try:
                element = soup.find(id='publicationsGroup').find_all('a')
                element = [el for el in element if el.get('title') == "resource name"]
                n_publis = len(element)
            except:
                n_publis = 0

            # Projects_IP
            try:
                element = soup.find(id='RO_0000053-PrincipalInvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_IP = len(element)
            except:
                n_IP = 0

            # Projects_noIP
            try:
                element = soup.find(id='RO_0000053-InvestigatorRole-List').find_all('a')
                element = [el for el in element if el.get('title') == "activity name"]
                n_noIP = len(element)
            except:
                n_noIP = 0

        all_researchers.append([file_path.stem, name, email, cat, orcid, dep, res_group, subject, n_publis, n_IP, n_noIP])

columns = ['invID', 'Name', 'Email', 'Category', 'ORCID', 'Department', 'Research Group', 'Subjects', 'no Publis', 'Projects IP', 'Projects no IP']
df = pd.DataFrame(all_researchers, columns=columns)

100%|██████████| 1393/1393 [01:31<00:00, 15.28it/s]


In [15]:
print("Number of Researchers", len(df))
df.head()

Number of Researchers 1393


Unnamed: 0,invID,Name,Email,Category,ORCID,Department,Research Group,Subjects,no Publis,Projects IP,Projects no IP
0,inv17981,"Gonzalez-Cuellar Serrano, Maria Luisa",mlgonzal@der-pu.uc3m.es,Full Professor,0000-0002-0999-7711,Public State Law,Research Group on Financial and Tax Law,[Law],41,7,5
1,inv35577,"Quintana Montero, David",dquintan@inf.uc3m.es,Associate Professor,0000-0003-0320-1695,Computer Science and Engineering,Evolutionary Computation and Neural Networks (...,"[Computer Science, Economics, Robotics and Ind...",37,7,8
2,inv48535,"Varela Garcia, Nicolas",nvarela@pa.uc3m.es,PhD Candidate,0000-0002-9135-5338,Social Sciences,Histories of Global Latin Capitalisms (H-GLACIAL),,0,0,0
3,inv40978,"Gutierrez Fernandez, Eric",egutie1@ing.uc3m.es,Assistant Professor,0000-0002-2901-441X,Electronic Technology,Histories of Global Latin Capitalisms (H-GLACIAL),"[Education, Electronics, Telecommunications]",27,4,7
4,inv35940,"Gonzalez Rodriguez, Pedro",pgonzale@ing.uc3m.es,Associate Professor,0000-0002-1378-273X,Mathematics,Grupo de Métodos Numéricos y Aplicaciones,"[Biology and Biomedicine, Fission, Mathematics...",26,1,6


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1393 entries, 0 to 1392
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   invID           1393 non-null   object
 1   Name            1393 non-null   object
 2   Email           1278 non-null   object
 3   Category        1380 non-null   object
 4   ORCID           1150 non-null   object
 5   Department      1393 non-null   object
 6   Research Group  1393 non-null   object
 7   Subjects        1113 non-null   object
 8   no Publis       1393 non-null   int64 
 9   Projects IP     1393 non-null   int64 
 10  Projects no IP  1393 non-null   int64 
dtypes: int64(3), object(8)
memory usage: 119.8+ KB


In [6]:
df.to_parquet(parquet_data.joinpath('researchers.parquet'))

# 3. Tables `Patents` and `researchers_patents`

In [41]:
all_activities = list(raw_data.glob("act*"))

all_patents = []
res_patents = []

nact = 0

for file_path in tqdm(all_activities):
    if file_path.is_file():
        with file_path.open("r") as fin:
            content = fin.read()

            # Crear un objeto BeautifulSoup
            soup = BeautifulSoup(content, "html.parser")
            activity_type = soup.find(class_="display-title").get_text().strip()
            if activity_type in ['Patents']:
                
                # Título
                try: 
                    title = soup.title.get_text()
                except:
                    title = None
                    
                # ID
                try:
                    ID = soup.find('ul', {'id': 'patentNumber-noRangeClass-List'}).find('li').text.strip()
                except:
                    ID = None 
                
                
                # Date field
                try:
                    date_field = soup.find('ul', {'id': 'dateFiled-DateTimeValue-List'}).find('li').text.strip()
                except:
                    date_field = None
                    
                # Date issued
                try:
                    date_issued = soup.find('ul', {'id': 'dateIssued-DateTimeValue-List'}).find('li').text.strip()
                except:
                    date_issued = None
                
                # List of authors
                try:
                    authors = soup.find('ul', id="relatedBy-UC3MParticipation-List").find_all('li', role="listitem")
                    for idx,el in enumerate(authors):
                        if el.find('a'):
                            res_patents.append([file_path.stem, el.find('a')['href'].strip().split('.html')[0], idx+1])
                except:
                    pass
                
                all_patents.append([file_path.stem, title, ID, date_field, date_issued])

                
columns = ['actID', 'PatentTitle', 'PatentID', 'DateField', 'DateIssued']
df_patents= pd.DataFrame(all_patents, columns=columns)

columns = ['actID', 'invID', 'Order']
df_res_patents = pd.DataFrame(res_patents, columns=columns)


100%|██████████| 53063/53063 [07:49<00:00, 113.11it/s]


In [39]:
print("Number of Patents", len(df_patents))
df_patents.head()

Number of Patents 114


Unnamed: 0,actID,PatentTitle,PatentID,DateField,DateIssued
0,act445698,Dispositivo de sujeción de muestras para micro...,ES2634913,"March 29, 2016","May 31, 2018"
1,act487538,Dispositivo de elección aleatoria,ES1213405,"April 26, 2018","August 16, 2018"
2,act427723,Procedimiento y dispositivo para sincronizar s...,ES2370218,"May 20, 2010","October 5, 2012"
3,act445147,Procedimiento de diagnóstico de transformadore...,ES2452936,"January 5, 2012","March 20, 2015"
4,act347446,Sistema de captación sonora,ES2365475,"March 24, 2010","August 23, 2012"


In [42]:
df_patents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114 entries, 0 to 113
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   actID        114 non-null    object
 1   PatentTitle  114 non-null    object
 2   PatentID     108 non-null    object
 3   DateField    114 non-null    object
 4   DateIssued   114 non-null    object
dtypes: object(5)
memory usage: 4.6+ KB


In [49]:
df_res_patents = pd.merge(df_res_patents, df_patents[['actID', 'PatentID']], on='actID', how='left')

In [50]:
df_res_patents.head(8)

Unnamed: 0,actID,invID,Order,PatentID
0,act445698,inv41933,1,ES2634913
1,act487538,inv40172,1,ES1213405
2,act487538,inv37197,2,ES1213405
3,act427723,inv17929,3,ES2370218
4,act445147,inv16997,1,ES2452936
5,act347446,inv35658,1,ES2365475
6,act495429,inv41531,1,
7,act495429,inv41528,2,


In [43]:
df_res_patents.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   actID   232 non-null    object
 1   invID   232 non-null    object
 2   Order   232 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 5.6+ KB


In [60]:
df_res_patents.to_parquet(parquet_data.joinpath('researchers_patents.parquet'))
df_patents.to_parquet(parquet_data.joinpath('patents.parquet'))

PermissionError: [Errno 13] Permission denied: '/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/parquet/researchers_patents.parquet'

In [61]:
df_res_patents.to_parquet('/export/usuarios_ml4ds/mbalairon/uc3m_patents/researchers_patents.parquet')
df_patents.to_parquet('/export/usuarios_ml4ds/mbalairon/uc3m_patents/patents.parquet')