# CORDIS
## IMPORTS

In [11]:
from pathlib import Path
import pyspark.sql.functions as F
from pyspark.sql import Row
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType, DateType,BooleanType
from pyspark.sql.functions import col, regexp_replace, when, coalesce, count, to_date
from tqdm import tqdm
import pandas as pd

## Define directorios y cargar parquets:

In [12]:
dir_data = Path('/export/data_ml4ds/AI4U/Datasets/cordis/20240411/rawdata') 
dir_parquet_cordis = Path('/export/data_ml4ds/AI4U/Datasets/cordis/20240411/parquet')
dir_parquet_researchportal = Path('/export/data_ml4ds/AI4U/Datasets/ResearchPortal/projects.parquet')

In [13]:
version = '20240411'
df_cordis_projects = spark.read.parquet("file:///export/data_ml4ds/AI4U/Datasets/cordis/{}/parquet/projects.parquet/".format(version))
df_cordis_organizations = spark.read.parquet("file:///export/data_ml4ds/AI4U/Datasets/cordis/{}/parquet/organizations.parquet/".format(version))
df_researchPortal_projects = spark.read.parquet("file:///export/data_ml4ds/AI4U/Datasets/ResearchPortal/20240321/parquet/projects.parquet/".format(version))

In [14]:
print('Number of projects available:', df_cordis_projects.count())
df_cordis_projects.printSchema()
#df_cordis_projects.show(n=1, truncate=120, vertical=True)

print('Number of projects available:', df_researchPortal_projects.count())
#df_researchPortal_projects.printSchema()
#df_researchPortal_projects.show(n=1, truncate=120, vertical=True)

Number of projects available: 72449
root
 |-- projectID: long (nullable = true)
 |-- acronym: string (nullable = true)
 |-- status: string (nullable = true)
 |-- title: string (nullable = true)
 |-- startDate: timestamp (nullable = true)
 |-- endDate: timestamp (nullable = true)
 |-- totalCost: string (nullable = true)
 |-- ecMaxContribution: string (nullable = true)
 |-- ecSignatureDate: timestamp (nullable = true)
 |-- frameworkProgramme: string (nullable = true)
 |-- masterCall: string (nullable = true)
 |-- subCall: string (nullable = true)
 |-- fundingScheme: string (nullable = true)
 |-- nature: string (nullable = true)
 |-- objective: string (nullable = true)
 |-- contentUpdateDate: timestamp (nullable = true)
 |-- rcn: long (nullable = true)
 |-- grantDoi: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- topic_title: string (nullable = true)
 |-- countryContr: string (nullable = true)
 |-- orgContr: string (nullable = true)
 |-- coordinatorCountry: string (nul

In [15]:
# PASARLO A PANDAS
# Convertir valores para que no falle
df_cordis_projects = df_cordis_projects.withColumn('startDate_numeric', unix_timestamp('startDate'))
df_cordis_projects = df_cordis_projects.withColumn('endDate_numeric', unix_timestamp('endDate'))
df_cordis_projects = df_cordis_projects.withColumn('ecSignatureDate_numeric', unix_timestamp('ecSignatureDate'))
df_cordis_projects = df_cordis_projects.withColumn('contentUpdateDate_numeric', unix_timestamp('contentUpdateDate'))
df_cordis_projects = df_cordis_projects.drop('startDate', 'endDate', 'ecSignatureDate', 'contentUpdateDate')

df_cordis_organizations = df_cordis_organizations.withColumn('SME', col('SME').cast('string'))
df_cordis_organizations = df_cordis_organizations.withColumn('contentUpdateDate_numeric', unix_timestamp('contentUpdateDate'))
df_cordis_organizations = df_cordis_organizations.drop('SME', 'contentUpdateDate')

# CORDIS Y RS
df_cordis_projects_pd = df_cordis_projects.toPandas()
df_researchPortal_projects_pd = df_researchPortal_projects.toPandas()
df_cordis_organizations_pd = df_cordis_organizations.toPandas()

                                                                                

## 1. Caso en el que salen que pertenecen a la uc3m en CORDIS

In [16]:
# 1 SACAR (DE ORGANIZATIONS) UNA LISTA DE LOS PROJECT_IDS QUE SON DE LA UC3M
df_uc3m = df_cordis_organizations_pd[df_cordis_organizations_pd['name'].str.contains('UNIVERSIDAD CARLOS III DE MADRID', case=False)]
project_ids = df_uc3m.loc[976, 'projectID']
df_uc3m.head() # 257 projects 

Unnamed: 0,organisationID,vatNumber,name,shortName,activityType,street,postCode,city,country,nutsCode,geolocation,organizationURL,projectID,contentUpdateDate_numeric
976,999899572.0,ESQ2818029G,UNIVERSIDAD CARLOS III DE MADRID,UC3M,HES,CALLE MADRID 126,28903,Getafe (Madrid),ES,ES300,"40.3163966,-3.7271484",http://www.uc3m.es,"[213740, 211329, 115337, 318115, 609666, 61048...",1709738058


In [17]:
# 2 LOCALIZAR ESOS PROJECTS_IDS EN LA TABLA DE PROJECTS EN CORDIS (HAY 257 QUE PERTENECEN A LA UC3M)
df_cordis_projects_pd_uc3m = df_cordis_projects_pd[df_cordis_projects_pd['projectID'].isin(project_ids)]
#df_cordis_projects_pd_uc3m

In [18]:
# 3 DEFINIR LA FUNCION PARA MATCHEAR YA QUE EL PROJECT_ID (CORDIS) SUELE ESTAR METIDO EN LA REFERENCE (RP)
def match_df_con_lista(df_researchPortal_projects, project_ids, df_cordis_projects_pd_uc3m):
    
    # Crear una lista para almacenar las filas coincidentes
    matched_rows = []
    
    # Iterar sobre cada fila del DataFrame
    for idx, fila in df_researchPortal_projects_pd.iterrows():
        # Obtener la referencia de la fila actual
        reference = fila['Reference']
        
        if pd.notna(reference):
            # Iterar sobre cada project ID en la lista de project_ids
            for project_id in project_ids:
                # Verificar si la referencia contiene algún project ID de la lista
                if str(project_id) in reference:
                    # Si hay coincidencia, agregar la fila a la lista de filas coincidentes
                    matched_row = {
                        'Reference': reference,
                        'projectID': project_id,
                        'actID': fila['actID'], 
                        'objective': df_cordis_projects_pd_uc3m[df_cordis_projects_pd_uc3m['projectID'] == project_id]['objective'].iloc[0], 
                        'TitleCORDIS': df_cordis_projects_pd_uc3m[df_cordis_projects_pd_uc3m['projectID'] == project_id]['title'].iloc[0],
                        'TitleRP': fila['Title']
                    }
                    matched_rows.append(matched_row)
    
    # Convertir la lista de filas coincidentes en un DataFrame de pandas
    df_matched = pd.DataFrame(matched_rows)
    
    return df_matched

In [19]:
# 4 APLICAR FUNCION Y VER QUE 250 DE 257 LOS TENEMOS EN RP)
df_matched_uc3m_pd = match_df_con_lista(df_researchPortal_projects_pd, project_ids, df_cordis_projects_pd_uc3m)
df_matched_uc3m_pd.to_csv('df_matched_uc3m.csv', index=False) 
df_matched_uc3m_pd # 252

Unnamed: 0,Reference,projectID,actID,objective,TitleCORDIS,TitleRP
0,101123298,101123298,act562587,NextENERGEIA plans to develop and test a proof...,Simulating the effects of low-carbon investmen...,LOWCARBON_INVESTMENT\t- Simulating the effects...
1,PITN-GA-2013-608129,608129,act540051,The current macroeconomic and financial crisis...,Macroeconomics and Financial History,MACROHIST - Macroeconomics and Financial History
2,101114795,101114795,act559742,Contrails and aviation-induced cloudiness effe...,Artificial Neural Networks for the Prediction ...,ECONTRAIL - Artificial Neural Networks for the...
3,GA-115337,115337,act544458,Drug development in TB requires new integrated...,Model-based preclinical development of anti-tu...,PreDict-TB:Model-based preclinical development...
4,GA-871370,871370,act544679,The Web economy has been revolutionized by unp...,PIMCITY: BUILDING THE NEXT GENERATION PERSONAL...,PIMCITY: Building the next generation personal...
...,...,...,...,...,...,...
247,GA-859881,859881,act544677,Abstract: 5G-DIVE targets end-to-end 5G trials...,5G-DIVE: eDge Intelligence for Vertical Experi...,5G-DIVE: eDge Intelligence for Vertical Experi...
248,101059984,101059984,act554165,The novel development of Physics-Informed Neur...,Physics-informed nEuRal networks for SEVERe wE...,PERSEVERE-Physics-informed nEuRal networks for...
249,101021377; SEP- 210695221,101021377,act544758,Users often get exposed to security and privac...,"Enhancing Digital Security, Privacy and TRUST ...","TRUSTaWARE - Enhancing Digital Security, Priva..."
250,GA-101052200,101052200,act559628,EUROfusion’s updated Fusion Research Roadmap a...,Implementation of activities described in the ...,EUROFUSION 2023-WPMAT: MAT-T.02.02-T003_Develo...


## 2. Caso en el que buscamos que crucen por titulos sin tener en cuenta la organizacion a la que pertenecen

### Definir y ejecutar matching

In [28]:
# 1 DEFINIR FUNCIONES PARA NORMALIZAR TITULOS Y HACER MATCHING
import string

def clean_string(string):
    '''
    Function for cleaning the string removing special characters

    Parameteres:
    string -> String to be cleaned
    
    Returns:
    string -> Cleaned inputed string
    '''
    
    # Lower case
    string = string.lower()
    
    # Remove ','
    string = string.replace(",", "")
    
    # Define characters to remplace (accents)
    changes = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'à': 'a', 'è': 'e', 'ì': 'i', 'ò': 'o', 'ù': 'u',
        'â': 'a', 'ê': 'e', 'î': 'i', 'ô': 'o', 'û': 'u',
        'ä': 'a', 'ë': 'e', 'ï': 'i', 'ö': 'o', 'ü': 'u',
        'ã': 'a', 'õ': 'o', 'ñ': 'n', 'ç': 'c',
    }
    # Perform the change
    for accent, not_accent in changes.items():
        string = string.replace(accent, not_accent)
        
    return string

def normalize_title(title):
    """
    Normaliza un título.

    Parámetros:
    title (str): El título a normalizar.

    Retorna:
    str: El título normalizado.
    """
    # Convertir a minúsculas
    if title is None:
        return None
    else:   
        title = clean_string(title)
        
        # Quitar signos de puntuación
        title = title.translate(str.maketrans('', '', string.punctuation))
        return title

# Registrar la UDF
normalize_title_udf = F.udf(normalize_title, StringType())


def match(df_researchPortal_projects, df_cordis):
    df_researchPortal_norm = df_researchPortal_projects.withColumn("tituloNormalizado_RP", normalize_title_udf(df_researchPortal_projects["title"]))
    df_cordis_norm = df_cordis.withColumn("tituloNormalizado_CORDIS", normalize_title_udf(df_cordis["title"]))

    schema = StructType([
    StructField("actID", StringType(), nullable=True),
    StructField("tituloNormalizado_CORDIS", StringType(), nullable=True),
    StructField("tituloNormalizado_RP", StringType(), nullable=True),    
    StructField("Reference", StringType(), nullable=True),
    StructField("projectID", StringType(), nullable=True)])

    # Crear el DataFrame vacío con el esquema especificado
    df_matched = spark.createDataFrame([], schema)
    
    total_filas = df_researchPortal_norm.count()
    for fila1 in tqdm(df_researchPortal_norm.collect(), total=total_filas, desc="Progreso"):
        #f1 = fila1.asDict()
        tituloNormalizado_RP = fila1.asDict()['tituloNormalizado_RP']
        for fila2 in df_cordis_norm.collect():
            #f2 = fila2.asDict()
            tituloNormalizado_CORDIS = fila2.asDict()['tituloNormalizado_CORDIS']
            
            # Chequea si el titulo es el igual o uno esta dentro de otro o viceversa
            if (tituloNormalizado_CORDIS == tituloNormalizado_RP) or (tituloNormalizado_RP in tituloNormalizado_CORDIS) or (tituloNormalizado_CORDIS in tituloNormalizado_RP):
                new_row = Row(
                actID = fila1.asDict()['actID'],
                tituloNormalizado_CORDIS=tituloNormalizado_CORDIS,
                tituloNormalizado_RP=tituloNormalizado_RP,
                Reference = fila1.asDict()['Reference'],
                projectID = fila2.asDict()['projectID'])
                
                # Añadir a df_matched
                df_new_row = spark.createDataFrame([new_row], schema=schema)
                df_matched = df_matched.union(df_new_row)
    
    return df_matched

#df_researchPortal_projects.show(n=1, truncate=120, vertical=True)
#df_cordis.show(n=1, truncate=120, vertical=True)

spark = SparkSession.builder \
    .appName("Sesion1") \
    .getOrCreate()

In [22]:
###### NO EJECUTAR #######
# 2 EJECUTAR FUNCION Y GUARDAR
#df_matched = match(df_researchPortal_projects, df_cordis_projects)
df_matched_pd = df_matched.toPandas()

df_matched_pd.head()
print('Number of projects in the Research Portal:', df_researchPortal_projects.count()) # 5625
print('Number of projects matched by title:', df_matched_pd.count()) # 2299

# 3 AÑADIR COLUMNA OBJETIVO
df_matched_obj = df_matched_pd.merge(df_cordis_projects_pd[['projectID', 'objective']], on='projectID', how='left')

# 4 DECIDIR COMO FLITRAR PARA QUEDARME CON MENOS
df_matched_obj['num_palabras_titulo_CORDIS'] = df_matched_obj['tituloNormalizado_CORDIS'].str.split().str.len()
df_matched_obj['num_palabras_titulo_RP'] = df_matched_obj['tituloNormalizado_RP'].str.split().str.len()
indices_eliminar = df_matched_obj.loc[((df_matched_obj['num_palabras_titulo_CORDIS'] >= 1) & (df_matched_obj['num_palabras_titulo_CORDIS'] <= 2)) | ((df_matched_obj['num_palabras_titulo_RP'] >= 1) & (df_matched_obj['num_palabras_titulo_RP'] <= 2))].index

df_matched_obj_filtrado = df_matched_obj.drop(indices_eliminar) # 2299, si lo filtro(<2)  serian 218

# GUARDARLO
#df_matched_obj_filtrado.to_csv('df_matched_pd.csv', index=False) 
df_matched_obj_filtrado

NameError: name 'match' is not defined

In [9]:
# 5 CARGAR
df_matched_pd = pd.read_csv('df_matched_pd.csv')

df_matched_pd.drop(columns=['objective_x', 'objective_y'], inplace=True)
df_matched_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   actID                       218 non-null    object
 1   tituloNormalizado_CORDIS    218 non-null    object
 2   tituloNormalizado_RP        218 non-null    object
 3   Reference                   214 non-null    object
 4   projectID                   218 non-null    int64 
 5   num_palabras_titulo_CORDIS  218 non-null    int64 
 6   num_palabras_titulo_RP      218 non-null    int64 
dtypes: int64(3), object(4)
memory usage: 12.0+ KB


## 3. Unir ambos cruces para sacar el maximo valor de correspondencias

In [20]:
df_matched_pd = pd.read_csv('df_matched_pd.csv')
df_matched_pd.drop(columns=['num_palabras_titulo_CORDIS', 'num_palabras_titulo_RP'], inplace=True)

df_matched_pd = df_matched_pd.rename(columns={
    'Reference': 'Reference',
    'projectID': 'projectID',
    'actID': 'actID',
    'objective_x': 'objective',
    'tituloNormalizado_CORDIS': 'TitleCORDIS',
    'tituloNormalizado_RP': 'TitleRP' }) [['actID', 'Reference', 'projectID', 'TitleRP', 'TitleCORDIS', 'objective']]

df_matched_pd.head() # 218 (4 vacios en REFERENCE)

Unnamed: 0,actID,Reference,projectID,TitleRP,TitleCORDIS,objective
0,act562587,101123298,101123298,lowcarboninvestment\t simulating the effects o...,simulating the effects of lowcarbon investment...,NextENERGEIA plans to develop and test a proof...
1,act540051,PITN-GA-2013-608129,608129,macrohist macroeconomics and financial history,macroeconomics and financial history,The current macroeconomic and financial crisis...
2,act552578,,278368,aercost 5613 innovative pr impact test campa...,impact test campaign,"""IMPTEST involves a campaign for gas gun impac..."
3,act559742,101114795,101114795,econtrail artificial neural networks for the ...,artificial neural networks for the prediction ...,Contrails and aviation-induced cloudiness effe...
4,act544458,GA-115337,115337,predicttbmodelbased preclinical development of...,modelbased preclinical development of antitube...,Drug development in TB requires new integrated...


In [21]:
df_matched_uc3m_pd = pd.read_csv('df_matched_uc3m.csv')[['actID', 'Reference', 'projectID', 'TitleRP', 'TitleCORDIS', 'objective']]
df_matched_uc3m_pd.head() # 252

Unnamed: 0,actID,Reference,projectID,TitleRP,TitleCORDIS,objective
0,act562587,101123298,101123298,LOWCARBON_INVESTMENT\t- Simulating the effects...,Simulating the effects of low-carbon investmen...,NextENERGEIA plans to develop and test a proof...
1,act540051,PITN-GA-2013-608129,608129,MACROHIST - Macroeconomics and Financial History,Macroeconomics and Financial History,The current macroeconomic and financial crisis...
2,act559742,101114795,101114795,ECONTRAIL - Artificial Neural Networks for the...,Artificial Neural Networks for the Prediction ...,Contrails and aviation-induced cloudiness effe...
3,act544458,GA-115337,115337,PreDict-TB:Model-based preclinical development...,Model-based preclinical development of anti-tu...,Drug development in TB requires new integrated...
4,act544679,GA-871370,871370,PIMCITY: Building the next generation personal...,PIMCITY: BUILDING THE NEXT GENERATION PERSONAL...,The Web economy has been revolutionized by unp...


In [22]:
# LOS QUE ME COINCIDEN EN AMBOS CASOS
df_concatenado = pd.concat([df_matched_uc3m_pd, df_matched_pd], ignore_index=True)
df_concatenado_dup = df_concatenado[df_concatenado.duplicated(subset=['actID'], keep=False)].sort_values(by='actID')
df_concatenado_dup #192
df_concatenado = df_concatenado.drop_duplicates(subset=['actID'], ignore_index=True)
df_concatenado.head()

Unnamed: 0,actID,Reference,projectID,TitleRP,TitleCORDIS,objective
0,act562587,101123298,101123298,LOWCARBON_INVESTMENT\t- Simulating the effects...,Simulating the effects of low-carbon investmen...,NextENERGEIA plans to develop and test a proof...
1,act540051,PITN-GA-2013-608129,608129,MACROHIST - Macroeconomics and Financial History,Macroeconomics and Financial History,The current macroeconomic and financial crisis...
2,act559742,101114795,101114795,ECONTRAIL - Artificial Neural Networks for the...,Artificial Neural Networks for the Prediction ...,Contrails and aviation-induced cloudiness effe...
3,act544458,GA-115337,115337,PreDict-TB:Model-based preclinical development...,Model-based preclinical development of anti-tu...,Drug development in TB requires new integrated...
4,act544679,GA-871370,871370,PIMCITY: Building the next generation personal...,PIMCITY: BUILDING THE NEXT GENERATION PERSONAL...,The Web economy has been revolutionized by unp...


In [23]:
df_concatenado.merge(df_researchPortal_projects_pd, on='actID', how='inner') # 278

Unnamed: 0,actID,Reference_x,projectID,TitleRP,TitleCORDIS,objective,Title,StartYear,EndYear,Funding_Entity,Reference_y,Type,Keywords
0,act562587,101123298,101123298,LOWCARBON_INVESTMENT\t- Simulating the effects...,Simulating the effects of low-carbon investmen...,NextENERGEIA plans to develop and test a proof...,LOWCARBON_INVESTMENT\t- Simulating the effects...,2024.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101123298,European Research Project,
1,act540051,PITN-GA-2013-608129,608129,MACROHIST - Macroeconomics and Financial History,Macroeconomics and Financial History,The current macroeconomic and financial crisis...,MACROHIST - Macroeconomics and Financial History,2013.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,PITN-GA-2013-608129,European Research Project,
2,act559742,101114795,101114795,ECONTRAIL - Artificial Neural Networks for the...,Artificial Neural Networks for the Prediction ...,Contrails and aviation-induced cloudiness effe...,ECONTRAIL - Artificial Neural Networks for the...,2023.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101114795,European Research Project,
3,act544458,GA-115337,115337,PreDict-TB:Model-based preclinical development...,Model-based preclinical development of anti-tu...,Drug development in TB requires new integrated...,PreDict-TB:Model-based preclinical development...,2012.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,GA-115337,European Research Project,
4,act544679,GA-871370,871370,PIMCITY: Building the next generation personal...,PIMCITY: BUILDING THE NEXT GENERATION PERSONAL...,The Web economy has been revolutionized by unp...,PIMCITY: Building the next generation personal...,2019.0,2022.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,GA-871370,European Research Project,"[personal information management systems, pers..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,act544513,FP7-612336,287863,trendminerenlargedlargescale crosslingual tren...,largescale crosslingual trend mining and summa...,The recent massive growth in online media and ...,"TRENDMINER_ENLARGED:Large-scale, Cross-lingual...",2013.0,2014.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,FP7-612336,European Research Project,
274,act544043,TEC2010-12005-E,258053,medieval multimedia transport for mobile video...,multimedia transport for mobile video applicat...,Video is a major challenge for the future Inte...,MEDIEVAL: Multimedia Transport for mobile Vide...,2011.0,2012.0,MINISTERIO DE CIENCIA E INNOVACION,TEC2010-12005-E,Complementary National Fundings for Research A...,
275,act554657,PID2021-123969NB-I00,230844,emergence of generic scale invariance in dynam...,dynamical complex systems,"""The program exchange concerns researchers in ...",Emergence of Generic Scale Invariance in Dynam...,2022.0,2025.0,AGENCIA ESTATAL DE INVESTIGACION (AEI),PID2021-123969NB-I00,National Research Project,"[ruido, no-equilibrio, sistemas complejos, inv..."
276,act543301,AYA2010-09908-E,263014,magdrive magneticsuperconductor cryogenic nonc...,magneticsuperconductor cryogenic noncontact ha...,"""The objective of this project is to design an...",MAGDRIVE: Magnetic-Superconductor Cryogenic No...,2009.0,2012.0,MINISTERIO DE CIENCIA E INNOVACION,AYA2010-09908-E,Complementary National Fundings for Research A...,


In [24]:
# VER CUALES CRUZAN CON LAS QUE ESTAN FINACIADAS POR LA COMISION EUROPEA:
df_EuropeanComision = df_researchPortal_projects_pd[df_researchPortal_projects_pd['Funding_Entity'] == 'EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY']
#df_EuropeanComision #289

In [25]:
# LAS QUE TIENE EUROPEAR COMISION FINANCIANDO Y LAS HEMOS ENCONTRADO YA
matched_df_comision = pd.merge(df_EuropeanComision, df_concatenado, on='actID', how='inner')
matched_df_comision.head() # 244 

Unnamed: 0,actID,Title,StartYear,EndYear,Funding_Entity,Reference_x,Type,Keywords,Reference_y,projectID,TitleRP,TitleCORDIS,objective
0,act562587,LOWCARBON_INVESTMENT\t- Simulating the effects...,2024.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101123298,European Research Project,,101123298,101123298,LOWCARBON_INVESTMENT\t- Simulating the effects...,Simulating the effects of low-carbon investmen...,NextENERGEIA plans to develop and test a proof...
1,act540051,MACROHIST - Macroeconomics and Financial History,2013.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,PITN-GA-2013-608129,European Research Project,,PITN-GA-2013-608129,608129,MACROHIST - Macroeconomics and Financial History,Macroeconomics and Financial History,The current macroeconomic and financial crisis...
2,act559742,ECONTRAIL - Artificial Neural Networks for the...,2023.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101114795,European Research Project,,101114795,101114795,ECONTRAIL - Artificial Neural Networks for the...,Artificial Neural Networks for the Prediction ...,Contrails and aviation-induced cloudiness effe...
3,act544458,PreDict-TB:Model-based preclinical development...,2012.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,GA-115337,European Research Project,,GA-115337,115337,PreDict-TB:Model-based preclinical development...,Model-based preclinical development of anti-tu...,Drug development in TB requires new integrated...
4,act544679,PIMCITY: Building the next generation personal...,2019.0,2022.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,GA-871370,European Research Project,"[personal information management systems, pers...",GA-871370,871370,PIMCITY: Building the next generation personal...,PIMCITY: BUILDING THE NEXT GENERATION PERSONAL...,The Web economy has been revolutionized by unp...


In [26]:
matched_df_comision = df_EuropeanComision.merge(df_concatenado, on='actID', how='outer',  indicator=True)
non_matched_df = matched_df_comision[matched_df_comision.isnull().any(axis=1)]

df_EuropeanComision_only = matched_df_comision[matched_df_comision['_merge'] == 'left_only']
df_EuropeanComision_only.drop(columns='_merge', inplace=True) #45
df_EuropeanComision_only= df_EuropeanComision_only[['actID','Title' , 'Reference_x']]

#df_EuropeanComision_only

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_EuropeanComision_only.drop(columns='_merge', inplace=True) #45


In [29]:
schema = StructType([
    StructField('actID', StringType(), True),
    StructField('Title', StringType(), True),
    StructField('Reference_x', StringType(), True),
])

rows = [Row(**record) for record in df_EuropeanComision_only.to_dict(orient='records')]
df_EuropeanComision_only_spark = spark.createDataFrame(df_EuropeanComision_only, schema=schema)
df_matched_EuropeanComision = match(df_EuropeanComision_only_spark.select('actID', col('Title').alias('title'), col('Reference_x').alias('Reference')), df_cordis_projects)
df_matched_EuropeanComision_pd = df_matched_EuropeanComision.toPandas()
df_matched_EuropeanComision_pd # NO CRUZA NINGUNO

Progreso: 100%|█████████████████████████████████| 45/45 [03:09<00:00,  4.22s/it]


Unnamed: 0,actID,tituloNormalizado_CORDIS,tituloNormalizado_RP,Reference,projectID


## 4. Añadir en la tabla ResearchPortal el objective de los projects que han cruzado

In [31]:
df_researchPortal_projects_pd

Unnamed: 0,actID,Title,StartYear,EndYear,Funding_Entity,Reference,Type,Keywords
0,act555183,Passive Radar: Advanced detection algorithms f...,2023.0,2025.0,OFFICE OF NAVAL RESEARCH GLOBAL,N62909-23-1-2002,European Research Project,
1,act537953,Análisis coste/utilidad de la implementación d...,2017.0,2018.0,FUNDACIÓN DE LOS BANCOS Y CAJAS DE CECA (FUNCAS),,R+D Contract,[análisis coste utilidad; atención temprana; b...
2,act543662,Asesoría técnica sobre escenarios previstos co...,2014.0,2015.0,CREARA CONSULTORES S.L.,,Technical Assessment and Assistance Contract,
3,act539991,Información bibliométrica y cienciométrica sob...,2014.0,2014.0,INSTITUTO VALENCIANO DE INVESTIGACIONES ECONÓM...,,Technical Assessment and Assistance Contract,
4,act540815,Sistema Integrado Voz-Datos para el Canal de D...,2006.0,2006.0,THALES ESPAÑA SISTEMAS SAU,,R+D Contract,
...,...,...,...,...,...,...,...,...
5620,act541361,Ajuste del modelo de valoración y gestión estr...,2013.0,2013.0,"NTT DATA SPAIN, S.L.U.",Nº Pedido 21959,R+D Contract,
5621,act538381,European Accounting Review Broadening Research...,2020.0,2024.0,EUROPEAN ACCOUNTING ASSOCIATION,,Technical Assessment and Assistance Contract,[accounting; european accounting; accounting s...
5622,act541205,Elaboración del Informe de Seguimiento mensual...,2008.0,2008.0,"GENERALITAT VALENCIANA CONSELLERIA D`ECONOMIA,...",Exp. 38/2008,R+D Contract,
5623,act543254,Catalogación de Contenidos Educativos del Serv...,2006.0,2007.0,FUNDACION SANTA MARIA EDICIONES SM,,Technical Services Contract,


In [32]:
# SE AÑADE LA COLUMNA OBJECTIVE: 
matched_df_def = pd.merge(df_researchPortal_projects_pd, df_concatenado[['actID', 'objective', 'projectID']], on='actID', how='inner')

matched_df_def.to_csv('outputs/CORDIS_conObjetivo.csv', index=False) 
matched_df_def.to_parquet('outputs/CORDIS_conObjetivo.parquet')

matched_df_def.head() # 278

Unnamed: 0,actID,Title,StartYear,EndYear,Funding_Entity,Reference,Type,Keywords,objective,projectID
0,act562587,LOWCARBON_INVESTMENT\t- Simulating the effects...,2024.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101123298,European Research Project,,NextENERGEIA plans to develop and test a proof...,101123298
1,act540051,MACROHIST - Macroeconomics and Financial History,2013.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,PITN-GA-2013-608129,European Research Project,,The current macroeconomic and financial crisis...,608129
2,act552578,AERCOST - 5613 Innovative PR - Impact test cam...,2022.0,2022.0,"AIRBUS OPERATIONS, S.L.",,R+D Contract,[impacto; protecciones; velocidad de deformación],"""IMPTEST involves a campaign for gas gun impac...",278368
3,act559742,ECONTRAIL - Artificial Neural Networks for the...,2023.0,2025.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,101114795,European Research Project,,Contrails and aviation-induced cloudiness effe...,101114795
4,act544458,PreDict-TB:Model-based preclinical development...,2012.0,2017.0,EUROPEAN COMMISSION RESEARCH EXECUTIVE AGENCY,GA-115337,European Research Project,,Drug development in TB requires new integrated...,115337
