In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process



# 1. Load datasets and Similarity matrices

In [3]:
# load databases
df_eic = pd.read_parquet('/export/data_ml4ds/AI4U/Datasets/work_programmes/EIC_work_programmes.parquet')
df_horizon = pd.read_parquet('/export/data_ml4ds/AI4U/Datasets/work_programmes/horizon_work_programmes.parquet')

df_publications = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/dataset/publications_topics.parquet')
df_publications_researchers = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/data_ingest/UC3M ResearchPortal/Outputs/researchers_publications.parquet')
df_researchers = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/data_ingest/UC3M ResearchPortal/Outputs/researchers.parquet')

# join together al the calls (igual que cuando creamos las matrices de similitud)
df_eic['Call'] = df_eic['id']
df_calls = pd.concat([df_horizon[['Call', 'Work Programme']], df_eic[['Call', 'Work Programme']]], axis=0).reset_index(drop=True)

# load the validation set
df_val = pd.read_excel('/export/usuarios_ml4ds/mbalairon/github/recommendation_system_validation/validation_set.xlsx')

  warn(msg)


In [4]:
# load similarity matrixs
similarity_embeddings_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_embeddings_mean_imp.parquet')
similarity_BERT_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_BERT_mean_imp.parquet')
similarity_tfidf_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_tfidf_mean_imp.parquet')
similarity_battacharyya_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_bhattacharyya_mean_imp.parquet')

# 2. Obtain Recommendations

In [5]:
def match(similarities, researcher, df_calls, n=667):
    '''
    Function for obtaining the ranking of researchers given a call 
    
    similarities -> df with all the smmilarities between researchers and calls
    researcher -> Researcher of interest
    n -> Number of recommendations we are interested in 
    df_calls -> Dataframe with the information about the calls
    '''
    
    ranking = similarities.transpose()[researcher].sort_values(ascending=False).fillna(0)
    ranking = pd.DataFrame(ranking).reset_index()
    id_calls = ranking['index'].to_list()
    similarities = ranking[researcher].to_list()
    id_calls = pd.DataFrame({'Call': id_calls, 'similarity': similarities})
    df_ranking_calls = pd.merge(id_calls, df_calls, on='Call', how='inner')

    return df_ranking_calls.head(n)

def recommendation_system(method, agg_method, researcher, calls, n=1129,
                         path='/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_{}_{}.parquet'):
    '''
    function for obtaining the recommendations
    
    path -> Path to the file containing the similarity matrix
    method -> Method selected to calculate the similarities 
    agg_method -> Agregation method selected for calculating the similarties between calls and researchers
    '''
    
    similarities = pd.read_parquet(path.format(method, agg_method))
    ranking = match(similarities, researcher, calls, n)
    return ranking[['Call', 'Work Programme', 'similarity']]

In [6]:
ranking = recommendation_system(method='tfidf', agg_method='mean_imp', researcher='inv43190', calls=df_calls)
ranking.head(1)

Unnamed: 0,Call,Work Programme,similarity
0,HORIZON-HLTH-2023-DISEASE-03-17,health,0.239693


In [17]:
def invertir_nombre_apellido(full_name):
    apellido, nombre = full_name.split(', ')
    return nombre + ' ' + apellido

# Aplica la función a la columna 'name' del DataFrame
df_researchers['Nombre y Apellidos IP'] = df_researchers['Name'].apply(invertir_nombre_apellido)



In [60]:
prueba = df_val.merge(df_researchers[['Nombre y Apellidos IP', 'invID']], on='Nombre y Apellidos IP')
prueba

Unnamed: 0,Año Fecha Fin Convocatoria,Área,Nombre y Apellidos IP,Departamento,Acrónimo,Línea prioritia/panel/topic,ESTADO,invID
0,2023,"Cluster 5. Clima, energia y movilidad",Rauno Cavallaro,Aeroespacial,MISSION,HORIZON-CL5-2023-D5-01-09,DENEGADA,inv44624
1,2024,"Cluster 5. Clima, energia y movilidad",Rauno Cavallaro,Aeroespacial,CORRECT-NOX,HORIZON-CL5-2024-D5-01-07,PREPARACIÓN,inv44624
2,2023,"Cluster 5. Clima, energia y movilidad",Marco Raiola,Aeroespacial,SURF,HORIZON-CL5-2023-D3-01-05,DENEGADA,inv43379
3,2023,"Cluster 5. Clima, energia y movilidad",Andrea Cini,Aeroespacial,QSAVED,HORIZON-CL5-2023-D6-01-11,DENEGADA,inv46956
4,2024,"Cluster 4. Mundo digital, industria y espacio",Antonio Soria Verdugo,Ingeniería Térmica y de Fluidos,HILARIOUS,HORIZON-CL4-2024-TWIN-TRANSITION-01-46,PRESENTADA,inv37839


In [73]:
def get_validation_results(df_val, df_researchers, df_calls, method, agg_method):
    '''
    Función para calcular las métricas de similitud además de preparar el dataset para el SI
    
    df_val -> DataFrame con las propuestas de validación del SI (lo que han solicitado de verdad los investigadores)
    df_researchers -> DataSet con todos los datos relativos a los investigadores
    df_calls -> DataSet con todos los datos relativos a las convocatorias
    method -> Method selected to calculate the similarities 
    agg_method -> Agregation method selected for calculating the similarties between calls and researchers
    
    '''
    new_rows  = []
    for i in range(df_val.shape[0]):
        invID = df_val['invID'][i]
        call_validation = df_val['Línea prioritia/panel/topic'][i]

        # obtener metadatos de interés para el SI
        acronimo = df_val['Acrónimo'][i]
        department = df_researchers[df_researchers['invID']==invID].reset_index()['Department'][0]

        # obtener el ranking para el investigador dado y quedarnos solo con las filas por encima de la call de validación
        ranking = recommendation_system(method=method, agg_method=agg_method, researcher=invID, calls=df_calls)
        total_calls = ranking.shape[0]
        max_similarity = ranking['similarity'][0]
        indice_valor_exacto = ranking.loc[ranking['Call'] == call_validation].index[0]
        ranking = ranking.iloc[:indice_valor_exacto + 1]

        # calcular el score basado en la posicón
        posicion = ranking.shape[0]
        score_posicion = 1 - (posicion / total_calls)

        # calcular el score basado en la métrica de similitud
        similarity = ranking['similarity'][posicion-1]
        score_similarity = 1 - ((max_similarity - similarity)/max_similarity)

        # calcular el score basado en el cluster
        cluster_correcto = ranking['Work Programme'][posicion-1]
        count_cluster_correctos = 0
        for i in range(ranking.shape[0]):
            if ranking['Work Programme'][i] == cluster_correcto:
                count_cluster_correctos += 1

        score_cluster = count_cluster_correctos/ranking.shape[0]

        # añadir la fila a la lista de diccionarios
        new_rows.append({'invID': invID,
                         'call': call_validation,
                         'acronimo': acronimo,
                         'department': department,
                         'posicion': posicion,
                         'similitud': similarity,
                         'metodo': method + '_' +agg_method,
                         'score_posicion': score_posicion,
                         'score_similarity': score_similarity,
                         'score_cluster': score_cluster,
                         'score': (score_posicion + score_similarity + score_cluster) / 3})

    # crear un nuevo DataFrame con las filas añadidas
    df_validation_results = pd.DataFrame(new_rows)
    
    return df_validation_results


In [75]:
df_val_results = get_validation_results(df_val=prueba, df_researchers=df_researchers, df_calls=df_calls, method='tfidf', agg_method='mean_imp')

In [76]:
df_val_results

Unnamed: 0,invID,call,acronimo,department,posicion,similitud,metodo,score_posicion,score_similarity,score_cluster,score
0,inv44624,HORIZON-CL5-2023-D5-01-09,MISSION,Aerospace Engineering,4,0.081775,tfidf_mean_imp,0.99393,0.605578,1.0,0.866503
1,inv44624,HORIZON-CL5-2024-D5-01-07,CORRECT-NOX,Aerospace Engineering,26,0.048797,tfidf_mean_imp,0.960546,0.361358,0.615385,0.645763
2,inv43379,HORIZON-CL5-2023-D3-01-05,SURF,Aerospace Engineering,459,0.0,tfidf_mean_imp,0.30349,0.0,0.24183,0.181773
3,inv46956,HORIZON-CL5-2023-D6-01-11,QSAVED,Aerospace Engineering,26,0.040336,tfidf_mean_imp,0.960546,0.464376,0.730769,0.718564
4,inv37839,HORIZON-CL4-2024-TWIN-TRANSITION-01-46,HILARIOUS,Thermal and Fluids Engineering,150,0.03917,tfidf_mean_imp,0.772382,0.27097,0.226667,0.42334
