In [42]:
import pandas as pd
import numpy as np
from fuzzywuzzy import process



# 1. Load datasets and Similarity matrices

In [12]:
# Work-programme calls: EIC and Horizon tables
df_eic = pd.read_parquet('/export/data_ml4ds/AI4U/Datasets/work_programmes/EIC_work_programmes.parquet')
df_horizon = pd.read_parquet('/export/data_ml4ds/AI4U/Datasets/work_programmes/horizon_work_programmes.parquet')

# Publications, the publication/researcher link table and the researchers themselves
df_publications = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/dataset/publications_topics.parquet')
df_publications_researchers = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/data_ingest/UC3M ResearchPortal/Outputs/researchers_publications.parquet')
df_researchers = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/data_ingest/UC3M ResearchPortal/Outputs/researchers.parquet')

# Stack all the calls into a single table (same as when the similarity matrices were created).
# The EIC table keeps its identifier in 'id', so expose it under the shared 'Call' column first.
df_eic = df_eic.assign(Call=df_eic['id'])
df_calls = pd.concat(
    [
        df_horizon[['Call', 'Work Programme']],
        df_eic[['Call', 'Work Programme']],
    ],
    ignore_index=True,
)

# Validation set
df_val = pd.read_excel('/export/usuarios_ml4ds/mbalairon/github/validation_recommendation_system/validation_set.xlsx')

In [15]:
# Load the precomputed similarity matrices, one per similarity method
# (all with the 'mean_imp' aggregation, per the file names).
# NOTE(review): the variable below is spelled 'battacharyya' while the file is 'bhattacharyya';
# left unchanged in case other cells reference the variable by its current name.
similarity_embeddings_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_embeddings_mean_imp.parquet')
similarity_BERT_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_BERT_mean_imp.parquet')
similarity_tfidf_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_tfidf_mean_imp.parquet')
similarity_battacharyya_mean_imp = pd.read_parquet('/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_bhattacharyya_mean_imp.parquet')

# 2. Obtain Recommendations

In [18]:
def match(similarities, researcher, df_calls, n=667):
    '''
    Function for obtaining the ranking of calls for a given researcher

    similarities -> df with the similarities; rows are indexed by researcher and
                    columns by call identifier
    researcher -> Researcher of interest (a label of the index of `similarities`)
    df_calls -> Dataframe with the information about the calls; must contain a 'Call' column
    n -> Number of recommendations we are interested in
         (default 667 is presumably the number of calls available -- TODO confirm)

    Returns a dataframe with the columns 'Call' and 'similarity' followed by the remaining
    columns of `df_calls`, sorted by decreasing similarity and truncated to `n` rows.
    Calls that are not present in `df_calls` are dropped by the inner merge.
    Raises KeyError if `researcher` is not in the index of `similarities`.
    '''

    # Select the researcher's row directly instead of transposing the whole matrix.
    # Missing similarities are set to 0 *before* sorting so they rank as zeros
    # (above negative scores) rather than being pushed to the very end.
    scores = (
        similarities.loc[researcher]
        .fillna(0)
        .sort_values(ascending=False, kind='stable')  # stable: ties keep column order
    )

    # Build the ranking from the Series index itself, so the result does not depend on
    # how the column axis of `similarities` happens to be named.
    ranked_calls = pd.DataFrame({'Call': scores.index.to_list(),
                                 'similarity': scores.to_list()})

    # Inner merge keeps the order of the left keys, i.e. the ranking order.
    df_ranking_calls = pd.merge(ranked_calls, df_calls, on='Call', how='inner')

    return df_ranking_calls.head(n)

def recommendation_system(method, agg_method, researcher, calls, n=1129,
                         path='/export/usuarios_ml4ds/mbalairon/github/recommendation_system/similarity_matrices_researchers/similarity_{}_{}.parquet'):
    '''
    Load one similarity matrix from disk and return the ranked calls for a researcher

    method -> Method used to calculate the similarities (first placeholder of `path`)
    agg_method -> Aggregation method used for the researcher/call similarities
                  (second placeholder of `path`)
    researcher -> Researcher of interest, forwarded to `match`
    calls -> Dataframe with the information about the calls, forwarded to `match`
    n -> Maximum number of recommendations returned
    path -> Template of the parquet file containing the similarity matrix

    Returns the 'Call', 'Work Programme' and 'similarity' columns of the ranking.
    '''

    matrix_file = path.format(method, agg_method)
    top_calls = match(pd.read_parquet(matrix_file), researcher, calls, n)
    return top_calls[['Call', 'Work Programme', 'similarity']]

In [24]:
# Example: rank the calls for one researcher using the TF-IDF similarities
# with the 'mean_imp' aggregation, then show the best match.
ranking = recommendation_system(
    method='tfidf',
    agg_method='mean_imp',
    researcher='inv43190',
    calls=df_calls,
)
ranking.head(1)

Unnamed: 0,Call,Work Programme,similarity
0,HORIZON-HLTH-2023-DISEASE-03-17,health,0.239693


In [39]:
# Look up the researcher record for a single invID
df_researchers.loc[df_researchers['invID'] == 'inv49042'].head()

Unnamed: 0,invID,Name,Email,Category,ORCID,Scopus,Department,Research Group,Subjects,no Publis,Projects IP,Projects no IP,Nombre y Apellidos IP
804,inv49042,"Sanchez Macian Perez, Alfonso Alejandro",alfonsan@it.uc3m.es,Associate Professor,0000-0002-2220-0594,23398740500,Telematic Engineering,Microeconomics and Industrial Organization,,0,1,4,Alfonso Alejandro Sanchez Macian Perez


In [29]:
def invertir_nombre_apellido(full_name):
    '''
    Turn a 'Surnames, Given names' string into 'Given names Surnames'

    full_name -> Name with the surnames first, separated from the given names by a comma

    Returns the reordered name. The string is split at the first comma only and
    surrounding whitespace is trimmed, so irregular spacing does not leak into the result.
    Values that cannot be reordered are returned instead of raising: a string without a
    comma comes back stripped, and a non-string value (None, NaN) comes back unchanged.
    '''
    # Missing values reach this function through Series.apply; pass them through untouched
    if not isinstance(full_name, str):
        return full_name

    apellido, separador, nombre = full_name.partition(',')
    if not separador:
        # No comma: there is nothing to swap
        return full_name.strip()

    # Skip an empty side (e.g. 'Doe, ') so the result carries no stray space
    partes = [parte for parte in (nombre.strip(), apellido.strip()) if parte]
    return ' '.join(partes)

# Apply the function to the 'Name' column of the DataFrame and store the result in
# 'Nombre y Apellidos IP', the column name that the validation set uses for the researcher name.
df_researchers['Nombre y Apellidos IP'] = df_researchers['Name'].apply(invertir_nombre_apellido)



In [47]:
# Check whether a shortened name can be found inside a researcher's full name.
# NOTE(review): `in` on strings tests for a contiguous substring, and 'Alfonso Sanchez' does not
# occur contiguously in the longer name, so this expression evaluates to False -- the recorded
# output of this cell looks stale and should be re-run.
'Alfonso Sanchez' in 'Alfonso Alejandro Sanchez Macian Perez'

True

In [45]:
# Attach each researcher's invID to the validation rows by exact match on the name column.
# The default inner join keeps only the validation rows whose name is found in df_researchers.
prueba = pd.merge(
    left=df_val,
    right=df_researchers[['Nombre y Apellidos IP', 'invID']],
    on='Nombre y Apellidos IP',
)
prueba

Unnamed: 0,Año Fecha Fin Convocatoria,Área,Nombre y Apellidos IP,Departamento,Acrónimo,Línea prioritia/panel/topic,ESTADO,invID
0,2023,"Cluster 5. Clima, energia y movilidad",Rauno Cavallaro,Aeroespacial,MISSION,HORIZON-CL5-2023-D5-01-09,DENEGADA,inv44624
1,2024,"Cluster 5. Clima, energia y movilidad",Rauno Cavallaro,Aeroespacial,CORRECT-NOX,HORIZON-CL5-2024-D5-01-07,PREPARACIÓN,inv44624
2,2023,"Cluster 5. Clima, energia y movilidad",Marco Raiola,Aeroespacial,SURF,HORIZON-CL5-2023-D3-01-05,DENEGADA,inv43379
3,2023,"Cluster 5. Clima, energia y movilidad",Andrea Cini,Aeroespacial,QSAVED,HORIZON-CL5-2023-D6-01-11,DENEGADA,inv46956
4,2024,"Cluster 4. Mundo digital, industria y espacio",Antonio Soria Verdugo,Ingeniería Térmica y de Fluidos,HILARIOUS,HORIZON-CL4-2024-TWIN-TRANSITION-01-46,PRESENTADA,inv37839


In [32]:
# Size of the validation set as (rows, columns)
df_val.shape

(51, 7)

In [35]:
# Preview the first rows of the validation set
df_val.head()

Unnamed: 0,Año Fecha Fin Convocatoria,Área,Nombre y Apellidos IP,Departamento,Acrónimo,Línea prioritia/panel/topic,ESTADO
0,2023,"Cluster 5. Clima, energia y movilidad",Rauno Cavallaro,Aeroespacial,MISSION,HORIZON-CL5-2023-D5-01-09,DENEGADA
1,2023,"Cluster 5. Clima, energia y movilidad",Marco Raiola,Aeroespacial,SURF,HORIZON-CL5-2023-D3-01-05,DENEGADA
2,2023,"Cluster 4. Mundo digital, industria y espacio",Alfonso Sanchez Mecian,Ingeniería Telemática,CiTrusVerse,HORIZON-CL4-2023-HUMAN-01-05,DENEGADA
3,2023,"Cluster 4. Mundo digital, industria y espacio",Fernando Garcia,Ingeniería de Sistemas y Automática,PLIADES,HORIZON-CL4-2023-DATA-01-02,APROBADA
4,2023,"Cluster 4. Mundo digital, industria y espacio",Abdulla Al Kaff,Ingeniería de Sistemas y Automática,CORAL,HORIZON-CL4-2023-DIGITAL-EMERGING-01-01,DENEGADA
