In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from funciones_recommendation_system import get_datasets, match_researcher_call, recommendation_system_researcher_call, match_call_researcher, recommendation_system_call_researcher

# 1. Load datasets and Similarity matrices

In [2]:
# load databases
# load databases
path = '/export/data_ml4ds/AI4U/Datasets/'
version = '20231005' # dado que todas las similarity matrices están calculadas con la versión anterior, usaremos los datos de la versión previa 

df_publications, df_publications_researchers, df_researchers, df_calls = get_datasets(path, version)

# load the validation set
df_val = pd.read_excel('/export/usuarios_ml4ds/mafuello/Github/recommendation_system_validation/validation_set.xlsx')

PermissionError: [Errno 13] Permission denied: '/export/data_ml4ds/AI4U/Datasets/ResearchPortal/20231005/parquet/publications.parquet'

In [51]:
df_publications.shape

(41292, 12)

# 2. Obtain Recommendations

In [3]:
def get_validation_results(df_val, df_researchers, df_calls, method, agg_method, recommendations):
    '''
    Función para calcular las métricas de similitud además de preparar el dataset para el SI.
    Se obtienen scores para las recomendacions de calls para un investigador dado
    
    df_val -> DataFrame con las propuestas de validación del SI (lo que han solicitado de verdad los investigadores)
    df_researchers -> DataSet con todos los datos relativos a los investigadores
    df_calls -> DataSet con todos los datos relativos a las convocatorias
    method -> Method selected to calculate the similarities 
    agg_method -> Agregation method selected for calculating the similarties between calls and researchers
    recommendations -> Must be either researchers or calls. It indicates if we are looking for recommendations of researchers or calls
    
    '''
    
    new_rows  = []
    total_iterations = df_val.shape[0]
    for i in tqdm(range(total_iterations), desc="Processing", unit=" rows", ncols=100):
        invID = df_val['invID'][i]
        call = df_val['Línea prioritia/panel/topic'][i]
        
        # in case we look for recommendations of calls given a researcher
        if recommendations == 'calls':                 
            # get the recommendations 
            ranking = recommendation_system_researcher_call(method=method, agg_method=agg_method, researcher=invID, calls=df_calls)
            total_recommendations = ranking.shape[0]
            indice_valor_exacto = ranking.loc[ranking['Call'] == call].index[0]
            ranking = ranking.iloc[:indice_valor_exacto + 1]

        # in case we look for recommendations of researchers given a call
        if recommendations == 'researchers':         
            # get the recommendations 
            ranking = recommendation_system_call_researcher(method=method, agg_method=agg_method, call=call, researchers=df_researchers)
            total_recommendations = ranking.shape[0]
            indice_valor_exacto = ranking.loc[ranking['invID'] == invID].index[0]
            ranking = ranking.iloc[:indice_valor_exacto + 1]
        
        posicion = ranking.shape[0]
        max_similarity = ranking['similarity'][0]
        similarity = ranking['similarity'][posicion-1]

        # obtain the necessary metadata SI
        acronimo = df_val['Acrónimo'][i]
        department = df_researchers[df_researchers['invID']==invID].reset_index()['Department'][0]
        cluster = df_calls[df_researchers['invID']==invID].reset_index()['Department'][0]
        
        if posicion>5:
            first_recommendations = ranking['Call'].head(5).tolist()

        else:
            first_recommendations = ranking['Call'].tolist()      

        first_recommendations = ' '.join(primeras_calls)
        
        # compute score based on similarity
        score_similarity = 1 - ((max_similarity - similarity)/max_similarity)    
        
        # compute score based on position
        if similarity == 0: 
            score_posicion = 0 
        else:
            score_posicion = 1 - (posicion / total_recommendations)

        # calcular el score basado en el cluster
        cluster_correcto = ranking['Work Programme'][posicion-1]
        count_cluster_correctos = 0
        for i in range(ranking.shape[0]):
            if ranking['Work Programme'][i] == cluster_correcto:
                count_cluster_correctos += 1

        score_cluster = count_cluster_correctos/ranking.shape[0]
        
      
        # añadir la fila a la lista de diccionarios
        new_rows.append({'invID': invID,
                         'topic': call_validation,
                         'Acrónimo': acronimo,
                         'Departamento': department,
                         'Cluster': cluster,
                         'Posición': posicion,
                         'Similitud': similarity,
                         'Método de Recomendación': method + '_' + agg_method,
                         'Primeras Recomendaciones':  first_recommendations,
                         'Recomendando': recommendations,
                         'score_posicion': score_posicion,
                         'score_similitud': score_similarity,
                         'score_cluster': score_cluster,
                         'score': score})

    # crear un nuevo DataFrame con las filas añadidas
    df_validation_results = pd.DataFrame(new_rows).sort_values(by='score', ascending=False).fillna(0)
    
    return df_validation_results


In [43]:
def get_validation_results_researcher_call(df_val, df_researchers, df_calls, method, agg_method):
    '''
    Función para calcular las métricas de similitud además de preparar el dataset para el SI.
    Se obtienen scores para las recomendacions de calls para un investigador dado
    
    df_val -> DataFrame con las propuestas de validación del SI (lo que han solicitado de verdad los investigadores)
    df_researchers -> DataSet con todos los datos relativos a los investigadores
    df_calls -> DataSet con todos los datos relativos a las convocatorias
    method -> Method selected to calculate the similarities 
    agg_method -> Agregation method selected for calculating the similarties between calls and researchers
    
    '''
    
    new_rows  = []
    total_iterations = df_val.shape[0]
    for i in tqdm(range(total_iterations), desc="Processing", unit=" rows", ncols=100):
        invID = df_val['invID'][i]
        call_validation = df_val['Línea prioritia/panel/topic'][i]

        # obtener metadatos de interés para el SI
        acronimo = df_val['Acrónimo'][i]
        department = df_researchers[df_researchers['invID']==invID].reset_index()['Department'][0]

        # obtener el ranking para el investigador dado y quedarnos solo con las filas por encima de la call de validación
        try:
            ranking = recommendation_system_researcher_call(method=method, agg_method=agg_method, researcher=invID, calls=df_calls)
            total_calls = ranking.shape[0]
            max_similarity = ranking['similarity'][0]
            indice_valor_exacto = ranking.loc[ranking['Call'] == call_validation].index[0]
            ranking = ranking.iloc[:indice_valor_exacto + 1]

            # calcular el score basado en la posicón
            posicion = ranking.shape[0]

            # sacar las cinco primeras calls de cada investigador
            if posicion>5:
                primeras_calls = ranking['Call'].head(5).tolist()

            else:
                primeras_calls = ranking['Call'].tolist()      
            
            primeras_calls = ' '.join(primeras_calls)

            # calcular el score basado en la métrica de similitud
            similarity = ranking['similarity'][posicion-1]
            score_similarity = 1 - ((max_similarity - similarity)/max_similarity)

            if similarity == 0:
                score_posicion = 0
            else:
                score_posicion = 1 - (posicion / total_calls)

            # calcular el score basado en el cluster
            cluster_correcto = ranking['Work Programme'][posicion-1]
            count_cluster_correctos = 0
            for i in range(ranking.shape[0]):
                if ranking['Work Programme'][i] == cluster_correcto:
                    count_cluster_correctos += 1

            score_cluster = count_cluster_correctos/ranking.shape[0]
            score = (score_posicion + score_similarity + score_cluster) / 3
            
            
        # provisional (meter los codigos de los proyectos)   
        except:
            #print('Investigador: ', invID,  ' no tiene publicaciones')
            posicion = None
            similitud = None
            primeras_calls = None
            score_posicion = None
            score_similitarity = None
            score_cluster = None
            score = None
            

        # añadir la fila a la lista de diccionarios
        new_rows.append({'invID': invID,
                         'call': call_validation,
                         'acronimo': acronimo,
                         'department': department,
                         'posicion': posicion,
                         'similitud': similarity,
                         'metodo': method + '_' + agg_method,
                         '5 primeros':  primeras_calls,
                         'score_posicion': score_posicion,
                         'score_similarity': score_similarity,
                         'score_cluster': score_cluster,
                         'score': score})

    # crear un nuevo DataFrame con las filas añadidas
    df_validation_results = pd.DataFrame(new_rows).sort_values(by='score', ascending=False).fillna(0)
    
    return df_validation_results


In [44]:
def get_validation_results_call_researcher(df_val, df_researchers, df_calls, method, agg_method):
    '''
    Función para calcular las métricas de similitud además de preparar el dataset para el SI.
    Se obtienen scores para las recomendacions de researchers para una call dada
    
    df_val -> DataFrame con las propuestas de validación del SI (lo que han solicitado de verdad los investigadores)
    df_researchers -> DataSet con todos los datos relativos a los investigadores
    df_calls -> DataSet con todos los datos relativos a las convocatorias
    method -> Method selected to calculate the similarities 
    agg_method -> Agregation method selected for calculating the similarties between calls and researchers
    
    '''
    
    new_rows  = []
    total_iterations = df_val.shape[0]
    for i in tqdm(range(total_iterations), desc="Processing", unit=" rows", ncols=100):
        call = df_val['Línea prioritia/panel/topic'][i]
        print(call)
        inv_validation = df_val['invID'][i]
        print(inv_validation)
        
        # obtener metadatos de interés para el SI
        acronimo = df_val['Acrónimo'][i]
        area = df_calls[df_calls['Call']==call].reset_index()['Work Programme'][0]

        # obtener el ranking para la call dada y quedarnos solo con las filas por encima del researcher de validación
        try:
            ranking = recommendation_system_call_researcher(method=method, agg_method=agg_method, call=call, researchers=df_researchers)
            total_researchers = ranking.shape[0]
            max_similarity = ranking['similarity'][0]
            indice_valor_exacto = ranking.loc[ranking['invID'] == inv_validation].index[0]
            ranking = ranking.iloc[:indice_valor_exacto + 1]

            # calcular el score basado en la posicón
            posicion = ranking.shape[0]

            # sacar las cinco primeras calls de cada investigador
            if posicion>5:
                primeros_researchers = ranking['invID'].head(5).tolist()

            else:
                primeros_researchers = ranking['invID'].tolist()      
            
            primeros_researchers = ' '.join(primeras_calls)

            # calcular el score basado en la métrica de similitud
            similarity = ranking['similarity'][posicion-1]
            score_similarity = 1 - ((max_similarity - similarity)/max_similarity)

            if similarity == 0:
                score_posicion = 0
            else:
                score_posicion = 1 - (posicion / total_calls)

            # calcular el score basado en el cluster
            cluster_correcto = ranking['Work Programme'][posicion-1]
            count_cluster_correctos = 0
            for i in range(ranking.shape[0]):
                if ranking['Work Programme'][i] == cluster_correcto:
                    count_cluster_correctos += 1

            score_cluster = count_cluster_correctos/ranking.shape[0]
            score = (score_posicion + score_similarity + score_cluster) / 3
            
            
        # provisional (meter los codigos de los proyectos)   
        except:
            print('Investigador: ', inv_validation,  ' no tiene publicaciones')
            posicion = None
            similitud = None
            primeras_calls = None
            score_posicion = None
            score_similitarity = None
            score_cluster = None
            score = None
            

        # añadir la fila a la lista de diccionarios
        new_rows.append({'invID': inv_validation,
                         'call': call,
                         'acronimo': acronimo,
                         'Área': area,
                         'posicion': posicion,
                         'similitud': similarity,
                         'metodo': method + '_' + agg_method,
                         '5 primeros':  primeros_researchers,
                         'score_posicion': score_posicion,
                         'score_similarity': score_similarity,
                         'score_cluster': score_cluster,
                         'score': score})

    # crear un nuevo DataFrame con las filas añadidas
    df_validation_results = pd.DataFrame(new_rows).sort_values(by='score', ascending=False).fillna(0)
    
    return df_validation_results


In [45]:
df = recommendation_system_call_researcher(method=methods[0], agg_method=agg_methods[0], call='HORIZON-CL5-2023-D5-01-09', researchers=df_researchers)

In [47]:
df

Unnamed: 0,invID,Department,Research Group,Subjects,no Publis,similarity
0,inv41280,Bioengineering,Biomedical Imaging and Instrumentation Group,"[Aeronautics, Biology and Biomedicine, Chemist...",496,74.716667
1,inv15625,Telematic Engineering,GAST- Telematic Applications and Services Group,"[Computer Science, Education, Information Scie...",358,62.023636
2,inv17274,Computer Science and Engineering,Applied Artificial Intelligence Group (GIAA),"[Computer Science, Economics, Electronics, Law...",283,55.232977
3,inv17271,Electronic Technology,Displays and Photonic Applications Group,"[Biology and Biomedicine, Chemistry, Computer ...",207,51.528464
4,inv16920,Computer Science and Engineering,"Computer, Communications and Systems Architecture","[Biology and Biomedicine, Computer Science, Me...",258,50.476661
...,...,...,...,...,...,...
1080,inv19431,Private Law,"Real Estate, Registry and Building Law","[Architecture, History, Law, Sociology]",72,0.000000
1081,inv19634,Private Law,Research Group on Non-contractual Liability,[Law],55,0.000000
1082,inv17539,Public State Law,Research Group on Financial and Tax Law,[Law],47,0.000000
1083,inv48660,Communication Studies,"Television-Cinema: memory, representation and ...",[Information Science],2,0.000000


In [24]:
get_validation_results_call_researcher(df_val=df_val, df_researchers=df_researchers, df_calls=df_calls, method=methods[0], agg_method=agg_methods[0])

Processing:   0%|                                                         | 0/51 [00:00<?, ? rows/s]


HORIZON-CL5-2023-D5-01-09
inv44624
Investigador:  inv44624  no tiene publicaciones


UnboundLocalError: local variable 'similarity' referenced before assignment

In [None]:
'HORIZON-CL5-2023-D5-01-09'

In [8]:
methods = ['BERT', 'tfidf', 'bhattacharyya', 'embeddings']
agg_methods = ['sum', 'mean', 'mean_imp'] 
val_results_calls_recommendations = {}

df_val = df_val.rename(columns={'research portal': 'invID'})

# compute scores for recomendations of calls for a given researcher
for i in methods:
    for j in agg_methods:
        val_results_calls_recommendations[f"{i}_{j}_val_results"] = get_validation_results_researcher_call(df_val=df_val, df_researchers=df_researchers, df_calls=df_calls, method=i, agg_method=j)

# Unificar todos los dataframes
dfs = []
for key, df in val_results_calls_recommendations.items():
    # Agregar el DataFrame a la lista
    dfs.append(df)

df_val_total_calls_recommendations = pd.concat(dfs, ignore_index=True).sort_values(by = 'score', ascending = False).fillna(0)
df_val_total_calls_recommendations.head(5)

Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.76 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.86 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.89 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.91 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  9.10 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  9.03 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.84 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.99 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.85 rows/s]
Processing: 100%|████████████████████████████████████████████████| 51/51 [00:05<00:00,  8.9

Unnamed: 0,invID,call,acronimo,department,posicion,similitud,metodo,5 primeros,score_posicion,score_similarity,score_cluster,score
0,inv44624,HORIZON-CL5-2023-D5-01-09,MISSION,Aerospace Engineering,1.0,9.22406,BERT_sum,HORIZON-CL5-2023-D5-01-09,0.998483,1.0,1.0,0.999494
207,inv21183,HORIZON-CL4-2023-DIGITAL-EMERGING-01-01,Multonomy,Systems Engineering and Automation,1.0,0.095047,tfidf_mean,HORIZON-CL4-2023-DIGITAL-EMERGING-01-01,0.998483,1.0,1.0,0.999494
102,inv44624,HORIZON-CL5-2023-D5-01-09,MISSION,Aerospace Engineering,1.0,0.614937,BERT_mean_imp,HORIZON-CL5-2023-D5-01-09,0.998483,1.0,1.0,0.999494
103,inv21183,HORIZON-CL4-2023-DIGITAL-EMERGING-01-01,Multonomy,Systems Engineering and Automation,1.0,0.654342,BERT_mean_imp,HORIZON-CL4-2023-DIGITAL-EMERGING-01-01,0.998483,1.0,1.0,0.999494
153,inv44624,HORIZON-CL5-2023-D5-01-09,MISSION,Aerospace Engineering,1.0,0.490651,tfidf_sum,HORIZON-CL5-2023-D5-01-09,0.998483,1.0,1.0,0.999494


In [22]:
get_validation_results_call_researcher(df_val=df_val, df_researchers=df_researchers, df_calls=df_calls, method=methods[0], agg_method=agg_methods[0])

Processing:   0%|                                                         | 0/51 [00:00<?, ? rows/s]

HORIZON-CL5-2023-D5-01-09
Investigador:  inv44624  no tiene publicaciones





UnboundLocalError: local variable 'similarity' referenced before assignment

In [14]:
# compute scores for recomendations of researchers for a given call
for i in methods:
    for j in agg_methods:
        val_results_researchers_recommendations[f"{i}_{j}_val_results"] = get_validation_results_call_researcher(df_val=df_val, df_researchers=df_researchers, df_calls=df_calls, method=i, agg_method=j)

# Unificar todos los dataframes
dfs = []
for key, df in val_results_researchers_recommendations.items():
    # Agregar el DataFrame a la lista
    dfs.append(df)

df_val_total_researchers_recommendations = pd.concat(dfs, ignore_index=True).sort_values(by = 'score', ascending = False).fillna(0)
df_val_total_researchers_recommendations.head(5)

Processing:   0%|                                                         | 0/51 [00:00<?, ? rows/s]


UnboundLocalError: local variable 'similarity' referenced before assignment