# Recomendador de artigos - Open Alex

## Importação das bibliotecas

In [1]:
import pandas as pd
import os
import requests


pd.options.display.max_columns = 999

## Funções

In [71]:
def filtrar_dataframe_por_termos(df, termos):
    
    termos_formatados = [f"`{termo}`" if ' ' in termo else termo for termo in termos]
    
    filtro = " or ".join(f"{termo} > 0" for termo in termos_formatados)
    
    df_filtrado = df.query(filtro).loc[:, ['doi', 'title','resumo', 'publication_date', 'open_access'] + termos]
    
    return df_filtrado

def filtrar_dataframe_por_acesso_aberto(df,resposta):
    
    if resposta == 'Sim':
        df = df.query('open_access == True')
    
    return df

def criar_coluna_score(df):
    
    df['score'] = df.iloc[:,4:].sum(axis=1)
    
    return df

def atribuir_fator_termo_score(df, termos):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 2
        
        mask = df_copy['resumo'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy

def atribuir_fator_termo_similar_score(df, termos_similares):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos_similares:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
        
        mask = df_copy['resumo'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.25
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy


def normalizar_score(df):
    min_score = df['score'].min()
    max_score = df['score'].max()


    df['score_normalizado'] = ((df['score'] - min_score) / (max_score - min_score))

    return df

## Coleta de dados

In [9]:
df = pd.read_parquet('/home/franciscofoz/Documents/GitHub/recomendador-artigos-OpenAlex-GPT/df_concatenado.parquet')


In [14]:
df.shape[0]

35215

In [72]:
df_filtrado = filtrar_dataframe_por_termos(df,['Economics'])

df_filtrado = filtrar_dataframe_por_acesso_aberto(df_filtrado,'Não')

df_filtrado = criar_coluna_score(df_filtrado)

df_filtrado = atribuir_fator_termo_score(df_filtrado,['Econometria'])

df_filtrado = atribuir_fator_termo_similar_score(df_filtrado,
                                                 ['Econometrics', 'Regression analysis', 
                                                  'Time series analysis', 'Statistical modeling',
                                                  'Hypothesis testing','Econometria', 'Análise de regressão',
                                                  'Análise de séries temporais', 'Modelagem estatística',
                                                  'Teste de hipótese'])

df_filtrado = normalizar_score(df_filtrado)

print(df_filtrado.shape[0],'\t',df_filtrado.shape[0]/df.shape[0])
df_filtrado.head(10)


1639 	 0.0465426664773534


Unnamed: 0,doi,title,resumo,publication_date,open_access,Economics,score,score_normalizado
17797,https://doi.org/10.31153/js.v25i2.992,CONTRIBUTION OF INDONESIAN NATIONAL STANDARD (...,&lt;p&gt;Gross Domestic Product (GDP) is the n...,2023-10-04,True,0.6208,2.026,1.0
19444,https://doi.org/10.31092/jpkn.v5i1.2315,The Effect of Per Capita Income and the Agricu...,The purpose of this research is to analyze the...,2023-10-03,True,0.6049,2.006125,0.990022
19643,https://doi.org/10.29235/1818-9806-2023-9-3-22,Regression analysis of the efficiency of the u...,The author’s methodology of analysis has been ...,2023-10-03,True,0.2649,1.89735,0.93541
18679,https://doi.org/10.33395/owner.v7i4.1937,The Influence of Original Local Government Rev...,This study aims to examine how the influence o...,2023-10-02,True,0.5055,1.881875,0.927641
10156,https://doi.org/10.9734/jemt/2023/v29i101153,Determinant of Educated Unemployment Rate in W...,This research was conducted to find out how th...,2023-10-02,True,0.4535,1.816875,0.895007
15959,https://doi.org/10.3126/jom.v6i1.58889,Monetary Policy and Economic Growth of SAARC C...,The debate on economic policies continues in b...,2023-10-03,True,0.7908,1.7908,0.881916
34193,https://doi.org/10.1007/s00181-023-02501-y,Uncertainty and long-run economy: the role of ...,"Abstract In this paper, we study the effects o...",2023-10-04,True,0.7829,1.7829,0.87795
32175,https://doi.org/10.32609/j.ruje.9.111967,Factors of global inflation in 2021–2022,The paper examines the factors of global infla...,2023-10-03,True,0.7754,1.7754,0.874184
8387,https://doi.org/10.1111/infi.12441,Bubble detective: City‐level analysis of house...,Abstract This paper investigates house price d...,2023-10-03,True,0.7733,1.7733,0.87313
19351,https://doi.org/10.17159/1727-3781/2023/v26i0a...,A Comparative Analysis of the Treatment of Inf...,Inflation is often defined as a continuous and...,2023-10-03,True,0.7712,1.7712,0.872076


In [76]:
df_filtrado_top = df_filtrado.head(5).reset_index(drop=True).loc[:,['doi','title','resumo','publication_date']]

for i in range(5):
    print(f"'TÍTULO': {df_filtrado_top['title'].iloc[i]}")
    print(f"'RESUMO': {df_filtrado_top['resumo'].iloc[i]}")
    print(f"'DATA DE PUBLICAÇÃO': {df_filtrado_top['publication_date'].iloc[i]}")
    print(f"'DOI': {df_filtrado_top['doi'].iloc[i]}")    
    print(100*'-')

'TÍTULO': CONTRIBUTION OF INDONESIAN NATIONAL STANDARD (SNI) ON GROSS DOMESTIC PRODUCTS (GDP)
'RESUMO': &lt;p&gt;Gross Domestic Product (GDP) is the number of goods and services produced by a country in is certain period as is measuring tool for a country's economic development. GDP comprises many factors, including national household consumption, investment, state is exports, the imports. Standards are inherent consumption, the a the produced, consumed, the nationally the internationally traded. This study aims to determine a effect number standards on GDP. The method used of econometrics through case studies of Indonesia of considering of independent goods namely fixed capital, and and workers, patents, and Indonesian National Standard (SNI), while and dependent factor and and and results showed that services 1% percent increase by SNI, by a a in labor could in Indonesia's in in 0.3%, 0.08%, 0.04%, increase 0.4 %, with alpha 5% from 1998 in 2017, respectively. With an average SNI gro

#PROMPT:

'''

A partir desse termo ['Econometria']. 

Gere 5 termos relacionados em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.

'''

'''
Sou uma biblioteca que gostaria de recomendar novos papers para seus usuários. 

A partir desses papers publicados na última semana:

[]

Crie um e-mail recomendando cada um desses papers, como uma newletter científica promovendo cada um dos papers.

Insira um parágrafo inicial no um e-mail, com um resumo do conteúdo.

NÃO TRADUZA OS TÍTULOS.

Este é um e-mail pessoal para ['Francisco Foz']

Responda sempre:
- TÍTULO:
- RESUMO:
- DATA DE PUBLICAÇÃO:
- DOI:
'''

Faça 