# Recomendador de artigos - Open Alex

## Importação das bibliotecas

In [11]:
import pandas as pd
import os
import requests


pd.options.display.max_columns = 999

## Funções

In [12]:
def filtrar_dataframe_por_termos(df, termos):
    
    termos_formatados = [f"`{termo}`" if ' ' in termo else termo for termo in termos]
    
    filtro = " or ".join(f"{termo} > 0" for termo in termos_formatados)
    
    df_filtrado = df.query(filtro).loc[:, ['doi', 'title','resumo', 'publication_date', 'open_access'] + termos]
    
    return df_filtrado

def filtrar_dataframe_por_acesso_aberto(df,resposta):
    
    if resposta == 'Sim':
        df = df.query('open_access == True')
    
    return df

def criar_coluna_score(df):
    
    df['score'] = df.iloc[:,4:].sum(axis=1)
    
    return df

def atribuir_fator_termo_score(df, termos):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 2
        
        mask = df_copy['resumo'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy

def atribuir_fator_termo_similar_score(df, termos_similares):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos_similares:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
        
        mask = df_copy['resumo'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.25
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy


def normalizar_score(df):
    min_score = df['score'].min()
    max_score = df['score'].max()


    df['score_normalizado'] = ((df['score'] - min_score) / (max_score - min_score))

    return df

## Coleta de dados

In [13]:
df = pd.read_parquet('/home/franciscofoz/Documents/GitHub/recomendador-artigos-OpenAlex-GPT/df_concatenado.parquet')


In [14]:
df.shape[0]

19511

In [15]:
df_filtrado = filtrar_dataframe_por_termos(df,['Economics'])

df_filtrado = filtrar_dataframe_por_acesso_aberto(df_filtrado,'Não')

df_filtrado = criar_coluna_score(df_filtrado)

df_filtrado = atribuir_fator_termo_score(df_filtrado,['Econometria'])

df_filtrado = atribuir_fator_termo_similar_score(df_filtrado,
                                                 ['Econometrics', 'Regression analysis', 
                                                  'Time series analysis', 'Statistical modeling',
                                                  'Hypothesis testing','Econometria', 'Análise de regressão',
                                                  'Análise de séries temporais', 'Modelagem estatística',
                                                  'Teste de hipótese'])

df_filtrado = normalizar_score(df_filtrado)

print(df_filtrado.shape[0],'\t',df_filtrado.shape[0]/df.shape[0])
df_filtrado.head(10)


956 	 0.048998001127569066


Unnamed: 0,doi,title,resumo,publication_date,open_access,Economics,score,score_normalizado
6874,https://doi.org/10.31092/jpkn.v5i1.2315,The Effect of Per Capita Income and the Agricu...,The purpose of this research is to analyze the...,2023-10-03,True,0.6049,2.006125,1.0
12758,https://doi.org/10.4038/ija.v3i1.52,Factors Affecting Share Prices of Finance Comp...,The price of a share is influenced by The vari...,2023-10-05,True,0.4991,1.873875,0.932208
3334,https://doi.org/10.1080/00036846.2023.2267823,Household income and tourism expenditure: an u...,Although a growing number of studies have inve...,2023-10-06,True,0.7969,1.7969,0.892751
1604,https://doi.org/10.3126/jom.v6i1.58889,Monetary Policy and Economic Growth of SAARC C...,The debate on economic policies continues in b...,2023-10-03,True,0.7908,1.7908,0.889624
12701,https://doi.org/10.15388/ekon.2023.102.2.3,The Estimation of Traditional Phillips Curve,This article presents theoretical foundations ...,2023-10-04,True,0.7857,1.7857,0.887009
15010,https://doi.org/10.1007/s00181-023-02501-y,Uncertainty and long-run economy: the role of ...,"Abstract In this paper, we study the effects o...",2023-10-04,True,0.7829,1.7829,0.885574
6781,https://doi.org/10.17159/1727-3781/2023/v26i0a...,A Comparative Analysis of the Treatment of Inf...,Inflation is often defined as a continuous and...,2023-10-03,True,0.7712,1.7712,0.879577
14208,https://doi.org/10.15388/ekon.2023.102.2.1,Does Tax Effort Moderate the Effect of Governm...,Our research study aims to analyze the effect ...,2023-10-04,True,0.7615,1.7615,0.874604
13760,https://doi.org/10.3389/ffgc.2023.1237597,Economic and financial instruments of forest m...,The forest bioeconomy becomes a feature of The...,2023-10-03,True,0.3937,1.742125,0.864673
15294,https://doi.org/10.55493/5002.v13i12.4891,Unpacking the interconnectedness between macro...,This study aims to link the interconnectedness...,2023-10-03,True,0.7378,1.7378,0.862456


In [None]:
df_filtrado_top = df_filtrado.head(5).reset_index(drop=True).loc[:,['doi','title','resumo','publication_date']]

for i in range(5):
    print(f"'TÍTULO': {df_filtrado_top['title'].iloc[i]}")
    print(f"'RESUMO': {df_filtrado_top['resumo'].iloc[i]}")
    print(f"'DATA DE PUBLICAÇÃO': {df_filtrado_top['publication_date'].iloc[i]}")
    print(f"'DOI': {df_filtrado_top['doi'].iloc[i]}")    
    print(100*'-')

'TÍTULO': CONTRIBUTION OF INDONESIAN NATIONAL STANDARD (SNI) ON GROSS DOMESTIC PRODUCTS (GDP)
'RESUMO': &lt;p&gt;Gross Domestic Product (GDP) is the number of goods and services produced by a country in is certain period as is measuring tool for a country's economic development. GDP comprises many factors, including national household consumption, investment, state is exports, the imports. Standards are inherent consumption, the a the produced, consumed, the nationally the internationally traded. This study aims to determine a effect number standards on GDP. The method used of econometrics through case studies of Indonesia of considering of independent goods namely fixed capital, and and workers, patents, and Indonesian National Standard (SNI), while and dependent factor and and and results showed that services 1% percent increase by SNI, by a a in labor could in Indonesia's in in 0.3%, 0.08%, 0.04%, increase 0.4 %, with alpha 5% from 1998 in 2017, respectively. With an average SNI gro

#PROMPT:

'''

A partir desse termo ['Econometria']. 

Gere 5 termos relacionados em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.

'''

'''
Sou uma biblioteca que gostaria de recomendar novos papers para seus usuários. 

A partir desses papers publicados na última semana:

[]

Crie um e-mail recomendando cada um desses papers, como uma newletter científica promovendo cada um dos papers.

Insira um parágrafo inicial no um e-mail, com um resumo do conteúdo.

NÃO TRADUZA OS TÍTULOS.

Este é um e-mail pessoal para ['Francisco Foz']

Responda sempre:
- TÍTULO:
- RESUMO:
- DATA DE PUBLICAÇÃO:
- DOI:
'''

Faça 