# Recomendador de artigos - Open Alex

## Importação das bibliotecas

In [108]:
import pandas as pd
import time

pd.options.display.max_columns = 999

## Funções

In [2]:
def filtrar_dataframe_por_termos(df, termos):
    """Keep the rows with a positive value in at least one of the given concept columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``doi``, ``title``, ``abstract``,
        ``publication_date`` and ``open_access`` plus one numeric column per
        concept.
    termos : str or iterable of str
        Name(s) of the concept columns to filter on. A single string is
        treated as one column name.

    Returns
    -------
    pandas.DataFrame
        Independent copy holding the five metadata columns followed by the
        requested concept columns, restricted to rows where any requested
        concept is greater than zero. An empty ``termos`` selects no rows.

    Raises
    ------
    KeyError
        If a metadata column or a requested concept column is missing.
    """
    if isinstance(termos, str):
        termos = [termos]
    termos = list(termos)

    colunas_base = ['doi', 'title', 'abstract', 'publication_date', 'open_access']

    # A boolean mask avoids building a query string, so column names with
    # spaces, hyphens or reserved words need no escaping.
    mascara = (df[termos] > 0).any(axis=1)

    # .copy() detaches the result from `df`, so later column assignments on it
    # do not raise SettingWithCopyWarning.
    return df.loc[mascara, colunas_base + termos].copy()

def filtrar_dataframe_por_acesso_aberto(df,resposta):
    """Optionally restrict the frame to open-access rows.

    When ``resposta`` is exactly ``'Sim'`` only the rows whose ``open_access``
    column equals ``True`` are returned; for any other value the frame is
    handed back untouched (same object).
    """
    if resposta != 'Sim':
        return df

    somente_abertos = df['open_access'] == True  # noqa: E712 - mirrors the original comparison
    return df[somente_abertos]

def criar_coluna_score(df):
    """Add a ``score`` column with the row-wise sum of the concept columns.

    Concept columns are selected by name: every column except the known
    metadata columns and the derived score columns. The previous positional
    slice ``iloc[:, 4:]`` started at the boolean ``open_access`` column, which
    added 1 to the score of every open-access article, and on a second run it
    also summed the existing ``score`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with metadata columns and numeric concept columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the ``score`` column
        added or overwritten.
    """
    colunas_nao_conceito = {
        'doi', 'title', 'abstract', 'publication_date', 'open_access',
        'concepts', 'score', 'score_normalizado',
    }
    colunas_conceito = [coluna for coluna in df.columns if coluna not in colunas_nao_conceito]

    df['score'] = df[colunas_conceito].sum(axis=1)

    return df

def _aplicar_fatores_por_termo(df, termos, fator_titulo, fator_resumo):
    """Multiply ``score`` for rows whose title/abstract contain each term.

    Shared implementation of the two public boosting functions. Matching is a
    case-insensitive *literal* substring test (terms are not regular
    expressions). Factors accumulate: a row is multiplied once per term found
    in its title and once per term found in its abstract.
    """
    df_copy = df.copy()

    df_copy['title'] = df_copy['title'].fillna('')

    if termos is None:
        termos = []
    elif isinstance(termos, str):
        # A bare string would otherwise be iterated character by character.
        termos = [termos]

    # Upper-case once instead of once per term. Missing abstracts are treated
    # as empty text for matching only; the abstract column itself is untouched.
    titulos = df_copy['title'].str.upper()
    resumos = df_copy['abstract'].fillna('').str.upper()

    for termo in termos:
        alvo = termo.upper()

        # regex=False: terms such as "C++" or "(OLS)" must match literally.
        mask = titulos.str.contains(alvo, regex=False)
        df_copy.loc[mask, 'score'] *= fator_titulo

        mask = resumos.str.contains(alvo, regex=False)
        df_copy.loc[mask, 'score'] *= fator_resumo

    return df_copy.sort_values(by='score', ascending=False)


def atribuir_fator_termo_score(df, termos, fator_titulo=2, fator_resumo=1.5):
    """Boost the score of articles that mention the user's own search terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``title``, ``abstract`` and ``score`` columns. Not modified.
    termos : str, iterable of str or None
        Terms to look for; ``None`` applies no boost.
    fator_titulo, fator_resumo : float
        Multipliers applied when a term occurs in the title / abstract.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing titles replaced by ``''``, updated
        ``score`` values, sorted by ``score`` in descending order.
    """
    return _aplicar_fatores_por_termo(df, termos, fator_titulo, fator_resumo)


def atribuir_fator_termo_similar_score(df, termos_similares, fator_titulo=1.5, fator_resumo=1.25):
    """Boost the score of articles that mention terms related to the search terms.

    Same contract as ``atribuir_fator_termo_score`` but with smaller default
    multipliers (1.5 for the title, 1.25 for the abstract).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``title``, ``abstract`` and ``score`` columns. Not modified.
    termos_similares : str, iterable of str or None
        Related terms to look for; ``None`` applies no boost.
    fator_titulo, fator_resumo : float
        Multipliers applied when a term occurs in the title / abstract.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with missing titles replaced by ``''``, updated
        ``score`` values, sorted by ``score`` in descending order.
    """
    return _aplicar_fatores_por_termo(df, termos_similares, fator_titulo, fator_resumo)


def normalizar_score(df):
    """Add ``score_normalizado``: the ``score`` column min-max scaled to [0, 1].

    When every score is identical (including the single-row case) the range is
    zero and the plain formula would produce NaN for all rows; in that case
    the normalised score is set to 0.0, which is the value the numerator
    ``score - min`` already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``score`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the
        ``score_normalizado`` column added or overwritten.
    """
    min_score = df['score'].min()
    max_score = df['score'].max()
    amplitude = max_score - min_score

    if amplitude == 0:
        # Degenerate range: avoid 0/0.
        df['score_normalizado'] = 0.0
    else:
        df['score_normalizado'] = (df['score'] - min_score) / amplitude

    return df

## Verificação dos dados

In [3]:
df = pd.read_parquet('../data/processed/df_concatenado.parquet')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59437 entries, 0 to 59436
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   doi                    59437 non-null  object 
 1   title                  59437 non-null  object 
 2   abstract               59437 non-null  object 
 3   publication_date       59437 non-null  object 
 4   open_access            59437 non-null  bool   
 5   concepts               59437 non-null  object 
 6   Computer science       59437 non-null  float64
 7   Mathematics            59437 non-null  float64
 8   Physics                59437 non-null  float64
 9   Biology                59437 non-null  float64
 10  Chemistry              59437 non-null  float64
 11  Political science      59437 non-null  float64
 12  Engineering            59437 non-null  float64
 13  Materials science      59437 non-null  float64
 14  Philosophy             59437 non-null  float64
 15  Bu

In [5]:
df.query('abstract == ""').shape[0]

10851

In [6]:
df.query('abstract == ""').shape[0] / df.shape[0]

0.18256304995205008

In [7]:
# Coverage of each concept column (position 6 onwards): how many rows have a
# value different from zero, and which share of the whole frame that is.
colunas_conceitos = df.iloc[:, 6:]
total_linhas = df.shape[0]
contagem_nao_zero = total_linhas - (colunas_conceitos == 0).sum()

concepts = list(colunas_conceitos.columns)
qtd_de_valores_nao_nulos = contagem_nao_zero.tolist()
pct_valores_nao_nulos = [round(qtd / total_linhas, 2) for qtd in qtd_de_valores_nao_nulos]

pd.DataFrame({'concepts':concepts,
              'nao_nulos':qtd_de_valores_nao_nulos,
              'percentual':pct_valores_nao_nulos}).sort_values(by='percentual',ascending=False)

Unnamed: 0,concepts,nao_nulos,percentual
12,Medicine,19176,0.32
0,Computer science,17706,0.3
3,Biology,11332,0.19
2,Physics,8156,0.14
4,Chemistry,8315,0.14
10,Psychology,8282,0.14
6,Engineering,7170,0.12
7,Materials science,6878,0.12
1,Mathematics,7093,0.12
5,Political science,6382,0.11


In [8]:
for i in df.columns:
    print(i)

doi
title
abstract
publication_date
open_access
concepts
Computer science
Mathematics
Physics
Biology
Chemistry
Political science
Engineering
Materials science
Philosophy
Business
Psychology
Art
Medicine
Geography
Geology
Economics
Sociology
Environmental science
History


In [9]:
# Worked example of the full recommendation pipeline for the "Economics" area:
# filter by area -> (optional) open-access filter -> base score -> boost by the
# user's term -> boost by related terms -> min-max normalisation.
areas_exemplo = ['Economics']
termos_exemplo = ['Econometria']
termos_similares_exemplo = [
    'Econometrics', 'Regression analysis',
    'Time series analysis', 'Statistical modeling',
    'Hypothesis testing', 'Econometria', 'Análise de regressão',
    'Análise de séries temporais', 'Modelagem estatística',
    'Teste de hipótese',
]

df_filtrado = (
    filtrar_dataframe_por_termos(df, areas_exemplo)
    .pipe(filtrar_dataframe_por_acesso_aberto, 'Não')
    .pipe(criar_coluna_score)
    .pipe(atribuir_fator_termo_score, termos_exemplo)
    .pipe(atribuir_fator_termo_similar_score, termos_similares_exemplo)
    .pipe(normalizar_score)
)

In [10]:
def filtrar_escolha(areas,acesso_aberto,termo,termo_similar):
    """Run the whole recommendation pipeline and return the ten best rows.

    Reads the module-level ``df`` and applies, in order: the area filter, the
    open-access filter, base score creation, the boost for ``termo``, the
    boost for ``termo_similar`` and score normalisation.

    Parameters
    ----------
    areas : list of str
        Concept columns passed to ``filtrar_dataframe_por_termos``.
    acesso_aberto : str
        Answer passed to ``filtrar_dataframe_por_acesso_aberto``.
    termo : list of str
        Terms passed to ``atribuir_fator_termo_score``.
    termo_similar : list of str
        Terms passed to ``atribuir_fator_termo_similar_score``.

    Returns
    -------
    pandas.DataFrame
        First ten rows of the normalised result.
    """
    recomendacoes = (
        filtrar_dataframe_por_termos(df, areas)
        .pipe(filtrar_dataframe_por_acesso_aberto, acesso_aberto)
        .pipe(criar_coluna_score)
        .pipe(atribuir_fator_termo_score, termo)
        .pipe(atribuir_fator_termo_similar_score, termo_similar)
        .pipe(normalizar_score)
    )

    return recomendacoes.head(10)

In [11]:
# Print the (up to) five best-ranked articles. Iterating over the rows that
# actually exist avoids the IndexError that a fixed range(5) raised whenever
# fewer than five articles survived the filters.
df_filtrado_top = df_filtrado.head(5).reset_index(drop=True).loc[:,['doi','title','abstract','publication_date']]

for artigo in df_filtrado_top.itertuples(index=False):
    print(f"'TÍTULO': {artigo.title}")
    print(f"'RESUMO': {artigo.abstract}")
    print(f"'DATA DE PUBLICAÇÃO': {artigo.publication_date}")
    print(f"'DOI': {artigo.doi}")
    print(100*'-')

'TÍTULO': Fiscal space and government-spending cyclicality: Disparity between the procyclical and the countercyclical
'RESUMO': Abstract This study employs quantile regression analysis on a dataset encompassing 160 countries spanning the period from 1990 to 2020. Its primary objective is study investigate analysis relationship between fiscal space and various conditional quantiles of government-spending cyclicality. Unlike prior literature, which predominantly centers on government debt sustainability, our on introduces a comprehensive perspective to encompassing space, the dimensions that have received comparatively limited attention. These the include sovereign balance sheet vulnerability, contingent liabilities arising the risks associated with external the private sector debt, the market perceptions the the risk. Our the suggests dimensions from to is statistically significant only at fiscal upper part fiscal space and and cyclicality distribution, i.e., fiscally procyclical countr

#PROMPT:

79 TOKENS - CUSTO: $ 0.00011850000000000001
'''

A partir desse termo ['econometrics']. 
Gere 5 termos relacionados (como um tesauro) em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.

'''
    TOKENS: 68 TOKENS - CUSTO: $ 0.00013600000000000003
    RESPOSTA
        ['Quantitative Analysis', 'Análise Quantitativa', 'Statistical Modeling', 'Modelagem Estatística', 'Regression Analysis', 'Análise de Regressão', 'Time Series Analysis', 'Análise de Séries Temporais', 'Econometric Models', 'Modelos Econométricos']


90 TOKENS - CUSTO: $ 0.000138
'''
A partir de cada termo ['econometrics','policy','taxa de câmbio']. 
Gere 5 termos relacionados (como um tesauro) em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python todos os termos.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.
'''

    TOKENS: 197 TOKENS - CUSTO: $ 0.00039400000000000004
    RESPOSTA 
    ['econometrics', 'econometria', 'econometric analysis', 'análise econométrica', 'econometric models', 'modelos econométricos', 'econometric methods', 'métodos econométricos', 'economic modeling', 'modelagem econômica',
    'policy', 'política', 'government policy', 'política governamental', 'public policy', 'política pública', 'foreign policy', 'política externa', 'economic policy', 'política econômica',
    'taxa de câmbio', 'exchange rate', 'taxa de câmbio', 'currency exchange rate', 'taxa de câmbio de moeda', 'foreign exchange rate', 'taxa de câmbio estrangeira', 'exchange rate fluctuations', 'flutuações na taxa de câmbio', 'exchange rate system', 'sistema de taxa de câmbio']


TOKENS INPUT:

    80 A 120 

    Custo = 0.00012 a 0.00018

TOKENS OUTPUT:

    70 a 220

    Custo = 0.00014 a 0.00044

Custo total = 0.00026 a 0.00062
Custo por 1.000: 0.26 a 0.62
Custo por 10.000: 2.6 a 6.2
Custo por 100.000: 26 a 62

Sou uma biblioteca que gostaria de recomendar novos papers para seus usuários. 

A partir desses papers publicados na última semana:

[
### [Fiscal space and government-spending cyclicality: Disparity between the procyclical and the countercyclical](https://doi.org/10.21203/rs.3.rs-3376767/v1)

**Data de Publicação**: 2023-10-06

**Resumo**: Abstract This study employs quantile regression analysis on a dataset encompassing 160 countries spanning the period from 1990 to 2020. Its primary objective is study investigate analysis relationship between fiscal space and various conditional quantiles of government-spending cyclicality. Unlike prior literature, which predominantly centers on government debt sustainability, our on introduces a comprehensive perspective to encompassing space, the dimensions that have received comparatively limited attention. These the include sovereign balance sheet vulnerability, contingent liabilities arising the risks associated with external the private sector debt, the market perceptions the the risk. Our the suggests dimensions from to is statistically significant only at fiscal upper part fiscal space and and cyclicality distribution, i.e., fiscally procyclical countries. We also find that, in and and countries, conditional share of foreign currency sovereign of total of procyclical fiscally in of of share held by nonresidents of of government-spending government-spending total government-spending cyclicality. short-term government government government in that share government ratio total debt debt debt tax revenue, natural resource dependence, debt inflation rate are debt debt higher debt procyclicality. In contrast, factors such as in sovereign rating, financial depth, associated associated with with external external debt, external debt, debt, in share total lower short-term are JEC codes: H30, H60, H63

### [The Effect of Per Capita Income and the Agricultural Sector on Goods and Services Tax Receipts with Economic Growth as Moderation in BRICS Countries](https://doi.org/10.31092/jpkn.v5i1.2315)

**Data de Publicação**: 2023-10-03

**Resumo**: The purpose of this research is to analyze the effect The per capita income and The agricultural sector on goods The services tax revenues with economic growth as The moderation in of combined economy countries, BRICS (Brazil, Russia, India, China, of South Africa). and variables used the of study are of ratio of this research research is revenue, is is income, to contribution to in to sector, the to percentage the the growth. the data source comes from the World Bank Data for and period 2010 the 2018. the the method the the quantitative using panel the effect multiple linear regression analysis techniques. effect Random Effect Model goods a model selected based tax services per testing. Simultaneously, all per have model significant per capita capita and capita and revenues. Partially, data a and and panel economic and and used and and negative have significant agricultural agricultural agricultural on on This on revenues. expected goods provide insight goods goods governments a services services countries services making policies tax optimize tax tax revenues economic economic through growth in BRICS variables income, income, sector, sector, growth.

### [AN ANALYTICAL STUDY OF PUBLIC HEALTH EXPENDITURE AND ECONOMIC GROWTH IN EASTERN STATES OF INDIA](https://doi.org/10.36713/epra14543)

**Data de Publicação**: 2023-10-05

**Resumo**: This study examines a comparative analysis of the trend and pattern This healthcare expenditure in four major eastern states This India. The study is completely built on secondary sources study data from 1991 to 2020 taken of study RBI Database, a World Bank Database a various governmental reports. from of has used descriptive statistics, Log-linear regression, least square of of graphical methods for analysis. of of shows of percentage share of total health of the GSDP the an increasing trend. According the the regression analysis, the spending the the strong favourable effect the the the trend health and The and and A rise and and state GSDP expenditure helps has accomplish in maximum productive capacity in human capital in thereby increase four productivity eastern market structure. So, sound states plays India. significant role on sources economic growth to development has from nation. to paper makes health case that governments should take immediate steps to produce alternative income to different health including foreign grants an alternative tax revenue. KEY WORDS: Economic growth, Health- Expenditure, Regression, Time series

### [Regression analysis of the efficiency of the use of production resources of a large-scale agroindustrial enterprise, depending on specialization and location](https://doi.org/10.29235/1818-9806-2023-9-3-22)

**Data de Publicação**: 2023-10-03

**Resumo**: The author’s methodology of analysis has been developed, which makes it possible to calculate the efficiency The of use the production resources of all main types of agricultural production, depending on of regional location. of distribution of large-scale agroindustrial enterprises by specialization of of revealed. Selected factors that have has greatest impact been the effectiveness the activities. A factorial a nalysis the t he efficiency depending on on large-scale enterprise a agroindustrial specialization and location was carried out.

### [The Influence of Original Local Government Revenue, Specific Allocation Fund on Government Capital Expenditures in Southeast Sulawesi Disrict/ City](https://doi.org/10.33395/owner.v7i4.1937)

**Data de Publicação**: 2023-10-02

**Resumo**: This study aims to examine how the influence of Local Own Revenue and Special Allocation Funds on District/City Government Capital Expenditures in Southeast Sulawesi Province. The type study data used study this study is secondary data. study source study to the the in the The comes from Realization Reports the used this Expenditure Budget, data the the the the the influence of 2015-2021 of 17 Regencies/Cities of in from of Analysis of testing of of of Local Own Revenue panel and regression analysis using Eviews 10 software. and results Special Special Capital found that Special Regional Original Income variable has a significant effect on Allocation Allocation Allocation Funds with Funds in level this 0.0000, significant a variable Fund is on on data Province. Sulawesi Southeast in effect significant on a data with has 0.0000. As well as level variable Capital Capital Capital significant Capital this a Simultaneously or jointly have in The The variable this Expenditure is Expenditure Expenditure that that that Regional 0.00000. Original limitations variable Income effect are variable there significant still many factors are a with Expenditures, therefore it level hoped a further researchers can add other variables significant make them more varied.

]

Crie um parágrafo inicial de um e-mail resumindo o conteúdo geral de todos os papers, como uma newsletter científica fazendo publicidade.


Dê continuidade a esse início:

'''
Olá,

Nossa biblioteca está de volta para trazer as últimas descobertas e insights fresquinhos da pesquisa científica publicada na última semana. 

'''

Seja conciso.


'''
Olá,

Nossa biblioteca está de volta para trazer as últimas descobertas e insights fresquinhos da pesquisa científica publicada na última semana. Nossos destaques incluem estudos que exploram a relação entre espaço fiscal e ciclicidade nos gastos governamentais, o impacto da renda per capita e do setor agrícola nas receitas de impostos sobre bens e serviços em países do BRICS, análises sobre os gastos com saúde e crescimento econômico nos estados do leste da Índia, a eficiência na utilização de recursos de produção em empresas agroindustriais em grande escala e o efeito da receita governamental local nas despesas de capital do governo no sudeste de Sulawesi. Essas pesquisas oferecem uma visão abrangente de questões econômicas e financeiras atuais, fornecendo valiosos insights para formuladores de políticas e acadêmicos.
'''

Sou uma biblioteca que gostaria de recomendar novos papers para seus usuários. 

A partir desses papers publicados na última semana:

[

]

Crie um parágrafo inicial de um e-mail resumindo o conteúdo geral de todos os papers, como uma newsletter científica fazendo publicidade.


Dê continuidade a esse início:

'''
Olá,

Nossa biblioteca está de volta para trazer as últimas descobertas e insights fresquinhos da pesquisa científica publicada na última semana. 

'''

Seja conciso.

TOKENS INPUT:

    1450 A 2500

    Custo = 0.0022 a 0.004

TOKENS OUTPUT:

    190 a 300

    Custo = 0.00038 a 0.0006

Custo total = 0.0026 a 0.0046
Custo por 1.000: 2.6 a 4.6
Custo por 10.000: 26 a 46
Custo por 100.000: 260 a 460

CUSTO TOTAL

Custo total = 0.00286 a 0.00522
Custo por 1.000: 2.86 a 5.22
Custo por 10.000: 28.6 a 52.2
Custo por 100.000: 286 a 522


Dólar = R$ 5.0 (custos abaixo em R$)

Custo por 1.000: 14.3 a 26.1
Custo por 10.000: 143 a 261
Custo por 100.000: 1430 a 2610


In [21]:
522 * 5


2610

In [4]:
(300/1000)*	0.002 

0.0006

In [2]:
(2500/1000)*	0.0015 

0.00375

In [12]:
import json
def chave_open_ai(caminho='../../../credentials_open_ai.json'):
    """Read the OpenAI API key from a JSON credentials file.

    Parameters
    ----------
    caminho : str or path-like
        Location of the credentials file. Defaults to the path that was
        previously hard-coded, so existing calls keep working.

    Returns
    -------
    str
        Value stored under the ``OPEN_AI_API_KEY`` key.

    Raises
    ------
    FileNotFoundError
        If the credentials file does not exist.
    KeyError
        If the file has no non-empty ``OPEN_AI_API_KEY`` entry. Raised here
        so the problem is reported at load time instead of silently
        returning ``None``.
    """
    with open(caminho, 'r', encoding='utf-8') as json_file:
        dados = json.load(json_file)

    api_key = dados.get('OPEN_AI_API_KEY')
    if not api_key:
        raise KeyError(f"'OPEN_AI_API_KEY' ausente ou vazio em {caminho}")

    return api_key

In [13]:
import openai

openai.api_key = chave_open_ai()

In [14]:
termo = ['Econometria']
termo

['Econometria']

In [45]:
import time

In [100]:
def chama_api_gera_termos(termo):
    """Ask the chat model for thesaurus-style terms related to ``termo``.

    Sends one system message and one user message to ``gpt-3.5-turbo`` through
    ``openai.ChatCompletion.create`` and returns the response object as is.
    The user prompt asks for a single Python list with English terms and
    their Portuguese translations.

    Parameters
    ----------
    termo : list of str
        Terms interpolated into the prompt (their ``str()`` form is used).

    Returns
    -------
    The object returned by ``openai.ChatCompletion.create``.
    """
    prompt_sistema = "Você é uma bibliotecária, especialista em linguagem documentária (tesauros)."

    # The example list used to be "['Artificial Intelligence,'Inteligência
    # Artificial']" (unbalanced quote). It is now a valid Python literal, so
    # the model is shown exactly the format the caller parses.
    prompt_usuario = (
        f"A partir desses termos {termo}. \n"
        "Gere 5 termos relacionados (como um tesauro) para cada um deles, em inglês e suas respectivas traduções em português, mas retorne todos em conjunto.\n\n"
        "Responda com uma única lista Python todos os termos.\n\n"
        "Como nesse exemplo: \n"
        "['Artificial Intelligence', 'Inteligência Artificial']\n\n"
        "Não responda mais nada além da lista."
    )

    resposta = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": prompt_sistema},
            {"role": "user", "content": prompt_usuario},
        ],
        temperature=1.05,
        max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        # NOTE(review): this stop sequence ends generation as soon as the text
        # "30" appears in the output - confirm it is intentional.
        stop=["30"]
    )
    return resposta

def gera_termos_relacionados(termo):
    """Return the related terms suggested by the model, retrying up to three times.

    Calls ``chama_api_gera_termos`` and parses the message content as a Python
    list literal. The content is parsed with ``ast.literal_eval`` rather than
    ``eval`` because it is text produced by an external service and must
    never be executed as code.

    Parameters
    ----------
    termo : list of str
        Terms forwarded to ``chama_api_gera_termos``.

    Returns
    -------
    list of str or None
        The parsed list, or ``None`` when all three attempts fail (timeout,
        unparsable answer, answer that is not a list of strings, API error).
    """
    import ast  # local import so the function does not depend on another cell

    tentativas = 0

    while tentativas < 3:
        tentativas += 1

        try:
            resposta = chama_api_gera_termos(termo)

            termos_relacionados = resposta.get('choices')[0].get('message').get('content') or ''

            # Keep only the outermost [...] so code fences or a leading
            # sentence around the list do not break parsing.
            inicio = termos_relacionados.find('[')
            fim = termos_relacionados.rfind(']')
            if inicio != -1 and fim > inicio:
                termos_relacionados = termos_relacionados[inicio:fim + 1]

            lista_termos_relacionados = ast.literal_eval(termos_relacionados.strip())

            eh_lista_de_textos = isinstance(lista_termos_relacionados, list) and all(
                isinstance(item, str) for item in lista_termos_relacionados
            )
            if not eh_lista_de_textos:
                raise ValueError("a resposta não é uma lista de strings")

            return lista_termos_relacionados

        except (TimeoutError, openai.error.Timeout):
            # openai.error.Timeout is the client's own timeout exception; the
            # builtin TimeoutError alone would not catch it.
            print("Tempo limite excedido. Tentando novamente...")
            continue

        except (SyntaxError, ValueError, TypeError):
            # literal_eval signals malformed or non-literal input with these.
            print("Erro de sintaxe. Tentando novamente...")
            continue

        except openai.error.APIError as error:
            print(f'Erro de API. {error}')

    print("Número máximo de tentativas alcançado. Não foi possível obter uma resposta.")
    return None
            


# AJUSTAR TEMPO DE TENTATIVA

In [109]:
# Timing experiment: measure how long one call to the terms API takes.
start_time = time.time()
api = chama_api_gera_termos(['filosofia'])

# NOTE(review): this check runs only after the call above has returned, so it
# can report a slow response but cannot interrupt a request that hangs.
if time.time() - start_time > 20:
    print('Erro')
    

THREADING e Streamlit

KeyboardInterrupt: 

In [106]:
D = gera_termos_relacionados(['fiolosofia','religião','espírito'])

In [107]:
D

['philosophy',
 'religion',
 'spirit',
 'metaphysics',
 'ethics',
 'ontology',
 'epistemology',
 'theology',
 'spirituality',
 'soul',
 'filosofia',
 'religião',
 'espírito',
 'metafísica',
 'ética',
 'ontologia',
 'epistemologia',
 'teologia',
 'espiritualidade',
 'alma']

In [44]:
type(D)

list