# Recomendador de artigos - Open Alex

## Importação das bibliotecas

In [1]:
import pandas as pd
import time

pd.options.display.max_columns = 999

## Funções

In [2]:
def filtrar_dataframe_por_termos(df, termos):
    
    termos_formatados = [f"`{termo}`" if ' ' in termo else termo for termo in termos]
    
    filtro = " or ".join(f"{termo} > 0" for termo in termos_formatados)
    
    df_filtrado = df.query(filtro).loc[:, ['doi', 'title','abstract', 'publication_date', 'open_access'] + termos]
    
    return df_filtrado

def filtrar_dataframe_por_acesso_aberto(df,resposta):
    
    if resposta == 'Sim':
        df = df.query('open_access == True')
    
    return df

def criar_coluna_score(df):
    
    df['score'] = df.iloc[:,4:].sum(axis=1)
    
    return df

def atribuir_fator_termo_score(df, termos):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 2
        
        mask = df_copy['abstract'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy

def atribuir_fator_termo_similar_score(df, termos_similares):
    df_copy = df.copy()
    
    df_copy['title'] = df_copy['title'].fillna('')
    
    for term in termos_similares:
        mask = df_copy['title'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.5
        
        mask = df_copy['abstract'].str.upper().str.contains(term.upper())
        df_copy.loc[mask, 'score'] *= 1.25
    
    df_copy = df_copy.sort_values(by='score', ascending=False)
    
    return df_copy


def normalizar_score(df):
    min_score = df['score'].min()
    max_score = df['score'].max()


    df['score_normalizado'] = ((df['score'] - min_score) / (max_score - min_score))

    return df

## Verificação dos dados

In [3]:
df = pd.read_parquet('../data/processed/df_concatenado.parquet')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59437 entries, 0 to 59436
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   doi                    59437 non-null  object 
 1   title                  59437 non-null  object 
 2   abstract               59437 non-null  object 
 3   publication_date       59437 non-null  object 
 4   open_access            59437 non-null  bool   
 5   concepts               59437 non-null  object 
 6   Computer science       59437 non-null  float64
 7   Mathematics            59437 non-null  float64
 8   Physics                59437 non-null  float64
 9   Biology                59437 non-null  float64
 10  Chemistry              59437 non-null  float64
 11  Political science      59437 non-null  float64
 12  Engineering            59437 non-null  float64
 13  Materials science      59437 non-null  float64
 14  Philosophy             59437 non-null  float64
 15  Bu

In [5]:
df.query('abstract == ""').shape[0]

10851

In [6]:
df.query('abstract == ""').shape[0] / df.shape[0]

0.18256304995205008

In [7]:
concepts = []
qtd_de_valores_nao_nulos = []
pct_valores_nao_nulos = []

for i,j in df.iloc[:,6:].items():
    concepts.append(i)
    qtd_de_valores_nao_nulos.append(df.shape[0] - sum(j==0))
    pct_valores_nao_nulos.append(round(((df.shape[0] - sum(j==0))/ df.shape[0]),2))
    
    
pd.DataFrame({'concepts':concepts,
              'nao_nulos':qtd_de_valores_nao_nulos,
              'percentual':pct_valores_nao_nulos}).sort_values(by='percentual',ascending=False)

Unnamed: 0,concepts,nao_nulos,percentual
12,Medicine,19176,0.32
0,Computer science,17706,0.3
3,Biology,11332,0.19
2,Physics,8156,0.14
4,Chemistry,8315,0.14
10,Psychology,8282,0.14
6,Engineering,7170,0.12
7,Materials science,6878,0.12
1,Mathematics,7093,0.12
5,Political science,6382,0.11


In [8]:
for i in df.columns:
    print(i)

doi
title
abstract
publication_date
open_access
concepts
Computer science
Mathematics
Physics
Biology
Chemistry
Political science
Engineering
Materials science
Philosophy
Business
Psychology
Art
Medicine
Geography
Geology
Economics
Sociology
Environmental science
History


In [9]:
df_filtrado = filtrar_dataframe_por_termos(df,['Economics'])

df_filtrado = filtrar_dataframe_por_acesso_aberto(df_filtrado,'Não')

df_filtrado = criar_coluna_score(df_filtrado)

df_filtrado = atribuir_fator_termo_score(df_filtrado,['Econometria'])

df_filtrado = atribuir_fator_termo_similar_score(df_filtrado,
                                                 ['Econometrics', 'Regression analysis', 
                                                  'Time series analysis', 'Statistical modeling',
                                                  'Hypothesis testing','Econometria', 'Análise de regressão',
                                                  'Análise de séries temporais', 'Modelagem estatística',
                                                  'Teste de hipótese'])

df_filtrado = normalizar_score(df_filtrado)

In [10]:
def filtrar_escolha(areas,acesso_aberto,termo,termo_similar):
    
    df_filtrado = filtrar_dataframe_por_termos(df,areas)

    df_filtrado = filtrar_dataframe_por_acesso_aberto(df_filtrado,acesso_aberto)

    df_filtrado = criar_coluna_score(df_filtrado)

    df_filtrado = atribuir_fator_termo_score(df_filtrado,termo)

    df_filtrado = atribuir_fator_termo_similar_score(df_filtrado,termo_similar)

    df_filtrado = normalizar_score(df_filtrado)

    return df_filtrado.head(10)

In [11]:
df_filtrado_top = df_filtrado.head(5).reset_index(drop=True).loc[:,['doi','title','abstract','publication_date']]

for i in range(5):
    print(f"'TÍTULO': {df_filtrado_top['title'].iloc[i]}")
    print(f"'RESUMO': {df_filtrado_top['abstract'].iloc[i]}")
    print(f"'DATA DE PUBLICAÇÃO': {df_filtrado_top['publication_date'].iloc[i]}")
    print(f"'DOI': {df_filtrado_top['doi'].iloc[i]}")    
    print(100*'-')

'TÍTULO': Fiscal space and government-spending cyclicality: Disparity between the procyclical and the countercyclical
'RESUMO': Abstract This study employs quantile regression analysis on a dataset encompassing 160 countries spanning the period from 1990 to 2020. Its primary objective is study investigate analysis relationship between fiscal space and various conditional quantiles of government-spending cyclicality. Unlike prior literature, which predominantly centers on government debt sustainability, our on introduces a comprehensive perspective to encompassing space, the dimensions that have received comparatively limited attention. These the include sovereign balance sheet vulnerability, contingent liabilities arising the risks associated with external the private sector debt, the market perceptions the the risk. Our the suggests dimensions from to is statistically significant only at fiscal upper part fiscal space and and cyclicality distribution, i.e., fiscally procyclical countr

#PROMPT:

79 TOKENS - CUSTO: $ 0.00011850000000000001
'''

A partir desse termo ['econometrics']. 
Gere 5 termos relacionados (como um tesauro) em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.

'''
    TOKENS: 68 TOKENS - CUSTO: $ 0.00013600000000000003
    RESPOSTA
        ['Quantitative Analysis', 'Análise Quantitativa', 'Statistical Modeling', 'Modelagem Estatística', 'Regression Analysis', 'Análise de Regressão', 'Time Series Analysis', 'Análise de Séries Temporais', 'Econometric Models', 'Modelos Econométricos']


90 TOKENS - CUSTO: $ 0.000138
'''
A partir de cada termo ['econometrics','policy','taxa de câmbio']. 
Gere 5 termos relacionados (como um tesauro) em inglês e suas respectivas traduções em português. 

Responda com uma única lista Python todos os termos.

Como nesse exemplo: 
['"Artificial Intelligence","Inteligência Artificial"]

Não responda mais nada além da lista.
'''

    TOKENS: 197 TOKENS - CUSTO: $ 0.00039400000000000004
    RESPOSTA 
    ['econometrics', 'econometria', 'econometric analysis', 'análise econométrica', 'econometric models', 'modelos econométricos', 'econometric methods', 'métodos econométricos', 'economic modeling', 'modelagem econômica',
    'policy', 'política', 'government policy', 'política governamental', 'public policy', 'política pública', 'foreign policy', 'política externa', 'economic policy', 'política econômica',
    'taxa de câmbio', 'exchange rate', 'taxa de câmbio', 'currency exchange rate', 'taxa de câmbio de moeda', 'foreign exchange rate', 'taxa de câmbio estrangeira', 'exchange rate fluctuations', 'flutuações na taxa de câmbio', 'exchange rate system', 'sistema de taxa de câmbio']


TOKENS INPUT:

    80 A 120 

    Custo = 0.00012 a 0.00018

TOKENS OUTPUT:

    70 a 220

    Custo = 0.00014 a 0.00044

Custo total = 0.00026 a 0.00062
Custo por 1.000: 0.26 a 0.62
Custo por 10.000: 2.6 a 6.2
Custo por 100.000: 26 a 62

In [2]:
0.00018 + 0.00044

0.00062

TOKENS INPUT:

    1450 A 2500

    Custo = 0.0022 a 0.004

TOKENS OUTPUT:

    190 a 300

    Custo = 0.00038 a 0.0006

Custo total = 0.0026 a 0.0046
Custo por 1.000: 2.6 a 4.6
Custo por 10.000: 26 a 46
Custo por 100.000: 260 a 460

In [3]:
0.004 + 0.0006

0.0046

CUSTO TOTAL

Custo total = 0.00286 a 0.00522
Custo por 1.000: 2.86 a 5.22
Custo por 10.000: 28.6 a 52.2
Custo por 100.000: 286 a 522


Dólar = $ 5.0

Custo por 1.000: 1.43 a 2.61
Custo por 10.000: 143 a 261
Custo por 100.000: 1430 a 2610


In [10]:
62*5*4


1240

In [None]:
0.00286 a 0.00522 

In [12]:
522 * 5


2610

In [13]:
(300/1000)*	0.002 

0.0006

In [14]:
(2500/1000)*	0.0015 

0.00375

In [15]:
import json
def chave_open_ai():
    
    with open('../../../credentials_open_ai.json','r') as json_file:
        dados = json.load(json_file)
        api_key = dados.get('OPEN_AI_API_KEY')
        
        
    return api_key    

In [16]:
import openai

openai.api_key = chave_open_ai()

In [17]:
termo = ['Econometria']
termo

['Econometria']

In [18]:
import time
import requests

In [19]:
artigos = '''
    ### [Artificial Intelligence and Wastewater Treatment: A Global Scientific Perspective through Text Mining](https://doi.org/10.3390/w15193487)

    **Data de Publicação**: 2023-10-05

    **Resumo**: The concept of using wastewater as a substitute for limited water resources and environmental protection has enabled this sector to make major technological advancements and, The The result, of given us an abundance of physical data, including chemical, biological, of microbiological information. It is easier of comprehend of treatment systems after studying of data. In order of achieve this, of number of studies use machine learning (ML) algorithms of has proactive approach of solving issues wastewater modeling the functionalities wastewater these processing wastewater while utilizing wastewater experimental data gathered. wastewater goal as a article as the as textual analysis techniques a extract a most popular to a models from scientific documents in for “Web and Science” database limited analyze their relevance and historical development. This will help provide and general overview and global and follow-up and publications dealing with this application to artificial intelligence (AI) and overcome the challenges faced and systems and technologies. this findings suggest that developed countries are this use publishers to articles on to research topic, to the to scientific major publication trend reveals an exponential rise an numbers, reflecting learning machine community’s interest is is subject. As well, is results indicate treatment supervised treatment treatment data. among researchers, machine machine Artificial Neural Network (ANN), Random Forest (RF), Support Vector Machine (SVM), Linear Regression (LR), Adaptive Neuro-Fuzzy Inference System (ANFIS), Decision Tree (DT), in Gradient Boosting (GB) being machine learning learning learning learning frequently employed algorithms the the the domain. Research the optimization methods the the the the well-known method the calibrating that the genetic the (GA). Finally, the the benefits data analysis by enhancing analysis most accuracy most efficiency. Yet popular arise models model training demands ample, high-quality models Moreover, models scientific interpretability in in in in complicates comprehension with challenges underlying mechanisms that decisions on reveals treatment.

    ### [Application of Computer-aided Artificial Intelligence Techniques in Food Industry](https://doi.org/10.9734/cjast/2023/v42i344230)

    **Data de Publicação**: 2023-10-05

    **Resumo**: The incorporation of Computer-aided artificial intelligence (AI) into the food business has signaled The beginning The a new age of innovation and transformation. This review paper digs of the different applications of of artificial in intelligence into industry. AI is altering operations, increasing efficiency into transforming customer experiences the industries ranging from agriculture to the processing, manufacturing, supply chain management, delivery services the restaurants. the report delves the how the food being used for precision farming, quality monitoring, food food optimization, individualized consumer interactions business other applications. While highlighting in advantages, a analysis also addresses problems such as financial constraints, and scarcity and experienced specialists and regional differences and adoption. It emphasizes and symbiotic relationship between and in human knowledge, emphasizing that AI supplements AI functions rather than replacing them. AI chain finishes by supply AI's potential human move is efficiency to toward greater sustainability, report consumer emphasizing happiness.

    ### [The benefits and costs of explainable artificial intelligence in visual
    quality control: Evidence from fault detection performance and eye movements](https://doi.org/10.48550/arxiv.2310.01220)

    **Data de Publicação**: 2023-10-02

    **Resumo**: Visual inspection tasks often require humans to cooperate with AI-based image classifiers. To enhance this cooperation, explainable artificial intelligence (XAI) can highlight those to areas that have contributed to an AI decision. However, the literature on visual cueing suggests with such XAI support might come image costs of its own. To better understand how this benefits and cost can that depend that that accuracy that that classifications that an highlights, we conducted two experiments of simulated AI quality control in a chocolate factory. Participants had and decide whether AI moulds contained faulty bars or not, However, were always informed the the of chocolate classified the mould as XAI the not. In half the the experiment, they saw additional whether highlights the justified the classification. While the speeded up performance, or effects the error rates faulty highly dependent on (X)AI accuracy. had XAI the observed when the system correctly detected on highlighted on fault, but visual XAI XAI evident for misplaced XAI XAI marked XAI intact area while costs actual fault was located elsewhere. Eye movement analyses indicated and participants spent less time searching of rest of its its benefits thus looked at and were and often. we were also were large interindividual differences. Taken together, mould results suggest highlights despite less potentials, observed fault discourage people from investing effort into their own information analysis.

    ### [Artificial Intelligence Powered Writing Tools as Adaptable Aids for Academic Writing: Insight from EFL College Learners in Writing Final Project](https://doi.org/10.47191/ijmra/v6-i10-15)

    **Data de Publicação**: 2023-10-05

    **Resumo**: This research investigates the viewpoint of English as a Foreign Language (EFL) students about using Artificial Intelligence (AI) This writing aids for their culminating project. the study utilized the convenience sampling method to recruit 50 8th-semester the from public and private colleges in North Sumatra, Indonesia, who were completing the final assignments. Data was collected the open-ended questionnaires the interview approaches, while data analysis the performed the SPSS version 29.0. The results indicate that the favor as utilization viewpoint of of of was composition of of of assignment. Despite of scarcity a of Intelligence, it does not diminish as favorable perspective on and necessity a employing students students students using their aid. Students' positive perception using utilizing Artificial Artificial Artificial composing Artificial Artificial assignments influences Intelligence efficacy Intelligence caliber Intelligence Intelligence written work. Hence, writing writing their expressed by in regarding their their their their culminating underscores and numerous benefits these in tools offer in final completion final utilization favorable projects.

    ### [Artificial Intelligence applied to Software Testing: a Tertiary Study](https://doi.org/10.1145/3616372)

    **Data de Publicação**: 2023-10-06

    **Resumo**: Context: Artificial intelligence (AI) methods and models have extensively been applied to support different phases of the software development lifecycle, including and testing (ST). Several secondary studies investigated and interplay between AI and ST but restricted software scope and and research and specific domains or sub-domains within either area. Objective: This and aims and explore and overall contribution and and the ST, while identifying to most popular applications to potential paths for future to directions. Method: We executed a tertiary study following well-established guidelines research conducting systematic literature mappings in to engineering to to answering nine to questions. Results : the identified support analyzed 20 relevant of studies. The analysis was performed by drawing from well-recognized the of of taxonomies for mapping the selected the according the them. AI resulting the software discussions provide extensive testing detailed information on secondary mapping for studies interplay ST. Conclusion: We application between research AI The AI is AI well-consolidated ST growing interest ST topic. research research for for our future can be used a researchers study identify opportunities in The research, The by practitioners looking by evidence-based from mapping which AI-supported technology resulting possibly adopt information their on processes.

    '''

In [21]:
def chama_api_gera_paragrafo(artigos):
    
    resposta = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                {
                    "role": "system",
                    "content": "Você é uma bibliotecária, especialista em linguagem documentária (tesauros)."
                },
                {
                    "role": "user",
                    "content": f'''
                        Sou uma biblioteca que gostaria de recomendar novos papers para os usuários. 

                        A partir desses papers publicados na última semana:

                        [
                        {artigos}
                        ]

                        Crie um parágrafo inicial de um e-mail resumindo o conteúdo geral de todos os papers, como uma newsletter científica fazendo publicidade.

                        Dê continuidade a esse início:
                        
                        """
                        Nossa biblioteca está de volta para trazer as últimas descobertas e insights fresquinhos da pesquisa científica publicada na última semana. 
                        """
                        
                        Escreva apenas um parágrafo.
                         
                        Seja conciso.
                        
                        '''
                },
                ],
                temperature=1,
                max_tokens=256,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=1,
                stop=[]
            )
    
        
    return resposta

In [22]:
resposta = chama_api_gera_paragrafo(artigos)

In [23]:
paragrafo = resposta.get('choices')[0].get('message').get('content')

In [24]:
def formata_paragrafo(paragrafo):
    
    texto = paragrafo.replace('. ','.\n')

    linhas = texto.splitlines()
    linhas.insert(-1, '') #Adicionando espaço duplo na última linha

    texto_formatado = '\n'.join(linhas)
    
    return texto_formatado
    
    

In [25]:
print(formata_paragrafo(paragrafo))

Nossa biblioteca está de volta para trazer as últimas descobertas e insights fresquinhos da pesquisa científica publicada na última semana.
Nesta edição, exploramos diversos temas, desde o uso de inteligência artificial no tratamento de águas residuais até a aplicação de técnicas de IA na indústria alimentícia.
Também analisamos os benefícios e custos da inteligência artificial explicável no controle de qualidade visual e destacamos o potencial das ferramentas de escrita com inteligência artificial para auxiliar estudantes universitários na redação de projetos finais.
Além disso, investigamos a aplicação da inteligência artificial nos testes de software e suas contribuições para o desenvolvimento dessa área.

Continue lendo para ficar por dentro das últimas novidades científicas!
