# Recomendador de artigos - OpenAlex e GPT

In [1]:
import requests
import pandas as pd
from datetime import datetime,timedelta,date
import os

pd.options.display.max_columns = 999

In [3]:
def coletar_dados_e_salvar(data_inicial, data_fim, max_tentativas=5, timeout=60):
    """Download OpenAlex works for a publication-date range, one parquet file per page.

    Pages are requested from ``https://api.openalex.org/works`` with cursor
    paging (200 records per page, ``type:Article`` filter) until the response
    carries no ``next_cursor``. Each non-empty page is written to
    ``datasets_{data_inicial}_to_{data_fim}/registros_{data_inicial}_to_{data_fim}_{n}.parquet``.

    Parameters
    ----------
    data_inicial, data_fim
        Bounds interpolated into the ``from_publication_date`` /
        ``to_publication_date`` filter and into the output names.
    max_tentativas : int, default 5
        Maximum number of consecutive failures tolerated for a single page.
        When reached, the download stops instead of retrying forever.
    timeout : float, default 60
        Timeout in seconds passed to ``requests.get``.

    Returns
    -------
    None
        Results are written to disk; a summary of the error count is printed.
    """
    import time  # local import: only needed for the retry back-off

    cursor = '*'
    contador = 1
    contador_erro = 0
    tentativas_seguidas = 0

    diretorio = f'datasets_{data_inicial}_to_{data_fim}'
    os.makedirs(diretorio, exist_ok=True)

    url = 'https://api.openalex.org/works'

    while cursor is not None:
        # Passing the query through ``params`` lets requests URL-encode the cursor.
        parametros = {
            'filter': f'from_publication_date:{data_inicial},to_publication_date:{data_fim},type:Article',
            'per-page': 200,
            'cursor': cursor,
        }

        try:
            requisicao = requests.get(url, params=parametros, timeout=timeout)
            # Treat HTTP error statuses as failures instead of parsing their body.
            requisicao.raise_for_status()
            pagina_com_resultados = requisicao.json()
            proximo_cursor = pagina_com_resultados['meta'].get('next_cursor')

        except (requests.exceptions.RequestException, ValueError, KeyError) as e:
            contador_erro += 1
            tentativas_seguidas += 1
            print(f"Erro na página {contador} (Erro {contador_erro}): {e}")

            if tentativas_seguidas >= max_tentativas:
                # The same page keeps failing: give up rather than loop forever.
                print(f"Coleta interrompida na página {contador} após {tentativas_seguidas} tentativas seguidas.")
                break

            # Short exponential back-off before retrying the same cursor.
            time.sleep(min(2 ** tentativas_seguidas, 30))
            continue

        tentativas_seguidas = 0
        resultados = pagina_com_resultados.get('results', [])

        # Skip empty pages: an empty frame would be saved without any columns
        # and could not be processed like the other files.
        if resultados:
            df = pd.DataFrame(resultados)
            parquet_arquivo = os.path.join(diretorio, f'registros_{data_inicial}_to_{data_fim}_{contador}.parquet')
            df.to_parquet(parquet_arquivo, index=False)
            contador += 1

        cursor = proximo_cursor

    print(f"Total de erros: {contador_erro}")

In [2]:
def concatenar_arquivos_parquet(folder_path):
    """Concatenate every ``.parquet`` file in a folder and save the result.

    The combined frame is written to ``df_concatenado.parquet`` in the current
    working directory; nothing is returned.

    NOTE(review): a later cell in this notebook defines another function with
    the same name (that one returns the frame instead of saving it), so after
    running all cells this definition is shadowed.

    Parameters
    ----------
    folder_path : str
        Directory whose ``.parquet`` files are read.

    Raises
    ------
    ValueError
        If the folder contains no ``.parquet`` file.
    """
    # os.listdir returns names in arbitrary order; sort for a reproducible row order.
    nomes_arquivos = sorted(
        nome for nome in os.listdir(folder_path) if nome.endswith('.parquet')
    )

    if not nomes_arquivos:
        raise ValueError(f"Nenhum arquivo .parquet encontrado em '{folder_path}'")

    dataframes = [
        pd.read_parquet(os.path.join(folder_path, nome)) for nome in nomes_arquivos
    ]

    df_concatenado = pd.concat(dataframes, ignore_index=True)

    df_concatenado.to_parquet('df_concatenado.parquet', index=False)
    

In [3]:
def extrair_concepts_scores(df, nivel_maximo=3, score_minimo=0.51):
    """Expand the ``concepts`` column into one score column per concept name.

    For every row, each concept whose ``level`` is below ``nivel_maximo`` and
    whose ``score`` is above ``score_minimo`` becomes a column named after its
    ``display_name`` holding that score. Concepts absent from a row get 0, and
    scores are rounded to 4 decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``concepts`` column; each cell is an iterable of dicts
        with ``display_name``, ``level`` and ``score`` keys (or None/NaN).
    nivel_maximo : int, default 3
        Exclusive upper bound on the concept ``level``.
    score_minimo : float, default 0.51
        Exclusive lower bound on the concept ``score``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the concept score columns appended on the right.
    """
    concept_data = []

    for concepts in df['concepts']:
        # A missing concept list (None, or the float NaN pandas uses as a
        # placeholder) contributes no scores instead of raising.
        if concepts is None or isinstance(concepts, float):
            concepts = []

        concept_data.append({
            concept['display_name']: concept['score']
            for concept in concepts
            if concept['level'] < nivel_maximo and concept['score'] > score_minimo
        })

    # Reuse df's index so the column-wise concat below aligns row by row.
    df_concepts = pd.DataFrame(concept_data, index=df.index).fillna(0).round(4)

    return pd.concat([df, df_concepts], axis=1)

In [88]:
#coletar_dados_e_salvar(data_inicial,data_final)

Total de erros: 0


In [5]:
def get_source_name(primary_location):
    """Return ``primary_location['source']['display_name']``, or None.

    None is returned when ``primary_location`` is None, when it has no
    ``source`` entry (or that entry is None), or when the source has no
    ``display_name``.
    """
    if primary_location is None:
        return None

    fonte = primary_location.get('source')
    return None if fonte is None else fonte.get('display_name')



def transformar_df(df):
    """Reduce a raw works frame to the analysis columns plus concept scores.

    Keeps ``doi``, ``title``, ``publication_date``, ``created_date`` and
    ``cited_by_count``; derives ``source_title`` from ``primary_location``
    (via ``get_source_name``) and ``open_access_status`` from the ``is_oa``
    entry of ``open_access``; appends the concept score columns produced by
    ``extrair_concepts_scores``; and drops the three nested source columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the eight columns selected below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    colunas = ['doi','title','publication_date','created_date','primary_location','open_access','concepts','cited_by_count']

    # Copy so the assignments below never touch the caller's frame.
    df_filtrado = df.loc[:, colunas].copy()

    df_filtrado['source_title'] = df_filtrado['primary_location'].apply(get_source_name)

    # Tolerate a missing open_access entry, mirroring how get_source_name
    # tolerates a missing primary_location.
    df_filtrado['open_access_status'] = df_filtrado['open_access'].apply(
        lambda x: None if x is None else x.get('is_oa', None)
    )

    df_filtrado = extrair_concepts_scores(df_filtrado)

    return df_filtrado.drop(columns=['primary_location','open_access','concepts'])

In [6]:
def ler_e_transformar_arquivos_diretorio(diretorio_input, diretorio_output):
    """Apply ``transformar_df`` to every parquet file of a directory.

    Each ``.parquet`` file in ``diretorio_input`` is read, transformed and
    written under the same file name in ``diretorio_output`` (created when
    missing). A file that fails is reported with ``print`` and skipped, so the
    remaining files are still processed.

    Parameters
    ----------
    diretorio_input : str
        Directory containing the raw ``.parquet`` files.
    diretorio_output : str
        Directory that receives the processed files.

    Returns
    -------
    None
    """
    os.makedirs(diretorio_output, exist_ok=True)

    # Sorted so files are processed (and failures reported) in a stable order.
    arquivos = sorted(
        arquivo for arquivo in os.listdir(diretorio_input) if arquivo.endswith(".parquet")
    )

    for arquivo_parquet in arquivos:
        # The try/except is per file: one bad file must not abort the others.
        try:
            caminho_input = os.path.join(diretorio_input, arquivo_parquet)
            df_original = pd.read_parquet(caminho_input)
            df_processado = transformar_df(df_original)

            # The output keeps the input file name (it already ends in .parquet).
            caminho_output = os.path.join(diretorio_output, arquivo_parquet)

            df_processado.to_parquet(caminho_output, index=False)

        except Exception as error:
            print(arquivo_parquet, error)

In [7]:
def concatenar_arquivos_parquet(diretorio_input):
    """Read every ``.parquet`` file in a directory into one DataFrame.

    Parameters
    ----------
    diretorio_input : str
        Directory whose ``.parquet`` files are read.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no ``.parquet`` file.
    """
    # os.listdir returns names in arbitrary order; sort for a reproducible row order.
    nomes_arquivos = sorted(
        arquivo for arquivo in os.listdir(diretorio_input) if arquivo.endswith('.parquet')
    )

    if not nomes_arquivos:
        raise ValueError(f"Nenhum arquivo .parquet encontrado em '{diretorio_input}'")

    dataframes = [
        pd.read_parquet(os.path.join(diretorio_input, arquivo))
        for arquivo in nomes_arquivos
    ]

    return pd.concat(dataframes, ignore_index=True)

In [68]:
# Raw download folder and the folder that receives the processed files.
diretorio_input = 'datasets_2023-09-21_to_2023-09-28'
diretorio_output = 'datasets_2023-09-21_to_2023-09-28_processados'

# Transform every raw parquet file and write the result to the output folder.
ler_e_transformar_arquivos_diretorio(
    diretorio_input=diretorio_input,
    diretorio_output=diretorio_output,
)

___

In [9]:
#concatenar_arquivos_parquet('datasets_2023-09-21_to_2023-09-28_processados').to_parquet('df_concatenado.parquet')

In [12]:
# Load the consolidated dataset. The path is relative to the working directory
# (where this notebook writes 'df_concatenado.parquet') instead of an absolute
# home-directory path that only exists on one machine.
df_concatenado = pd.read_parquet('df_concatenado.parquet')

In [22]:
# Concept whose top-scoring works are listed; change only this value to
# inspect another concept column.
concept = 'Computer science'

# Works with a positive score for the concept, highest score first (top 20).
(
    df_concatenado
    .loc[
        df_concatenado[concept] > 0,
        ['doi', 'title', 'publication_date', 'cited_by_count', concept],
    ]
    .sort_values(by=concept, ascending=False)
    .head(20)
)

Unnamed: 0,doi,title,publication_date,cited_by_count,Computer science
7565,https://doi.org/10.1007/s11227-023-05637-x,Saliency-based dual-attention network for unsu...,2023-09-22,0,0.9236
9068,https://doi.org/10.1007/s11227-023-05662-w,Parallel implementations of post-quantum leigh...,2023-09-22,0,0.923
13324,https://doi.org/10.1007/s11042-023-16893-7,Fine-grained person-based image captioning via...,2023-09-23,0,0.9219
47434,https://doi.org/10.1007/s11042-023-16906-5,ORLEP: an efficient offline reinforcement lear...,2023-09-22,0,0.9214
41023,https://doi.org/10.1145/3604629,Prisma: A Tierless Language for Enforcing Cont...,2023-09-23,0,0.9212
28624,https://doi.org/10.1007/s10586-023-04138-z,Adaptive cloud resource allocation for large-s...,2023-09-25,0,0.9211
8489,https://doi.org/10.1007/s10489-023-04940-7,STemGAN: spatio-temporal generative adversaria...,2023-09-22,0,0.9204
37572,https://doi.org/10.1007/s11042-023-16820-w,Optimal Meta-Heuristic Elastic Scheduling (OME...,2023-09-26,0,0.9201
33661,https://doi.org/10.1007/s11042-023-16287-9,First-flexible interactive retrieval system fo...,2023-09-21,0,0.919
11745,https://doi.org/10.1007/s11042-023-17075-1,Intelligent and sustainable techniques for mul...,2023-09-26,0,0.9149


In [13]:
# Print every column name of the consolidated frame, one per line.
for nome_coluna in df_concatenado.columns.tolist():
    print(nome_coluna)

doi
title
publication_date
created_date
cited_by_count
source_title
open_access_status
Reliability engineering
Nuclear power plant
Fault (geology)
Failure mode and effects analysis
Transformational leadership
Psychology
Urban agglomeration
Environmental science
Corrosion
Water quality
Materials science
Coating
Composite material
Composite number
Adhesion
Biology
Genetics
Evolutionary biology
Population
Sarcoma
Cancer
Angiogenesis
Cancer research
Medicine
Immunosuppression
Drug delivery
Bioavailability
Pharmacology
Entropy (arrow of time)
Water resources
Computer science
Degree (music)
Athletes
Recreation
Calendula officinalis
Officinalis
Chemistry
Photosynthesis
Seedling
Botany
Transformative learning
Organic matter
Fusarium
Curing of tobacco
Gene
Photovoltaic system
Computer data storage
Kinematics
Torsion (gastropod)
Congruence (geometry)
Moiety
Alkyl
Dissolution
Gut flora
Obesity
Urbanization
Context (archaeology)
Urban planning
Environmental planning
China
Quality (philosophy)
Clim

In [13]:
# Ten works with the highest 'Library science' score.
(
    df_concatenado
    .sort_values(by='Library science', ascending=False)
    .loc[:, ['doi', 'title', 'publication_date', 'cited_by_count', 'Library science']]
    .head(10)
)

Unnamed: 0,doi,title,publication_date,cited_by_count,Library science
8804,https://doi.org/10.33463/2687-122x.2023.18(1-4...,Review of the manuscript of the bibliographic ...,2023-09-22,0,0.6488
14349,https://doi.org/10.11114/ijce.v6i2.6413,Reviewer Acknowledgements for International Jo...,2023-09-24,0,0.648
14238,https://doi.org/10.4103/1673-5374.385863,Neurovascular unit on a chip: the relevance an...,2023-09-22,0,0.6249
26319,https://doi.org/10.1080/15420353.2023.2219072,"Here Today, Here Tomorrow: A Case Study of Rev...",2023-09-25,0,0.6063
11071,https://doi.org/10.1021/acs.organomet.3c00352,Irina Beletskaya: Chemistry Excellence in Scie...,2023-09-25,0,0.6031
3964,https://doi.org/10.1021/acs.jpcb.3c05596,Tribute to Xiaoliang Sunney Xie,2023-09-21,0,0.5905
21513,https://doi.org/10.1002/fgc.33264,Doctoral Dissertation Research Improvement Gra...,2023-09-21,0,0.5902
2896,https://doi.org/10.1002/jmri.29018,Editorial for “Longitudinal Metabolite Changes...,2023-09-22,0,0.5893
21466,https://doi.org/10.1002/fgc.33243,Digital Humanities Advancement Grants (NEH),2023-09-21,0,0.5853
42284,https://doi.org/10.1002/fgc.33267,Postdoctoral Research Fellowships in Biology (...,2023-09-21,0,0.5828
