# Recomendador de artigos - OpenAlex e GPT

In [1]:
import requests
import pandas as pd
from datetime import datetime,timedelta,date
import os

pd.options.display.max_columns = 999

In [2]:
#Se eu escolher extrair por mês

def obter_datas_mes(mes, ano):
    """Return the first and last day of a month as 'YYYY-MM-DD' strings.

    Args:
        mes: Month number (1-12).
        ano: Four-digit year.

    Returns:
        A tuple ``(first_day, last_day)`` of formatted date strings.
    """
    formato = '%Y-%m-%d'
    inicio = datetime(ano, mes, 1)

    # The last day of the month is the day before the 1st of the next month;
    # December rolls over into January of the following year.
    ano_seguinte, mes_seguinte = (ano + 1, 1) if mes == 12 else (ano, mes + 1)
    fim = datetime(ano_seguinte, mes_seguinte, 1) - timedelta(days=1)

    return inicio.strftime(formato), fim.strftime(formato)

#Se eu escolher extrair por semana

def obter_data_semana():
    """Return the date seven days ago and today's date as 'YYYY-MM-DD' strings.

    Returns:
        A tuple ``(start, end)`` where ``end`` is today and ``start`` is
        seven days before it.
    """
    hoje = date.today()
    sete_dias_atras = hoje - timedelta(days=7)
    return tuple(dia.strftime('%Y-%m-%d') for dia in (sete_dias_atras, hoje))


In [3]:
def coletar_dados_e_salvar(data_inicial, data_fim, max_tentativas=5, timeout=30):
    """Download OpenAlex works of type Article for a date range, one Parquet file per page.

    Pages are fetched with cursor pagination (200 records per page) and each
    non-empty page is written to
    ``datasets_{data_inicial}_to_{data_fim}/registros_{data_inicial}_to_{data_fim}_{n}.parquet``.

    Args:
        data_inicial: Start publication date, formatted as 'YYYY-MM-DD'.
        data_fim: End publication date, formatted as 'YYYY-MM-DD'.
        max_tentativas: Maximum number of consecutive failed requests for the
            same page before the collection is aborted. Without this limit a
            persistent failure would retry the same cursor forever.
        timeout: Seconds to wait for each HTTP response before treating the
            request as failed.

    Returns:
        None. Progress and the total number of errors are printed.
    """
    diretorio = f'datasets_{data_inicial}_to_{data_fim}'
    os.makedirs(diretorio, exist_ok=True)

    cursor = '*'  # '*' asks the API for the first page
    contador = 1
    contador_erro = 0
    falhas_consecutivas = 0

    while cursor is not None:
        url = f'https://api.openalex.org/works?filter=from_publication_date:{data_inicial},to_publication_date:{data_fim},type:Article&per-page=200&cursor={cursor}'

        try:
            requisicao = requests.get(url, timeout=timeout)
            # Turn 4xx/5xx responses into exceptions so an error payload is
            # retried instead of being parsed as a page of results.
            requisicao.raise_for_status()
            pagina_com_resultados = requisicao.json()

        except (requests.exceptions.RequestException, ValueError) as e:
            contador_erro += 1
            falhas_consecutivas += 1
            print(f"Erro na página {contador} (Erro {contador_erro}): {e}")
            if falhas_consecutivas >= max_tentativas:
                print(f"Abortando após {max_tentativas} falhas consecutivas na página {contador}.")
                break
            continue  # retry the same cursor

        falhas_consecutivas = 0

        resultados = pagina_com_resultados.get('results', [])

        # Skip empty pages: a DataFrame with no rows has no columns either,
        # and such a file would fail in the later transformation step.
        if resultados:
            df = pd.DataFrame(resultados)
            parquet_arquivo = os.path.join(diretorio, f'registros_{data_inicial}_to_{data_fim}_{contador}.parquet')
            df.to_parquet(parquet_arquivo, index=False)
            contador += 1

        # A missing 'meta' block or a null 'next_cursor' both end the loop.
        cursor = (pagina_com_resultados.get('meta') or {}).get('next_cursor')

    print(f"Total de erros: {contador_erro}")

In [4]:
def concatenar_arquivos_parquet(folder_path, output_path='df_concatenado.parquet'):
    """Concatenate every ``.parquet`` file in a folder into a single Parquet file.

    Args:
        folder_path: Directory whose ``.parquet`` files are read.
        output_path: Where the concatenated frame is written. Defaults to
            ``'df_concatenado.parquet'`` in the current working directory.

    Returns:
        The concatenated DataFrame (also written to ``output_path``).

    Raises:
        ValueError: If ``folder_path`` contains no ``.parquet`` files to read.
    """
    destino = os.path.abspath(output_path)

    # Sort for a deterministic (lexicographic) row order, and skip the output
    # file itself so a re-run with the output inside folder_path does not
    # duplicate the data.
    arquivos = sorted(
        nome for nome in os.listdir(folder_path)
        if nome.endswith('.parquet')
        and os.path.abspath(os.path.join(folder_path, nome)) != destino
    )

    if not arquivos:
        raise ValueError(f"Nenhum arquivo .parquet encontrado em {folder_path!r}")

    dataframes = [pd.read_parquet(os.path.join(folder_path, nome)) for nome in arquivos]

    df_concatenado = pd.concat(dataframes, ignore_index=True)
    df_concatenado.to_parquet(output_path, index=False)

    return df_concatenado
    

In [5]:
def extrair_concepts_scores(df, max_level=3, min_score=0.51):
    """Expand the ``concepts`` column into one score column per selected concept.

    For each row, every concept whose ``level`` is below ``max_level`` and
    whose ``score`` is above ``min_score`` becomes a column named after the
    concept's ``display_name``, holding the score rounded to 4 decimals.
    Concepts that do not apply to a row are filled with 0.

    Args:
        df: DataFrame with a ``concepts`` column. Each cell is an iterable of
            mappings with ``display_name``, ``level`` and ``score`` keys, or
            None/NaN when the row has no concepts.
        max_level: Exclusive upper bound on the concept ``level``.
        min_score: Exclusive lower bound on the concept ``score``.

    Returns:
        A new DataFrame: the columns of ``df`` followed by the concept columns.
    """
    concept_data = []

    for concepts in df['concepts']:
        # Missing values show up as None or float NaN; treat both as "no concepts"
        # instead of failing when iterating over them.
        if concepts is None or isinstance(concepts, float):
            concepts = ()

        concept_data.append({
            concept['display_name']: concept['score']
            for concept in concepts
            if concept['level'] < max_level and concept['score'] > min_score
        })

    # Reuse df's index so the column-wise concat below aligns row by row.
    df_concepts = pd.DataFrame(concept_data, index=df.index).fillna(0).round(4)

    return pd.concat([df, df_concepts], axis=1)

In [83]:
# Extraction window: from seven days ago through today, both as 'YYYY-MM-DD' strings.
data_inicial , data_final = obter_data_semana()

In [88]:
#coletar_dados_e_salvar(data_inicial,data_final)

Total de erros: 0


In [67]:
def get_source_name(primary_location):
    """Return the ``display_name`` of the source inside a primary-location mapping.

    Args:
        primary_location: Mapping that may hold a ``'source'`` mapping, or None.

    Returns:
        The source's ``'display_name'`` value, or None when the location, the
        source, or the name is missing.
    """
    if primary_location is None:
        return None

    fonte = primary_location.get('source')
    return None if fonte is None else fonte.get('display_name')



def transformar_df(df):
    """Reduce a raw works DataFrame to the analysis columns plus concept scores.

    Keeps doi, title, publication/creation dates and citation count, derives
    ``source_title`` from ``primary_location`` and ``open_access_status`` from
    ``open_access``, appends one score column per selected concept, and drops
    the nested source columns.

    Args:
        df: DataFrame containing at least the columns ``doi``, ``title``,
            ``publication_date``, ``created_date``, ``primary_location``,
            ``open_access``, ``concepts`` and ``cited_by_count``.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    colunas = ['doi', 'title', 'publication_date', 'created_date',
               'primary_location', 'open_access', 'concepts', 'cited_by_count']

    # Work on a copy so the assignments below never touch the caller's frame.
    df_filtrado = df.loc[:, colunas].copy()

    df_filtrado['source_title'] = df_filtrado['primary_location'].apply(get_source_name)

    # A missing open_access value yields None rather than an AttributeError,
    # mirroring how a missing primary_location is handled.
    df_filtrado['open_access_status'] = df_filtrado['open_access'].apply(
        lambda oa: None if oa is None else oa.get('is_oa', None)
    )

    df_filtrado = extrair_concepts_scores(df_filtrado)

    return df_filtrado.drop(columns=['primary_location', 'open_access', 'concepts'])

In [32]:
def ler_e_transformar_arquivos_diretorio(diretorio_input, diretorio_output):
    """Transform every ``.parquet`` file in a directory and save the results.

    Each input file is read, passed through ``transformar_df`` and written to
    ``diretorio_output`` under the same file name. A failure in one file is
    reported and does not stop the remaining files from being processed.

    Args:
        diretorio_input: Directory containing the raw ``.parquet`` files.
        diretorio_output: Directory for the transformed files; created if it
            does not exist.

    Returns:
        None. Files that fail are reported with ``print``.
    """
    os.makedirs(diretorio_output, exist_ok=True)

    arquivos = [arquivo for arquivo in os.listdir(diretorio_input) if arquivo.endswith(".parquet")]

    for arquivo_parquet in arquivos:
        # The try lives inside the loop so one bad file is reported and
        # skipped instead of aborting the whole batch.
        try:
            caminho_input = os.path.join(diretorio_input, arquivo_parquet)
            df_original = pd.read_parquet(caminho_input)
            df_processado = transformar_df(df_original)

            # Output keeps the input file name (already ends in .parquet).
            caminho_output = os.path.join(diretorio_output, arquivo_parquet)
            df_processado.to_parquet(caminho_output, index=False)

        except Exception as error:
            print(arquivo_parquet, error)

In [68]:
# Input folder holding the raw parquet pages and output folder for the transformed ones.
# NOTE(review): the date range is hard-coded here, so it will not follow
# data_inicial/data_final computed earlier — confirm this is intended.
diretorio_input = 'datasets_2023-09-21_to_2023-09-28'
diretorio_output = 'datasets_2023-09-21_to_2023-09-28_processados'

ler_e_transformar_arquivos_diretorio(diretorio_input,diretorio_output)

## CONCATENAR ARQUIVOS PARQUET NO DIRETÓRIO

___

In [19]:
# NOTE(review): `df` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel; presumably it was loaded from one of the raw
# parquet pages — confirm and add the loading cell.
transformar_df(df)

Unnamed: 0,doi,title,publication_date,created_date,cited_by_count,source_title,open_access_status,Euclidean geometry,Algorithm,Cancer research,Medicine,Uveitis,Guideline,Electric field,Bistability,Genetics,Biology,Pathogenicity,Computational biology,Cirrhosis,Lesion,Computer science,Face (sociological concept),Neuroscience,Autism,Microbiology,Chemistry,Dash,Wrist,Prosthesis,Surgery,Climatology,Pregnancy,Insulin,Endocrinology,Internal medicine,Follicular phase,Subcommissural organ,Cell biology,Cerebrospinal fluid,Atlas (anatomy),Artificial intelligence,Brain atlas,Neuroimaging,Gravitational wave,Physics,Binary number,Immune system,Immunology,Antigen,Vaccination,Population,Fusion,Psychology,Operationalization,Conceptualization,Compassion,Interpersonal communication,Mangrove,Biodiversity,Ecosystem,Environmental resource management,Tribe,Genus,Discontinuation,Stimulation,Gastroenterology,Serviceability (structure),Cantilever,Structural engineering,Stiffness,Limit state design,Gut flora,Conformal map,Mathematics,Curvature,Constant (computer programming),Tournament,Combinatorics
48,https://doi.org/10.2140/apde.2023.16.1485,Simplices in thin subsets of Euclidean spaces,2023-09-21,2020-09-14,1,Analysis & PDE,True,0.5652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,https://doi.org/10.1007/s00222-023-01220-6,On the birational section conjecture with stro...,2023-09-26,2021-09-13,1,Inventiones Mathematicae,True,0.0,0.5585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,https://doi.org/10.1002/hep.32781,c‐Rel–dependent Chk2 signaling regulates the D...,2023-09-27,2022-09-14,1,Hepatology,True,0.0,0.0,0.713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,https://doi.org/10.5507/bp.2022.038,Trends in management of ocular syphilis in ter...,2023-09-21,2022-09-19,1,Biomedical Papers of the Faculty of Medicine o...,True,0.0,0.0,0.0,0.9037,0.8388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95,https://doi.org/10.1253/circj.cj-22-0794,JCS/JSCVS/JATS/JSVS 2020 Guideline on Diagnosi...,2023-09-25,2023-09-01,1,Circulation journal,True,0.0,0.0,0.0,0.7069,0.0,0.7331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,https://doi.org/10.1088/1478-3975/acf8a4,Universal calcium fluctuations in hydra morpho...,2023-09-22,2023-09-12,1,Physical Biology,True,0.0,0.0,0.0,0.0,0.0,0.0,0.5553,0.5521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,https://doi.org/10.1126/science.adg7492,Accurate proteome-wide missense variant effect...,2023-09-22,2023-09-20,1,Science,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5898,0.581,0.5745,0.5123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,https://doi.org/10.1007/s00330-023-10226-w,Hepatobiliary phase imaging in cirrhotic patie...,2023-09-21,2023-09-21,1,European Radiology,False,0.0,0.0,0.0,0.7577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7257,0.5283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,https://doi.org/10.1142/s0218213023500689,Development of Optimal Hyper-parameter Tuning-...,2023-09-21,2023-09-21,1,International Journal on Artificial Intelligen...,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8565,0.5756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,https://doi.org/10.7554/elife.83223,Dynamic top-down biasing implements rapid adap...,2023-09-21,2023-09-22,1,eLife,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5612,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
