# Recomendador de artigos - Open Alex e GPT

In [4]:
import requests
import pandas as pd
from datetime import datetime,timedelta,date
import os
import pyarrow.parquet as pq


pd.options.display.max_columns = 999

In [5]:
def coletar_dados_e_salvar(data_inicial,data_fim):
    
    cursor = '*'
            
    contador = 1
    contador_erro = 0
    
    diretorio = f'datasets_{data_inicial}_to_{data_fim}'
    
    if not os.path.exists(diretorio):
        os.makedirs(diretorio)
    
    while cursor != None:
        url = f'https://api.openalex.org/works?filter=from_publication_date:{data_inicial},to_publication_date:{data_fim},type:Article&per-page=200&cursor={cursor}'

        
        try:
            requisicao = requests.get(url)
            pagina_com_resultados = requisicao.json()
            
        except (requests.exceptions.RequestException, ValueError) as e:
            contador_erro += 1 
            print(f"Erro na página {contador} (Erro {contador_erro}): {e}")
            continue 
        
        resultados = pagina_com_resultados.get('results', [])
        
        df = pd.DataFrame(resultados)
        parquet_arquivo = os.path.join(diretorio, f'registros_{data_inicial}_to_{data_fim}_{contador}.parquet')
        df.to_parquet(parquet_arquivo, index=False)
        
        cursor = pagina_com_resultados['meta'].get('next_cursor')
        
        contador += 1
    
    print(f"Total de erros: {contador_erro}")

In [6]:
def concatenar_arquivos_parquet(folder_path):

    dataframes = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.parquet'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_parquet(file_path)
            dataframes.append(df)

    df_concatenado = pd.concat(dataframes, ignore_index=True)
    
    df_concatenado.to_parquet('df_concatenado.parquet',index=False)
    

In [7]:
def extrair_concepts_scores(df):
    concept_data = []

    for concepts in df['concepts']:
        concept_scores = {}
        for concept in concepts:
            if concept['score'] > 0.75:
                concept_scores[concept['display_name']] = concept['score']
        concept_data.append(concept_scores)

    df_concepts = pd.DataFrame(concept_data, index=df.index).fillna(0).round(4)

    df_final = pd.concat([df, df_concepts], axis=1)

    return df_final

In [8]:
def get_source_name(primary_location):
    if primary_location is None:
        return None
    source_info = primary_location.get('source', None)
    
    if source_info is None:
        return None
    display_name = source_info.get('display_name', None)
    
    return display_name



def transformar_df(df):
    
    df = df.loc[:,['doi','title','publication_date','created_date','primary_location','open_access','concepts','cited_by_count']]
    
    df_filtrado = df.copy()
    
    df_filtrado.loc[:,'source_title'] = df_filtrado['primary_location'].apply(get_source_name)
    
    df_filtrado.loc[:,'open_access_status'] = df_filtrado['open_access'].apply(lambda x: x.get('is_oa',None))
    
    df_filtrado = extrair_concepts_scores(df_filtrado)
    
    df_filtrado.drop(columns=['primary_location','open_access','concepts'],inplace=True)
    
    return df_filtrado

In [9]:
def ler_e_transformar_arquivos_diretorio(diretorio_input, diretorio_output):
    
    if not os.path.exists(diretorio_output):
        os.makedirs(diretorio_output)
    
    arquivos = [arquivo for arquivo in os.listdir(diretorio_input) if arquivo.endswith(".parquet")]
    
    try:
        for arquivo_parquet in arquivos:
            caminho_input = os.path.join(diretorio_input, arquivo_parquet)
            df_original = pd.read_parquet(caminho_input)  
            df_processado = transformar_df(df_original) 
            
            output_file = os.path.splitext(arquivo_parquet)[0] + ".parquet"
            caminho_output = os.path.join(diretorio_output, output_file)
            
            df_processado.to_parquet(caminho_output, index=False)

    except Exception as error:
        print(arquivo_parquet,error)

In [10]:
def concatenar_arquivos_parquet(diretorio_input):
    dfs = []

    for arquivo in os.listdir(diretorio_input):
        if arquivo.endswith('.parquet'):
            file_path = os.path.join(diretorio_input, arquivo)
            
            # Ler o arquivo Parquet
            df_parquet = pd.read_parquet(file_path)
            
            # Adicionar o DataFrame à lista de DataFrames
            dfs.append(df_parquet)

    # Concatenar os DataFrames verticalmente (por linhas)
    df_concatenado = pd.concat(dfs, ignore_index=True)

    return df_concatenado

In [11]:
data_inicial = '2023-09-23'
data_final = '2023-09-30'

coletar_dados_e_salvar(data_inicial,data_final)

Total de erros: 0


In [12]:
diretorio_input = 'datasets_2023-09-23_to_2023-09-30'
diretorio_output = 'datasets_2023-09-23_to_2023-09-30_processados'

In [13]:
ler_e_transformar_arquivos_diretorio(diretorio_input,diretorio_output)

registros_2023-09-23_to_2023-09-30_365.parquet "None of [Index(['doi', 'title', 'publication_date', 'created_date', 'primary_location',\n       'open_access', 'concepts', 'cited_by_count'],\n      dtype='object')] are in the [columns]"


In [None]:
df_concatenado = concatenar_arquivos_parquet(diretorio_output)

In [None]:
df_concatenado.to_parquet('df_concatenado.parquet')

___

In [9]:
df_concatenado = pd.read_parquet('df_concatenado.parquet')

In [10]:
df_concatenado.iloc[:,7:] = df_concatenado.iloc[:,7:].fillna(0)


In [11]:
concepts = df_concatenado.columns[7:]

def create_concept_column(row):
    concepts_list = [concept for concept in concepts if row[concept] > 0]
    return ';'.join(concepts_list)

# Aplica a função para criar a nova coluna
df_concatenado['Concepts'] = df_concatenado.apply(create_concept_column, axis=1)

In [12]:
df_concatenado['Concepts'].value_counts(normalize=True)

Concepts
                                          0.474710
Medicine                                  0.118301
Computer science                          0.043649
Biology                                   0.033010
Materials science                         0.032813
                                            ...   
Operating system                          0.000015
Geography;Humanities                      0.000015
Anthropology                              0.000015
Computer science;Mathematics education    0.000015
Biological system                         0.000015
Name: proportion, Length: 586, dtype: float64

In [30]:
concept = 'Statistics'

df_concatenado.query('`Statistics` > 0 ')[
    ['doi','title','publication_date','cited_by_count',concept]
    ].sort_values(by='Statistics',ascending=False)

Unnamed: 0,doi,title,publication_date,cited_by_count,Statistics
3172,https://doi.org/10.1155/2023/6676767,Censoring Balancing Functions for Undetected P...,2023-09-26,0,0.7511
60513,https://doi.org/10.18203/issn.2454-2156.intjsc...,Chain ratio and product estimators for populat...,2023-09-27,0,0.7456
57139,https://doi.org/10.1002/bimj.202200137,Spearman‐like correlation measure adjusting fo...,2023-09-27,0,0.7274
52236,https://doi.org/10.28951/bjb.v41i3.637,Ordinal data and residual analysis: Review and...,2023-09-26,0,0.7233
52350,https://doi.org/10.1186/s13063-023-07648-8,Evaluation of negative binomial and zero-infla...,2023-09-27,0,0.714
54630,https://doi.org/10.1080/16843703.2023.2259589,Deviance residual-based Shewhart control chart...,2023-09-22,0,0.7092
30484,https://doi.org/10.1177/09622802231198795,Logistic regression vs. predictive mean matchi...,2023-09-26,0,0.7058
62323,https://doi.org/10.1080/03610926.2023.2258426,Exponentially quantile regression-ratio-type e...,2023-09-25,0,0.6805
36511,https://doi.org/10.1002/jrsm.1673,Accuracy and precision of fixed and random eff...,2023-09-26,0,0.6786
26419,https://doi.org/10.1111/1556-4029.15387,A diagnosis of the primary difference between ...,2023-09-27,0,0.6686
