# Recomendador de artigos - OASIS

## Importação das bibliotecas

In [3]:
import pandas as pd
import os
import requests

## Funções

## Coleta de dados

# Revendo a coleta no OpenAlex

In [26]:
def coletar_dados_e_salvar(start_date, end_date, max_erros_consecutivos=5, timeout=30):
    """Download OpenAlex article records for a date range into parquet files.

    Pages through ``https://api.openalex.org/works`` using the ``cursor``
    query parameter (200 records per page), filtering by publication date
    and ``type:article``. Every non-empty page is saved as
    ``datasets_{start_date}_to_{end_date}/registros_{n}.parquet``, where
    ``n`` is the 1-based page number.

    Args:
        start_date: Lower bound for ``from_publication_date``; interpolated
            verbatim into the request filter and into the output folder name.
        end_date: Upper bound for ``to_publication_date``; used the same way.
        max_erros_consecutivos: How many failures in a row (network error,
            HTTP error status, or invalid JSON) are tolerated for one page
            before the collection is abandoned.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        None. Progress and the total error count are printed.
    """
    import time  # local import: only needed for the retry back-off below

    pasta_saida = f'datasets_{start_date}_to_{end_date}'
    os.makedirs(pasta_saida, exist_ok=True)

    cursor = '*'  # value used to request the first page
    contador = 1
    contador_erro = 0
    erros_consecutivos = 0

    while cursor is not None:
        url = (
            'https://api.openalex.org/works'
            f'?filter=from_publication_date:{start_date},'
            f'to_publication_date:{end_date},type:article'
            f'&per-page=200&cursor={cursor}'
        )

        try:
            requisicao = requests.get(url, timeout=timeout)
            # Turn HTTP error statuses into exceptions so they are retried
            # instead of being parsed as if they were a result page.
            requisicao.raise_for_status()
            pagina_com_resultados = requisicao.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            contador_erro += 1
            erros_consecutivos += 1
            print(f"Erro na página {contador} (Erro {contador_erro}): {e}")
            if erros_consecutivos >= max_erros_consecutivos:
                # Without this limit a persistent failure would retry the
                # same page forever.
                print(f"Coleta interrompida após {erros_consecutivos} erros consecutivos.")
                break
            time.sleep(min(2 ** erros_consecutivos, 30))
            continue

        erros_consecutivos = 0
        resultados = pagina_com_resultados.get('results', [])

        # An empty page would produce a parquet file with no columns, which
        # the later processing step cannot handle; skip writing it.
        if resultados:
            df = pd.DataFrame(resultados)
            parquet_arquivo = os.path.join(pasta_saida, f'registros_{contador}.parquet')
            df.to_parquet(parquet_arquivo, index=False)

        # A missing "meta" section is treated like a missing next cursor,
        # which ends the loop.
        cursor = (pagina_com_resultados.get('meta') or {}).get('next_cursor')

        contador += 1

    print(f"Total de erros: {contador_erro}")

In [37]:
def extrair_concepts_scores(df):
    """Append one score column per level-0 concept to ``df``.

    Each entry of ``df['concepts']`` is iterated as a collection of mappings
    with the keys ``'level'``, ``'display_name'`` and ``'score'``. Only
    entries whose ``'level'`` equals 0 are kept; their ``'display_name'``
    becomes a column name and their ``'score'`` the cell value.

    Returns:
        A new DataFrame: the original columns followed by the concept score
        columns, with missing scores filled with 0 and values rounded to
        4 decimal places.
    """
    pontuacoes_por_linha = [
        {
            item['display_name']: item['score']
            for item in lista_concepts
            if item['level'] == 0
        }
        for lista_concepts in df['concepts']
    ]

    # Reuse the input index so the column-wise concat below aligns row by row.
    tabela_scores = pd.DataFrame(pontuacoes_por_linha, index=df.index)
    tabela_scores = tabela_scores.fillna(0).round(4)

    return pd.concat([df, tabela_scores], axis=1)

def extrair_concepts(df):
    """Rewrite ``df['concepts']`` as a ';'-joined list of active concept names.

    Every column from position 5 onwards is treated as a concept score
    column. A concept is listed for a row when that row's value in the
    column is greater than 0; names appear in column order.

    The input DataFrame is modified in place and also returned.
    """
    colunas_candidatas = df.columns[5:]
    # Columns that are never positive cannot contribute a name to any row.
    ativos = [nome for nome in colunas_candidatas if df[nome].gt(0).any()]

    def _nomes_da_linha(linha):
        return ';'.join(nome for nome in ativos if linha[nome] > 0)

    df['concepts'] = df.apply(_nomes_da_linha, axis=1)

    return df


def processar_dataframe(df):
    """Reduce a raw works DataFrame to the columns used by the recommender.

    Steps, in order:
      1. keep only ``doi``, ``title``, ``publication_date``, ``open_access``
         and ``concepts``;
      2. replace each ``open_access`` value with its ``'is_oa'`` entry;
      3. add the concept score columns via ``extrair_concepts_scores``;
      4. rewrite ``concepts`` as a ';'-joined string via ``extrair_concepts``.

    Returns:
        The processed DataFrame.
    """
    colunas_mantidas = ['doi', 'title', 'publication_date', 'open_access', 'concepts']

    def _somente_is_oa(info_acesso):
        return info_acesso['is_oa']

    df = df.loc[:, colunas_mantidas]
    df['open_access'] = df['open_access'].apply(_somente_is_oa)

    # extrair_concepts relies on the score columns starting right after the
    # five columns kept above, so the order of these two calls matters.
    return extrair_concepts(extrair_concepts_scores(df))

In [55]:
def processar_e_salvar_parquet(input_folder, output_folder):
    """Run ``processar_dataframe`` over every parquet file in a folder.

    Each ``*.parquet`` file found directly inside ``input_folder`` is read,
    processed, and written to ``output_folder`` under the same file name.
    ``output_folder`` is created when it does not exist.

    A failure on one file is reported (the error, then the file name, are
    printed) and the remaining files are still processed.

    Args:
        input_folder: Folder containing the raw parquet files.
        output_folder: Folder that receives the processed parquet files.

    Returns:
        None.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on the order in which
    # the file system happens to list the directory.
    parquet_files = sorted(
        file for file in os.listdir(input_folder) if file.endswith(".parquet")
    )

    for parquet_file in parquet_files:
        input_path = os.path.join(input_folder, parquet_file)
        output_path = os.path.join(output_folder, parquet_file)

        # The try is per file: with a single try around the whole loop the
        # first bad file silently skipped every file after it.
        try:
            df_original = pd.read_parquet(input_path)
            df_processado = processar_dataframe(df_original)
            df_processado.to_parquet(output_path, index=False)
        except Exception as error:
            print(error)
            print(parquet_file)

In [27]:
def concatenar_arquivos_parquet(folder_path, output_path='df_concatenado.parquet'):
    """Concatenate every parquet file in a folder into a single parquet file.

    Args:
        folder_path: Folder whose ``*.parquet`` files are read.
        output_path: Where the combined file is written. Defaults to
            ``'df_concatenado.parquet'`` in the current working directory.

    Raises:
        ValueError: If ``folder_path`` contains no ``.parquet`` file.

    Returns:
        None.
    """
    # Sorted by name so the row order of the result is reproducible;
    # os.listdir returns entries in arbitrary order.
    nomes = sorted(
        filename for filename in os.listdir(folder_path) if filename.endswith('.parquet')
    )

    if not nomes:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not say which folder was empty.
        raise ValueError(f"Nenhum arquivo .parquet encontrado em '{folder_path}'")

    dataframes = [pd.read_parquet(os.path.join(folder_path, nome)) for nome in nomes]

    df_concatenado = pd.concat(dataframes, ignore_index=True)

    df_concatenado.to_parquet(output_path, index=False)

In [32]:
# Collect the works published from 2023-10-02 to 2023-10-08; the pages are
# written under datasets_2023-10-02_to_2023-10-08/.
coletar_dados_e_salvar('2023-10-02','2023-10-08')

Total de erros: 0


In [56]:
# Process every collected page and store the results in a sibling
# "_processados" folder.
processar_e_salvar_parquet('datasets_2023-10-02_to_2023-10-08', 'datasets_2023-10-02_to_2023-10-08_processados')

In [57]:
# Merge the processed pages into a single df_concatenado.parquet file
# (written to the current working directory).
concatenar_arquivos_parquet('datasets_2023-10-02_to_2023-10-08_processados')

In [58]:
# Load the concatenated dataset.
# NOTE(review): this is an absolute, machine-specific path, while
# concatenar_arquivos_parquet writes 'df_concatenado.parquet' relative to the
# working directory — presumably the same file; confirm before running elsewhere.
df = pd.read_parquet('/home/franciscofoz/Documents/GitHub/recomendador-artigos-OpenAlex-GPT/df_concatenado.parquet')


In [91]:
# Keep only the works with a non-zero score in the "Medicine" column.
df_filtrado = df.query('Medicine > 0')
# na=False: a missing title counts as "no match"; otherwise the mask contains
# NaN and pandas refuses to use it for boolean indexing.
# regex=False: the search term is a plain substring, not a pattern.
df_filtrado[df_filtrado['title'].str.contains('machine learning', na=False, regex=False)]

Unnamed: 0,doi,title,publication_date,open_access,concepts,Environmental science,Psychology,Computer science,Art,Business,Mathematics,Geography,Biology,Political science,Philosophy,History,Medicine,Physics,Materials science,Economics,Engineering,Sociology,Chemistry,Geology
142,https://doi.org/10.1002/jdd.13375,Leveraging machine learning to create user‐fri...,2023-10-02,False,Computer science;Medicine,0.0,0.0,0.4598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.471,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3159,https://doi.org/10.1007/s00261-023-04029-2,Radiomics-based machine learning and deep lear...,2023-10-03,False,Medicine;Computer science,0.0,0.0,0.1073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.7582,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3738,https://doi.org/10.1186/s12911-023-02307-z,Development of a machine learning-based acuity...,2023-10-03,True,Medicine;Computer science,0.0,0.0,0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4257,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4113,https://doi.org/10.3389/fonc.2023.1258970,Predicting head and neck cancer treatment outc...,2023-10-02,True,Medicine;Computer science,0.0,0.0,0.3564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4221,https://doi.org/10.1093/postmj/qgad087,Risk factors of major bleeding detected by mac...,2023-10-04,False,Medicine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9239,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6166,https://doi.org/10.3389/fcvm.2023.1189293,Segmenting computed tomograms for cardiac abla...,2023-10-02,True,Computer science;Medicine,0.0,0.0,0.3114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6533,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7117,https://doi.org/10.3389/fendo.2023.1108616,Identifying potential biomarkers for non-obstr...,2023-10-03,True,Computer science;Biology;Medicine,0.0,0.0,0.3024,0.0,0.0,0.0,0.0,0.4829,0.0,0.0,0.0,0.1342,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9911,https://doi.org/10.3892/etm.2023.12235,Evaluation of the accuracy of heart dose predi...,2023-10-02,True,Computer science;Medicine,0.0,0.0,0.2799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6212,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12762,https://doi.org/10.1038/s41598-023-43834-z,Identifying potential biomarkers of idiopathic...,2023-10-02,True,Medicine;Biology;Computer science,0.0,0.0,0.1327,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.549,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14507,https://doi.org/10.1186/s12911-023-02276-3,Opportunities and challenges of supervised mac...,2023-10-02,True,Computer science;Medicine,0.0,0.0,0.4722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.3516,0.0,0.0,0.0,0.0,0.0,0.0,0.0
