In [8]:
import shutil

import os
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

import pandas as pd

In [9]:
# Start from a clean slate: remove the "temp" folder (relative to the current
# working directory) if a previous run left one behind, and report what happened.
temp_is_present = os.path.exists("temp")
if not temp_is_present:
    print("Pasta 'temp' já não existe.")
else:
    shutil.rmtree("temp")
    print("Pasta 'temp' apagada com sucesso.")

Pasta 'temp' apagada com sucesso.


In [10]:
# ERIC search URL. The query OR-s together every prefix/suffix combination below
# (4 prefixes x 8 suffixes = 32 spelling variants), in prefix-major order.
# NOTE(review): pr=on / ft=on presumably restrict the search to peer-reviewed
# records with full text available - confirm against the ERIC search form.
_query_prefixes = ("inter", "multi", "pluri", "trans")
_query_suffixes = (
    "disciplinary",
    "disciplinarys",
    "disciplinarity",
    "disciplinarities",
    "disciplinar",
    "disciplinares",
    "discipline",
    "disciplines",
)
_query_terms = [
    prefix + suffix
    for prefix in _query_prefixes
    for suffix in _query_suffixes
]
url = "https://eric.ed.gov/?q=" + "+OR+".join(_query_terms) + "&pr=on&ft=on"

In [11]:
# Number of records requested per exported file; also the step between the
# start offsets typed into the export form below.
articles_per_page = 200

# All downloads land in ./temp (created on demand).
save_dir = os.path.join(os.getcwd(), "temp")
os.makedirs(save_dir, exist_ok=True)

options = webdriver.FirefoxOptions()
options.add_argument('--headless')
# NOTE(review): the next two look like Chromium switches; kept unchanged -
# confirm whether Firefox/geckodriver does anything with them.
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Save downloads into save_dir without showing a "save as" dialog.
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", save_dir)
options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")


def _downloads_settled(expected_files):
    """Return True once save_dir holds at least `expected_files` '.nbib' files
    and no '.part' file (Firefox's marker for a download still in progress)."""
    names = os.listdir(save_dir)
    if any(name.endswith('.part') for name in names):
        return False
    finished = [name for name in names if name.endswith('.nbib')]
    return len(finished) >= expected_files


driver = webdriver.Firefox(options=options)
try:
    # Polls every 0.1 s for up to 20 s (page-level waits).
    waiter = WebDriverWait(driver, 20, 0.1)
    # Separate, more patient wait for each exported file to reach the disk.
    download_waiter = WebDriverWait(driver, 120, 0.2)
    action = ActionChains(driver)

    driver.get(url)

    print('Waiting for page to load...')
    waiter.until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    print('Page loaded.')

    # The banner in #rr0 carries the hit count; thousands separators are
    # removed before the number is extracted.
    div_text = driver.find_element(By.ID, "rr0").text.replace(',', '')
    match = re.search(r'(?:all\s+)?(\d+)\s+results', div_text)
    if match is None:
        # Fail with the offending text instead of an AttributeError on None.
        raise RuntimeError(f"Could not read the number of results from: {div_text!r}")
    num = int(match.group(1))
    print(f"Number of results: {num}")

    # Open the export panel (the link is located by its inline onclick handler).
    export_link = driver.find_element(By.XPATH, '//a[@onclick="if(document.getElementById(\'divExportList\').style.display==\'block\'){document.getElementById(\'divExportList\').style.display=\'none\';}else{document.getElementById(\'divSaveList\').style.display=\'none\';document.getElementById(\'divExportList\').style.display=\'block\';}return(false);"]')
    action.move_to_element(export_link).click().perform()

    results_to_include = driver.find_element(By.ID, "selectExport")
    select = Select(results_to_include)
    select.select_by_value(str(articles_per_page))

    # Files already in save_dir (e.g. from re-running this cell) must not be
    # mistaken for freshly finished downloads.
    files_expected = sum(
        1 for name in os.listdir(save_dir) if name.endswith('.nbib')
    )

    for start in tqdm(range(1, num + 1, articles_per_page),
                      desc="Downloading files",
                      unit="file"):

        input_field = driver.find_element(By.ID, "inputExport")
        input_field.clear()
        input_field.send_keys(str(start))

        button = driver.find_element(By.XPATH, '//input[@type="submit" and @value="Create file"]')
        button.click()

        # Block until this file is fully written; otherwise later cells could
        # read partial or missing files.
        files_expected += 1
        download_waiter.until(
            lambda _d: _downloads_settled(files_expected),
            message=f"Download starting at record {start} did not finish in time.",
        )
finally:
    # Always release the headless browser, even when a step above fails.
    driver.quit()

Waiting for page to load...
Page loaded.
Number of results: 4466


Downloading files: 100%|██████████| 23/23 [00:03<00:00,  7.34file/s]


In [12]:
def parse_nbib(file_path):
    """Parse a tagged NBIB text file into a DataFrame.

    Records are separated by blank lines. Only the tags TI, AU, DP, JT, LA,
    LID and AB are kept; a tag line is recognised as ``TAG - value`` or
    ``TAG  - value``. A line starting with whitespace is treated as the
    continuation of the previous kept tag's value (wrapped text) and is
    appended to it; continuations of tags that are not kept are ignored.

    Args:
        file_path: Path of a UTF-8 encoded NBIB file.

    Returns:
        pandas.DataFrame with one row per record and the columns
        TI, AU, DP, JT, LA, LID, AB (all strings). Repeated AU lines are
        joined with "; " (no trailing separator); repeated values of any
        other tag are joined with a single space. Missing tags yield "".
        Records containing none of the kept tags are skipped.
    """
    fields = ("TI", "AU", "DP", "JT", "LA", "LID", "AB")
    data = {key: [] for key in fields}

    def new_record():
        return {key: [] for key in fields}

    def flush(record):
        # Emit the record only if at least one kept tag was seen.
        if not any(record.values()):
            return
        for key in fields:
            separator = "; " if key == "AU" else " "
            data[key].append(separator.join(record[key]).strip())

    record = new_record()
    last_key = None  # kept tag whose value a continuation line would extend

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if line.strip() == "":
                # Blank line: end of the current record.
                flush(record)
                record = new_record()
                last_key = None
                continue

            if line[0] in " \t":
                # Wrapped value: glue it onto the value being built.
                if last_key is not None:
                    extended = record[last_key][-1] + " " + line.strip()
                    record[last_key][-1] = extended.strip()
                continue

            # A new tag line ends any pending continuation, kept or not.
            last_key = None
            for key in fields:
                if line.startswith(key + " - ") or line.startswith(key + "  - "):
                    record[key].append(line[len(key) + 3:].strip())
                    last_key = key
                    break

    # The file may not end with a blank line; emit the trailing record.
    flush(record)

    return pd.DataFrame(data)

concatenated_filename = 'concatenated.nbib'
concatenated_path = os.path.join(save_dir, concatenated_filename)

# Individual downloads still waiting to be merged. Sorted so the record order
# does not depend on the arbitrary order returned by os.listdir.
part_files = sorted(
    file_name
    for file_name in os.listdir(save_dir)
    if file_name.endswith('.nbib') and file_name != concatenated_filename
)

# Rebuild the merged file only when there is something to merge (or nothing
# merged yet). The parts are deleted below, so re-running this cell would
# otherwise truncate the merged file and lose every record.
if part_files or not os.path.exists(concatenated_path):
    with open(concatenated_path, 'w', encoding='utf-8') as outfile:
        for file_name in part_files:
            file_path = os.path.join(save_dir, file_name)
            with open(file_path, 'r', encoding='utf-8') as infile:
                part_text = infile.read()
            outfile.write(part_text)
            if not part_text.endswith("\n"):
                # Terminate the last line so the separator below is a real
                # blank line and records of adjacent files do not merge.
                outfile.write("\n")
            outfile.write("\n")  # blank line between files

    # The parts are now redundant; only the merged file is kept.
    for file_name in part_files:
        os.remove(os.path.join(save_dir, file_name))

df = parse_nbib(concatenated_path)

excel_output_path = os.path.join(save_dir, 'nbib_data.xlsx')
df.to_excel(excel_output_path, index=False)

csv_gz_output_path = os.path.join(save_dir, 'nbib_data.csv.gz')
df.to_csv(csv_gz_output_path, index=False, compression='gzip')

if not df.empty:
    print("Arquivos concatenados com sucesso!")
    print(f"Total de registros no DataFrame: {len(df)}")
else:
    print("Falha ao concatenar os arquivos. O DataFrame está vazio.")

Arquivos concatenados com sucesso!
Total de registros no DataFrame: 4466


In [13]:
# Load the gzip-compressed CSV found at csv_gz_output_path into a DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_gz_output_path, compression="gzip")

# Word stems for the second group (matched as token prefixes).
palavras_or_2 = ['teach', 'educat', 'learn']
# Word stems for the first group (matched as token prefixes).
palavras_or_1 = [
    'physic',   # physics
    'chem',     # chemistry
    'biolog',   # biology
    'astronom', # astronomy
    'geolog',   # geology
    'scien',    # science
    'ecolog',   # ecology
]


def verifica_texto(row):
    """Tell whether a record mentions a stem of each group side by side.

    The 'TI' and 'AB' values of `row` are converted to strings, joined with a
    space, lower-cased and split on whitespace. The function returns True as
    soon as two consecutive tokens are found where one starts with a stem from
    palavras_or_1 and the other starts with a stem from palavras_or_2, in
    either order; otherwise it returns False.
    """
    # Read the stem lists at call time, as tuples accepted by str.startswith.
    group_one = tuple(palavras_or_1)
    group_two = tuple(palavras_or_2)

    words = " ".join((str(row['TI']), str(row['AB']))).lower().split()

    # Walk over every pair of neighbouring tokens.
    for left, right in zip(words, words[1:]):
        one_then_two = left.startswith(group_one) and right.startswith(group_two)
        two_then_one = left.startswith(group_two) and right.startswith(group_one)
        if one_then_two or two_then_one:
            return True
    return False

# Discard rows that repeat the same title/abstract pair.
df = df.drop_duplicates(subset=['TI', 'AB'])

# Keep only the rows for which verifica_texto returns True.
relevance_mask = df.apply(verifica_texto, axis=1)
df_filtrado = df[relevance_mask]

# Overwrite both exports (gzip CSV first, then Excel) with the filtered rows.
df_filtrado.to_csv(csv_gz_output_path, index=False, compression='gzip')
df_filtrado.to_excel(excel_output_path, index=False)

filtered_count = len(df_filtrado)
print(f"Total de artigos filtrados: {filtered_count}")


Total de artigos filtrados: 341


In [15]:
# Path of the text file that records the search and filter parameters.
output_txt_path = os.path.join(save_dir, 'informacoes.txt')

# One entry per output line; the empty first and last entries reproduce the
# leading and trailing newline of the written text.
summary_lines = [
    "",
    f"URL: {url}",
    f"Number of results: {num}",
    f"Palavras OR 1: {', '.join(palavras_or_1)}",
    f"Palavras OR 2: {', '.join(palavras_or_2)}",
    f"Total de artigos filtrados: {len(df_filtrado)}",
    "",
]
content = "\n".join(summary_lines)

# Write the summary to disk.
with open(output_txt_path, 'w', encoding='utf-8') as file:
    file.write(content)

print("Arquivo de informações da busca e filtragem salvo!")

Arquivo de informações da busca e filtragem salvo!
