In [None]:
# last updated: 2023-10-01

In [14]:
#import stuff

import os
import re
import shutil
import time
from collections import defaultdict

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

print("stuff imported 🤓")

stuff imported 🤓


In [None]:
# delete temp folder if exists

# Start from a clean slate: remove whatever a previous run left in ./temp.
if not os.path.exists("temp"):
    print("temp folder does not exist 🤓")
else:
    shutil.rmtree("temp")
    print("temp folder deleted 🤓")

temp folder deleted 🤓


In [None]:
# Search URL on the ERIC website: run the search in a browser, then copy the
# resulting address and paste it here. The scraping cell opens this URL as-is.

url = "https://eric.ed.gov/?q=interdisciplinary&pr=on&ff1=dtySince_2024"

print("url defined 🤓")

url defined 🤓


In [None]:
# scrapes the ERIC website to download the .nbib export files

# Number of records requested per export file (value chosen in the site's
# "selectExport" drop-down).
articles_per_page = 200

# Upper bound, in seconds, on how long to wait for the browser to finish
# writing the export files before giving up.
download_timeout = 120

save_dir = os.path.join(os.getcwd(), "temp")
os.makedirs(save_dir, exist_ok=True)


def _downloads_finished(folder, expected_count):
    """Return True once `folder` holds at least `expected_count` .nbib files
    and no partial download is left.

    Assumes Firefox keeps a ``.part`` file next to a download while it is
    still being written (its usual behaviour).
    """
    names = os.listdir(folder)
    if any(name.endswith('.part') for name in names):
        return False
    return sum(name.endswith('.nbib') for name in names) >= expected_count


# .nbib files already present (e.g. a previous concatenation) must not be
# mistaken for files downloaded by this run.
nbib_before = sum(name.endswith('.nbib') for name in os.listdir(save_dir))

options = webdriver.FirefoxOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Save downloads straight into save_dir without showing a dialog.
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.dir", save_dir)
options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/octet-stream")

driver = webdriver.Firefox(options=options)
try:
    waiter = WebDriverWait(driver, 20, 0.1)
    action = ActionChains(driver)

    driver.get(url)

    print('waiting for page to load...')
    waiter.until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )
    print('page loaded')

    # The element with id "rr0" carries the "... N results" summary; commas
    # are removed so thousands separators do not break the integer match.
    div_text = driver.find_element(By.ID, "rr0").text.replace(',', '')
    match = re.search(r'(?:all\s+)?(\d+)\s+results', div_text)
    if match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"could not find the number of results in: {div_text!r}")
    num = int(match.group(1))
    print(f"number of results: {num}")

    # The export link is located by its complete inline onclick handler.
    export_link = driver.find_element(By.XPATH, '//a[@onclick="if(document.getElementById(\'divExportList\').style.display==\'block\'){document.getElementById(\'divExportList\').style.display=\'none\';}else{document.getElementById(\'divSaveList\').style.display=\'none\';document.getElementById(\'divExportList\').style.display=\'block\';}return(false);"]')
    action.move_to_element(export_link).click().perform()

    results_to_include = driver.find_element(By.ID, "selectExport")
    select = Select(results_to_include)
    select.select_by_value(str(articles_per_page))

    # One export per batch: records start, start + articles_per_page, ...
    batch_starts = range(1, num + 1, articles_per_page)

    for start in tqdm(batch_starts,
                      desc="downloading files",
                      unit="file"):

        input_field = driver.find_element(By.ID, "inputExport")
        input_field.clear()
        input_field.send_keys(str(start))

        button = driver.find_element(By.XPATH, '//input[@type="submit" and @value="Create file"]')
        button.click()

    # Clicking only *starts* each download. Wait until every file is on disk
    # so the next cell does not concatenate an incomplete set, and so that
    # quitting the browser below does not abort a transfer.
    expected_files = nbib_before + len(batch_starts)
    deadline = time.monotonic() + download_timeout
    while time.monotonic() < deadline:
        if _downloads_finished(save_dir, expected_files):
            print("download completed 🤓")
            break
        time.sleep(0.2)
    else:
        print(f"timed out after {download_timeout}s waiting for {len(batch_starts)} .nbib file(s); "
              "check the temp folder before continuing")
finally:
    # Always release the browser, even when a step above raised.
    driver.quit()

waiting for page to load...
page loaded
number of results: 1175


downloading files: 100%|██████████| 6/6 [00:01<00:00,  4.89file/s]

download completed 🤓





In [None]:
# concatenate all .nbib files into one file and create a .xlsx and .csv.gz with it

def parse_nbib_all_fields(file_path):
    """Parse an .nbib file into a DataFrame with one row per record.

    Records are separated by blank lines. A line of the form ``TAG - text``
    (2 to 4 uppercase letters) starts a field; a tag repeated inside the same
    record has its values joined with "; ". Any other non-blank line is
    treated as a continuation and appended, after a space, to the most
    recent field of the current record.

    Args:
        file_path: path of the UTF-8 encoded .nbib file to read.

    Returns:
        pandas.DataFrame with one column per tag found in the file.
    """
    field_pattern = re.compile(r"^([A-Z]{2,4})\s+-\s+(.*)")

    records = []
    record = {}
    active_tag = None

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # Blank line: close the record being built, if there is one.
            if not stripped:
                if record:
                    records.append(record)
                    record = {}
                    active_tag = None
                continue

            field = field_pattern.match(raw_line)

            # Not a "TAG - text" line: continuation of the previous field.
            if field is None:
                if active_tag:
                    record[active_tag] += " " + stripped
                continue

            active_tag, value = field.groups()
            value = value.strip()
            if active_tag in record:
                record[active_tag] += "; " + value
            else:
                record[active_tag] = value

    # The file may end without a trailing blank line.
    if record:
        records.append(record)

    return pd.DataFrame(records)

concatenated_filename = 'concatenated.nbib'
concatenated_path = os.path.join(save_dir, concatenated_filename)

# List the downloaded exports once. Sorting makes the concatenation order
# (and therefore the row order of the table) reproducible, since os.listdir
# returns names in arbitrary order.
nbib_paths = sorted(
    os.path.join(save_dir, file_name)
    for file_name in os.listdir(save_dir)
    if file_name.endswith('.nbib') and file_name != concatenated_filename
)

# Only (re)write the concatenated file when there is something to put in it.
# The sources are deleted below, so re-running this cell would otherwise
# truncate a perfectly good concatenated file to zero bytes.
if nbib_paths or not os.path.exists(concatenated_path):
    with open(concatenated_path, 'w', encoding='utf-8') as outfile:
        for file_path in nbib_paths:
            with open(file_path, 'r', encoding='utf-8') as infile:
                outfile.write(infile.read())
            # Two newlines guarantee a blank separator line even when a source
            # file does not end with a newline; otherwise its last record
            # would merge with the first record of the next file. Extra blank
            # lines are ignored by parse_nbib_all_fields.
            outfile.write("\n\n")

# The individual exports are no longer needed once merged.
for file_path in nbib_paths:
    os.remove(file_path)

df = parse_nbib_all_fields(concatenated_path)

excel_output_path = os.path.join(save_dir, 'nbib_data.xlsx')
df.to_excel(excel_output_path, index=False)

csv_gz_output_path = os.path.join(save_dir, 'nbib_data.csv.gz')
df.to_csv(csv_gz_output_path, index=False, compression='gzip')

if not df.empty:
    print("the .nbib files concatenated successfully, .xlsx and .csv.gz files created 🤓")
else:
    print("failed to concatenate files, DataFrame is empty... probably... 🤓")

.nbib files concatenated successfully, .xlsx and .csv.gz files created 🤓


In [None]:
# searches and filters the articles based on keywords

# Reload the table from the gzip-compressed CSV at csv_gz_output_path, so the
# filter below works on what was saved to disk.
df = pd.read_csv(csv_gz_output_path, compression='gzip')

# Keyword prefixes. A record is kept when a word starting with a prefix from
# one list is immediately followed by a word starting with a prefix from the
# other list (in either order).
palavras_or_2 = ['teach', 'educat', 'learn']
palavras_or_1 = ['physic']

def verifica_texto(row):
    """Tell whether a record's title/abstract has adjacent keywords from both lists.

    The 'TI' and 'AB' fields are joined, lower-cased and split on whitespace;
    every pair of consecutive tokens is then checked against the prefixes in
    ``palavras_or_1`` and ``palavras_or_2``.

    Args:
        row: mapping-like record (pandas Series or dict) that may contain the
            keys 'TI' and 'AB'.

    Returns:
        True if some token starting with a ``palavras_or_1`` prefix is directly
        next to a token starting with a ``palavras_or_2`` prefix, else False.
    """
    # A missing key or a NaN/None value counts as empty text, so it neither
    # raises KeyError nor injects a spurious "nan" token into the text.
    campos = (row.get('TI'), row.get('AB'))
    texto = " ".join("" if pd.isna(campo) else str(campo) for campo in campos).lower()
    tokens = texto.split()

    # str.startswith accepts a tuple of prefixes.
    grupo_1 = tuple(palavras_or_1)
    grupo_2 = tuple(palavras_or_2)

    for t1, t2 in zip(tokens, tokens[1:]):
        if t1.startswith(grupo_1) and t2.startswith(grupo_2):
            return True
        if t1.startswith(grupo_2) and t2.startswith(grupo_1):
            return True
    return False

# Records sharing both title and abstract are treated as one article.
df = df.drop_duplicates(subset=['TI', 'AB'])

# Row-wise keyword check; only the rows flagged True are kept.
keep_mask = df.apply(verifica_texto, axis=1)
df_filtrado = df[keep_mask]

# NOTE(review): both writes reuse the paths of the unfiltered export, so the
# unfiltered table on disk is replaced by the filtered one. Re-running this
# cell with different keywords therefore filters already-filtered data.
df_filtrado.to_csv(csv_gz_output_path, index=False, compression='gzip')
df_filtrado.to_excel(excel_output_path, index=False)

total_filtrados = len(df_filtrado)
print(f"Total de artigos filtrados: {total_filtrados}")


Total de artigos filtrados: 356


In [None]:
# writes a .txt file with the search and filtering information

output_txt_path = os.path.join(save_dir, 'informacoes.txt')

# The empty first and last items reproduce the leading blank line and the
# final newline of the report.
content = "\n".join([
    "",
    f"URL: {url}",
    f"Number of results: {num}",
    f"Palavras OR 1: {', '.join(palavras_or_1)}",
    f"Palavras OR 2: {', '.join(palavras_or_2)}",
    f"Total de artigos filtrados: {len(df_filtrado)}",
    "",
])

with open(output_txt_path, 'w', encoding='utf-8') as file:
    file.write(content)

print("Arquivo de informações da busca e filtragem salvo!")