In [1]:
!pip install bibtexparser



In [2]:
import os
import re

import bibtexparser
import pandas as pd

In [3]:
# Download .bib file
!wget https://raw.githubusercontent.com/5H5KN5/SIT723/main/Impact/Bib/savedrecs.bib

--2024-01-25 23:25:29--  https://raw.githubusercontent.com/5H5KN5/SIT723/main/Impact/Bib/savedrecs.bib
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3988510 (3.8M) [text/plain]
Saving to: ‘savedrecs.bib.1’


2024-01-25 23:25:29 (50.8 MB/s) - ‘savedrecs.bib.1’ saved [3988510/3988510]



In [4]:
def extract_and_format_doi_csv(csv_file_path):
    """Print the unique DOIs of a CSV file as a ``DO=(...) OR DO=(...)`` string.

    Reads the ``DOI`` column of the CSV at ``csv_file_path``, drops missing and
    blank values, strips surrounding whitespace, removes duplicates (keeping
    the first occurrence), and prints the number of DOIs followed by the
    formatted string.

    Requires pandas to be available as ``pd`` in the enclosing namespace.

    Args:
        csv_file_path: Path of the CSV file to read.

    Returns:
        None. All results and problems (missing file, missing ``DOI`` column,
        read errors) are reported with ``print``; no exception is propagated.
    """
    if not os.path.exists(csv_file_path):
        print(f"File not found: {csv_file_path}")
        return

    try:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(csv_file_path)

        # Check if 'DOI' column exists
        if 'DOI' not in df.columns:
            print("Column 'DOI' not found in the file.")
            return

        # Normalise before de-duplicating so ' 10.1/x ' and '10.1/x' count once.
        doi_values = df['DOI'].dropna().astype(str).str.strip()
        dois = doi_values[doi_values != ''].unique()
        count = len(dois)
        formatted_dois = ' OR '.join(f"DO=({doi})" for doi in dois)

        # Print the results
        print(f"\nCount of extracted DOIs for {csv_file_path}: {count}")
        print("Formatted DOIs:" if formatted_dois else "No DOIs found or an error occurred.")
        print(formatted_dois)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the cell continue.
        print(f"An error occurred: {e}")


def extract_and_format_doi_bib(bib_file_path):
    """Print the unique DOIs of a BibTeX file as a ``DO=(...) OR DO=(...)`` string.

    Scans the text of ``bib_file_path`` for ``doi = {...}`` fields
    (case-insensitive field name), strips surrounding whitespace from each
    value, drops blank values and duplicates (keeping first-seen order, in
    line with the CSV variant's de-duplication), and prints the number of
    DOIs followed by the formatted string.

    Args:
        bib_file_path: Path of the BibTeX file to scan; read as UTF-8.

    Returns:
        None. Results and problems (missing file, read/decoding errors) are
        reported with ``print``; no exception is propagated.
    """
    # Regular expression pattern for matching DOIs
    doi_pattern = r'doi\s*=\s*{(.*?)}'

    if not os.path.exists(bib_file_path):
        print(f"File not found: {bib_file_path}")
        return

    try:
        # Read everything first so the file is closed before any processing.
        with open(bib_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        raw_dois = re.findall(doi_pattern, content, re.IGNORECASE)
        stripped_dois = (doi.strip() for doi in raw_dois)
        # dict.fromkeys removes repeats while preserving first-seen order.
        dois = list(dict.fromkeys(doi for doi in stripped_dois if doi))
        count = len(dois)
        formatted_dois = ' OR '.join(f"DO=({doi})" for doi in dois)

        # Print the results
        print(f"\nCount of extracted DOIs for {bib_file_path}: {count}")
        print("Formatted DOIs:" if formatted_dois else "No DOIs found or an error occurred.")
        print(formatted_dois)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the cell continue.
        print(f"An error occurred: {e}")

def split_bibtex_by_journal(input_file, journals=None):
    """Split a BibTeX file into one ``.bib`` file per journal.

    The file is cut at every ``@article{`` marker. Each resulting entry is
    assigned to a journal by comparing the value of its ``journal = {...}``
    field with the journal names (case-insensitive, whitespace-normalised,
    exact match). Matching the field rather than searching the whole entry
    keeps "HEALTH PSYCHOLOGY REVIEW" articles out of "HEALTH PSYCHOLOGY" and
    ignores journal names that merely appear in abstracts or references.

    One file per journal with at least one entry is written to the current
    working directory, named after the journal with spaces replaced by
    underscores plus ``.bib``.

    Args:
        input_file: Path of the BibTeX file to split; read as UTF-8.
        journals: Optional iterable of journal names to split by. Defaults to
            the built-in list below.

    Returns:
        List of the file names that were written, in the order of
        ``journals``.

    Raises:
        OSError: If ``input_file`` cannot be read or an output file cannot be
            written.
    """
    # Define the journals to split by
    if journals is None:
        journals = [
            "HEALTH PSYCHOLOGY AND BEHAVIORAL MEDICINE",
            "PILOT AND FEASIBILITY STUDIES",
            "HEALTH PSYCHOLOGY REVIEW",
            "PSYCHOLOGY AND HEALTH",
            "HEALTH PSYCHOLOGY",
            "IMPLEMENTATION SCIENCE",
            "HEALTH PSYCHOLOGY RESEARCH",
            "HEALTH PSYCHOLOGY OPEN",
            "HEALTH PSYCHOLOGY REPORT",
            "PSYCHOLOGY HEALTH AND MEDICINE"
        ]
    journals = list(journals)

    def _normalise(name):
        # Drop nested braces, collapse whitespace/line wraps, ignore case.
        return ' '.join(name.replace('{', ' ').replace('}', ' ').split()).upper()

    # Dictionary to hold bibtex entries for each journal
    journal_entries = {journal: [] for journal in journals}
    journal_by_key = {_normalise(journal): journal for journal in journals}

    # The lookbehind rejects suffixed field names such as "Journal-ISO" being
    # matched from the middle; DOTALL lets a value wrap over several lines.
    journal_field = re.compile(r'(?<![\w-])journal\s*=\s*\{(.*?)\}',
                               re.IGNORECASE | re.DOTALL)

    # Read the input file (UTF-8, the encoding used when the split files are
    # read back for DOI extraction)
    with open(input_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into individual entries. The first chunk is whatever
    # precedes the first '@article{' and is not an entry, so it is skipped.
    # NOTE(review): non-article entries stay attached to the preceding article
    # chunk, as before.
    chunks = re.split(r'@article\{', content)[1:]

    for chunk in chunks:
        field = journal_field.search(chunk)
        if field is None:
            continue
        journal = journal_by_key.get(_normalise(field.group(1)))
        if journal is not None:
            # Re-attach the marker consumed by re.split
            journal_entries[journal].append('@article{' + chunk)

    # Write each journal's entries to separate files
    created_files = []
    for journal, matched in journal_entries.items():
        if matched:  # Only write if there are entries for this journal
            filename = journal.replace(' ', '_') + '.bib'
            created_files.append(filename)
            with open(filename, 'w', encoding='utf-8') as file:
                file.write('\n'.join(matched))

    return created_files

In [5]:
# Set Directories
# .csv
# NOTE(review): absolute Google Drive path; presumably only resolves when Drive
# is mounted in the runtime - confirm before relying on the CSV step.
csv_path = '/content/drive/My Drive/Colab Notebooks/SIT723/Data/Articles/included.csv'
# .bib
bib_file_path = 'savedrecs.bib'

# Extracting DOIS
extract_and_format_doi_csv(csv_path)
extract_and_format_doi_bib(bib_file_path)

# Extracting Journal Specific DOIS
# Reuse bib_file_path so the split always reads the same file as the extraction above.
created_files = split_bibtex_by_journal(bib_file_path)

for file_path in created_files:
    extract_and_format_doi_bib(file_path)

File not found: /content/drive/My Drive/Colab Notebooks/SIT723/Data/Articles/included.csv

Count of extracted DOIs for savedrecs.bib: 414
Formatted DOIs:
DO=(10.1080/21642850.2023.2167719) OR DO=(10.1080/21642850.2023.2265142) OR DO=(10.1186/s40814-023-01413-z) OR DO=(10.1186/s40814-023-01403-1) OR DO=(10.1080/08870446.2023.2267610) OR DO=(10.1080/17437199.2023.2261518) OR DO=(10.1080/13548506.2023.2253510) OR DO=(10.1186/s40814-023-01385-0) OR DO=(10.1186/s40814-023-01386-z) OR DO=(10.1186/s40814-023-01380-5) OR DO=(10.1186/s40814-023-01381-4) OR DO=(10.1186/s40814-023-01324-z) OR DO=(10.1080/17437199.2023.2248222) OR DO=(10.1186/s40814-023-01357-4) OR DO=(10.1186/s40814-023-01376-1) OR DO=(10.1037/hea0001318) OR DO=(10.1186/s40814-023-01368-1) OR DO=(10.1186/s40814-023-01350-x) OR DO=(10.1186/s40814-023-01323-0) OR DO=(10.1186/s40814-023-01319-w) OR DO=(10.1186/s40814-023-01333-y) OR DO=(10.1186/s40814-023-01326-x) OR DO=(10.1186/s40814-023-01332-z) OR DO=(10.1186/s40814-023-01303-4)