In [4]:
import requests
import json
from tqdm import tqdm
import os  # Import os module for directory management
import zipfile
import csv
import sys

In [5]:
# GBIF literature search endpoint and the base query sent with every page request
api_url = "https://api.gbif.org/v1/literature/search"
params = dict(
    contentType="literature",
    literatureType=["journal", "working_paper"],
    relevance="GBIF_USED",
    peerReview="true",
    limit=10,   # page size
    offset=0,   # paging cursor; starts at the first result
)

In [6]:
# Function to get data from the API
def fetch_data(params, timeout=30):
    """Request one page from `api_url` and return the decoded JSON body.

    Args:
        params: Query parameters passed to `requests.get`.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on a stalled connection.

    Returns:
        The parsed JSON response on HTTP 200, otherwise None. Transport
        errors and undecodable bodies are reported the same way as a non-200
        status (message printed, None returned) so callers have a single
        failure signal to check.
    """
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers a 200 response whose body is not valid JSON.
        print(f"Failed to fetch data: {exc}")
        return None

    print(f"Failed to fetch data: {response.status_code}")
    return None

In [7]:
# Function to extract all entries and filter those with content in gbifDownloadKey
def extract_filtered_entries():
    """Page through the literature search and keep entries that carry a gbifDownloadKey.

    Paging uses a private copy of the module-level `params`, so the shared
    query dict is left untouched and the function can be re-run safely.
    The first response is used both to read the total `count` and as the
    first page of results, instead of being requested twice.

    Returns:
        A list of result dicts whose `gbifDownloadKey` value is truthy.
        Returns an empty list when the first request fails or has no
        `count`. If a later page fails, the entries collected so far are
        returned and a message reports the offset at which paging stopped.
    """
    page_params = dict(params, offset=0)  # shallow copy; only 'offset' is modified
    page_size = page_params['limit']

    data = fetch_data(page_params)
    if not data or 'count' not in data:
        print("Failed to fetch initial data or count not available.")
        return []

    total_results = data['count']
    print(f"Total results to fetch: {total_results}")

    all_entries = []
    with tqdm(total=total_results, desc="Fetching entries") as pbar:
        while True:
            if not data or 'results' not in data:
                # fetch_data already printed the cause; make the truncation explicit.
                print(f"Stopped early at offset {page_params['offset']}; returning partial results.")
                break

            results = data['results']
            # Filter entries that have content in gbifDownloadKey
            all_entries.extend(entry for entry in results if entry.get('gbifDownloadKey'))
            pbar.update(len(results))

            if len(results) < page_size:
                # A short page means there is no more data to fetch
                break

            # Move to the next page
            page_params['offset'] += page_size
            data = fetch_data(page_params)

    return all_entries

In [8]:
# Run the paged search; only entries that reference a GBIF download are kept
filtered_entries = extract_filtered_entries()

# Keep a JSON copy of the filtered entries on disk
with open('filtered_gbif_entries.json', mode='w') as f:
    f.write(json.dumps(filtered_entries, indent=2))

# Report how many entries survived the filter
print(f"Total filtered entries fetched: {len(filtered_entries)}")

Total results to fetch: 10585


Fetching entries: 100%|██████████| 10585/10585 [04:38<00:00, 38.05it/s]


Total filtered entries fetched: 3572


### Summary:
- **Increase Field Size Limit**: The script sets the field size limit for CSV processing to 1,000,000 characters to handle large fields.
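- **Load and Save Processed DOIs and Download Keys**: Helper functions that read and append to two tracking files, one for processed DOIs and one for downloaded GBIF download keys, so that a rerun skips entries that were already handled.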
- **Download and Process Data**: The main function to download, unzip, process, and filter data, ensuring only preserved specimens are kept, and appending results to an output file on the D drive.
- **Directory Checks**: Ensures necessary directories exist before writing files.

In [43]:
# Raise the csv module's per-field size cap to one million characters
csv.field_size_limit(1_000_000)

# Function to load processed DOIs from skip file
def load_processed_dois(skip_file):
    """Read the skip file and return the set of DOIs recorded in it.

    Args:
        skip_file: Path of a UTF-8 text file holding one DOI per line.

    Returns:
        A set of whitespace-stripped DOIs, or an empty set when the file
        does not exist. Blank lines are ignored: an empty-string member
        would otherwise match every entry whose DOI is missing.
    """
    print(f"Loading processed DOIs from {skip_file}")
    if not os.path.exists(skip_file):
        return set()
    with open(skip_file, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return {doi for doi in stripped_lines if doi}

# Function to save a DOI to the skip file
def save_processed_doi(skip_file, doi):
    """Append `doi` as a new line at the end of `skip_file` (UTF-8)."""
    print(f"Saving DOI {doi} to {skip_file}")
    record = doi + '\n'
    with open(skip_file, mode='a', encoding='utf-8') as handle:
        handle.write(record)

# Function to load downloaded keys from a file
def load_downloaded_keys(downloaded_keys_file):
    """Read the tracking file and return the set of download keys recorded in it.

    Args:
        downloaded_keys_file: Path of a UTF-8 text file holding one key per line.

    Returns:
        A set of whitespace-stripped keys, or an empty set when the file
        does not exist. Blank lines are ignored so that an empty string is
        never treated as a recorded key.
    """
    print(f"Loading downloaded keys from {downloaded_keys_file}")
    if not os.path.exists(downloaded_keys_file):
        return set()
    with open(downloaded_keys_file, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        return {key for key in stripped_lines if key}

# Function to save a downloaded key to a file
def save_downloaded_key(downloaded_keys_file, key):
    """Append `key` as a new line at the end of `downloaded_keys_file` (UTF-8)."""
    print(f"Saving downloaded key {key} to {downloaded_keys_file}")
    record = key + '\n'
    with open(downloaded_keys_file, mode='a', encoding='utf-8') as handle:
        handle.write(record)

# Helper: make sure the directory that will hold `path` exists
def _ensure_parent_dir(path):
    """Create the parent directory of `path` when the path has one."""
    parent = os.path.dirname(path)
    if parent:  # a bare filename has no parent, and os.makedirs('') raises
        os.makedirs(parent, exist_ok=True)


# Helper: record an error both in the error log and on stdout
def _log_error(error_file, message):
    """Write `message` to the open error log, flush it, and print it."""
    error_file.write(message + '\n')
    error_file.flush()  # keep the log useful even if the run is interrupted
    print(message)


# Helper: stream one archive to disk
def _download_archive(download_url, file_path, timeout):
    """Download `download_url` to `file_path` and return the HTTP status code.

    The body is streamed into `file_path + ".part"` and renamed only once
    the transfer has finished, so an interrupted download never leaves a
    truncated file under the final name. Nothing is written for a non-200
    status. Exceptions raised by `requests` propagate to the caller.
    """
    part_path = file_path + ".part"
    with requests.get(download_url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return response.status_code
        try:
            with open(part_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        except BaseException:
            # Do not leave a half-written file behind, then let the error surface.
            if os.path.exists(part_path):
                os.remove(part_path)
            raise
    os.replace(part_path, file_path)
    return 200


# Helper: pull the preserved-specimen rows out of one downloaded archive
def _write_preserved_specimens(file_path, download_dir, key, doi, writer):
    """Append the preserved-specimen rows of the archive `file_path` to `writer`.

    Only the occurrence table is extracted: `occurrence.txt` when present,
    otherwise `<key>.csv`. The extracted file is removed again even when
    reading it fails. Column names are matched case-insensitively, because
    the table header may be spelled `basisOfRecord` or `basisofrecord`.

    Returns:
        The number of rows written, or None when the archive contains
        neither of the expected occurrence files.

    Raises:
        zipfile.BadZipFile: If `file_path` is not a valid zip archive.
        KeyError: If a preserved-specimen row is found but the table has no
            gbifID, year or countryCode column under any capitalisation.
    """
    print(f"Unzipping {file_path}")
    with zipfile.ZipFile(file_path, 'r') as zip_ref:
        members = zip_ref.namelist()
        print(f"Archive members: {members}")

        # Check for occurrence.txt (Darwin Core archive) or single CSV file
        csv_file_name = f"{key}.csv"
        if 'occurrence.txt' in members:
            member = 'occurrence.txt'
        elif csv_file_name in members:
            member = csv_file_name
        else:
            return None

        # Extract just the table that is read, not every member of the archive.
        occurrence_file_path = zip_ref.extract(member, download_dir)

    rows_written = 0
    try:
        print(f"Processing {occurrence_file_path}")
        with open(occurrence_file_path, newline='', encoding='utf-8') as occurrence_file:
            # NOTE(review): QUOTE_NONE assumes GBIF occurrence tables are plain
            # tab-delimited text without quoting, so a stray '"' must not be
            # allowed to swallow the following lines. Confirm against the GBIF
            # download-format documentation.
            reader = csv.DictReader(occurrence_file, delimiter='\t', quoting=csv.QUOTE_NONE)

            # Map lower-cased header names to the spelling used in this file.
            header_lookup = {}
            for name in reader.fieldnames or []:
                if name:
                    header_lookup.setdefault(name.lower(), name)

            basis_column = header_lookup.get('basisofrecord')
            output_columns = None
            if basis_column is not None:
                for row in reader:
                    # Short rows yield None for missing fields; treat that as empty.
                    basis_of_record = (row.get(basis_column) or '').lower()
                    if basis_of_record != 'preserved_specimen':
                        continue
                    if output_columns is None:
                        # Resolved on the first match so that a table without any
                        # preserved specimens never fails on a missing column.
                        output_columns = {}
                        for wanted in ('gbifID', 'year', 'countryCode'):
                            if wanted.lower() not in header_lookup:
                                raise KeyError(wanted)
                            output_columns[wanted] = header_lookup[wanted.lower()]
                    writer.writerow({
                        'gbifID': row[output_columns['gbifID']],
                        'year': row[output_columns['year']],
                        'countryCode': row[output_columns['countryCode']],
                        'gbifDownloadKey': key,
                        'doi': doi
                    })
                    rows_written += 1
        print(f"Processed {occurrence_file_path}")
    finally:
        os.remove(occurrence_file_path)
        print(f"Deleted extracted file {occurrence_file_path}")
    return rows_written


# Function to download, unzip, process data using gbifDownloadKey, and delete zip files and extracted contents
def download_and_process_gbif_data(filtered_entries, skip_file, downloaded_keys_file,
                                   download_dir="D:/gbif_downloads",
                                   error_log="D:/gbif_errors/error_log.txt",
                                   output_file="D:/gbif_outputs/output_data.csv",
                                   timeout=60):
    """Download the GBIF archive cited by each entry and append its preserved-specimen rows to a CSV.

    For every entry the function skips DOIs listed in `skip_file` and keys
    listed in `downloaded_keys_file`, downloads `<key>.zip`, records the key,
    writes the matching occurrence rows to `output_file`, deletes the zip and
    finally records the DOI. Failures are written to `error_log` and printed;
    processing then continues with the next entry.

    Args:
        filtered_entries: Iterable of literature entries (dicts) with
            `identifiers` and `gbifDownloadKey` fields.
        skip_file: Text file of already processed DOIs (read and appended to).
        downloaded_keys_file: Text file of already downloaded keys (read and
            appended to).
        download_dir: Directory used for zip files and extracted tables.
        error_log: Path of the error log; it is truncated at the start of
            each call.
        output_file: CSV that rows are appended to; a header is written when
            the file is missing or empty.
        timeout: Timeout in seconds passed to `requests.get` for each download.

    Returns:
        None.
    """
    base_url = "https://api.gbif.org/v1/occurrence/download/request/"

    # Ensure the directories exist
    os.makedirs(download_dir, exist_ok=True)
    for path in (error_log, output_file, skip_file, downloaded_keys_file):
        _ensure_parent_dir(path)

    # Load processed DOIs
    processed_dois = load_processed_dois(skip_file)
    print(f"Loaded {len(processed_dois)} processed DOIs")

    # Load downloaded keys
    downloaded_keys = load_downloaded_keys(downloaded_keys_file)
    print(f"Loaded {len(downloaded_keys)} downloaded keys")

    # A header is needed for a new file and for one left empty by an earlier run
    write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

    # Open the output CSV file in append mode
    with open(output_file, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['gbifID', 'year', 'countryCode', 'gbifDownloadKey', 'doi']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write the header if the file did not exist before
        if write_header:
            writer.writeheader()
            print(f"Wrote header to {output_file}")

        with open(error_log, 'w', encoding='utf-8') as error_file:
            for entry in tqdm(filtered_entries, desc="Downloading and processing GBIF data"):
                # Reset per entry so the handlers below never report a stale or unbound key.
                key = None
                try:
                    identifiers = entry.get('identifiers') or {}
                    doi = identifiers.get('doi') or ''
                    # An empty DOI identifies nothing, so it is never used as a skip key.
                    if doi and doi in processed_dois:
                        print(f"Skipping already processed DOI: {doi}")
                        continue

                    download_keys = entry.get('gbifDownloadKey') or []
                    if not download_keys:
                        _log_error(error_file, f"No gbifDownloadKey for DOI {doi}")
                        continue
                    # NOTE(review): only the first key is used; entries citing several
                    # downloads have the remaining keys ignored. Confirm this is intended.
                    key = download_keys[0]
                    if key in downloaded_keys:
                        print(f"Skipping already downloaded key: {key}")
                        continue

                    file_path = os.path.join(download_dir, f"{key}.zip")

                    # Check if file already exists
                    # NOTE(review): an existing zip is skipped entirely, not just its
                    # download, so it is never processed. Confirm this is intended.
                    if os.path.exists(file_path):
                        print(f"File {file_path} already exists. Skipping download.")
                        continue

                    # Download the zip file
                    download_url = f"{base_url}{key}.zip"
                    print(f"Downloading {download_url}")
                    status_code = _download_archive(download_url, file_path, timeout)
                    if status_code == 404:
                        _log_error(error_file, f"Failed to download data for key {key}: 404 Not Found")
                        continue
                    if status_code != 200:
                        _log_error(error_file, f"Failed to download data for key {key}: {status_code}")
                        continue
                    print(f"Downloaded {file_path}")
                    # Save the downloaded key
                    save_downloaded_key(downloaded_keys_file, key)
                    downloaded_keys.add(key)

                    # Unzip the downloaded file and extract required information
                    try:
                        rows_written = _write_preserved_specimens(file_path, download_dir, key, doi, writer)
                        if rows_written is None:
                            _log_error(error_file, f"No occurrence file found in {file_path}")

                        # Make sure the rows are on disk before the DOI is marked as done.
                        csvfile.flush()

                        os.remove(file_path)
                        print(f"Deleted {file_path}")

                        # Save the DOI to the skip file
                        if doi:
                            save_processed_doi(skip_file, doi)
                            processed_dois.add(doi)
                            print(f"Saved DOI {doi} to skip file")
                    except zipfile.BadZipFile:
                        _log_error(error_file, f"Bad zip file {file_path}")
                    except Exception as e:
                        _log_error(error_file, f"Failed to process file {file_path}: {str(e)}")
                except requests.exceptions.RequestException as e:
                    _log_error(error_file, f"Request error for key {key}: {str(e)}")
                except Exception as e:
                    _log_error(error_file, f"Unexpected error for key {key}: {str(e)}")

In [None]:
# Tracking files on the D drive: one for processed DOIs, one for downloaded keys
skip_file = "D:/gbif_skip_files/processed_dois.txt"
downloaded_keys_file = "D:/gbif_skip_files/downloaded_keys.txt"

# Process every filtered entry, using the two tracking files to skip earlier work
download_and_process_gbif_data(
    filtered_entries,
    skip_file=skip_file,
    downloaded_keys_file=downloaded_keys_file,
)

Loading processed DOIs from D:/gbif_skip_files/processed_dois.txt
Loaded 186 processed DOIs
Loading downloaded keys from D:/gbif_skip_files/downloaded_keys.txt
Loaded 152 downloaded keys


Downloading and processing GBIF data:   0%|          | 0/3572 [00:00<?, ?it/s]

Skipping already processed DOI: 10.15666/aeer/2202_18851902
Skipping already processed DOI: 10.1002/ece3.11230
Skipping already processed DOI: 10.1038/s41598-024-59947-y
Skipping already downloaded key: 0019000-220831081235567
Skipping already processed DOI: 10.1016/j.tfp.2024.100559
Skipping already processed DOI: 10.1007/s10530-024-03313-6
Skipping already processed DOI: 10.32383/appdr/185727
Skipping already processed DOI: 10.3390/fishes9040148
Skipping already processed DOI: 10.1007/s10681-024-03317-2
Skipping already processed DOI: 10.1007/s10750-024-05554-x
Skipping already processed DOI: 10.1016/j.ecoinf.2024.102604
Skipping already processed DOI: 10.1093/jee/toae013
Skipping already processed DOI: 10.3897/zookeys.1196.116144
Skipping already processed DOI: 10.37828/em.2024.72.20
Skipping already processed DOI: 10.7494/geom.2024.18.3.45
Skipping already processed DOI: 10.1007/s10113-024-02222-7
Skipping already processed DOI: 10.13057/biodiv/d250328
Skipping already processed DO

Downloading and processing GBIF data:   1%|          | 22/3572 [00:00<00:57, 61.55it/s]

Failed to download data for key 0014654-230224095556074: 404 Not Found
Skipping already processed DOI: 10.1002/tax.13173
Skipping already downloaded key: 0202277-220831081235567
Skipping already processed DOI: 10.1002/ecs2.4837
Skipping already processed DOI: 10.1002/ps.8128
Skipping already processed DOI: 10.15560/20.2.536
Skipping already processed DOI: 10.1111/eff.12784
Skipping already downloaded key: 0001340-160118175350007
Skipping already processed DOI: 10.1111/gcb.17282
Skipping already processed DOI: 10.1093/botlinnean/boae019
Skipping already processed DOI: 10.1007/s10340-024-01767-0
Skipping already processed DOI: 10.1038/s41558-024-01966-8
Skipping already processed DOI: 10.1002/ecs2.4830
Skipping already processed DOI: 10.1111/geb.13847
Skipping already processed DOI: 10.26577/eb.2024.v98.i1.010
Skipping already processed DOI: 10.1038/s41467-024-46818-3
Skipping already processed DOI: 10.3389/fevo.2024.1346795
Skipping already processed DOI: 10.3390/d16040223
Skipping alre

Downloading and processing GBIF data:   3%|▎         | 91/3572 [00:00<00:23, 146.19it/s]

Failed to download data for key 0294639-200613084148143: 404 Not Found
Skipping already processed DOI: 10.1051/alr/2024002
Skipping already processed DOI: 10.1007/s10530-024-03283-9
Skipping already downloaded key: 0046560-210914110416597
Skipping already processed DOI: 10.1016/j.revpalbo.2024.105096
Skipping already processed DOI: 10.3390/biology13030198
Skipping already processed DOI: 10.1093/jxb/erae126
Skipping already processed DOI: 10.3897/bdj.12.e120670
Skipping already processed DOI: 10.1093/botlinnean/boae016
Skipping already processed DOI: 10.1007/s10661-024-12543-z
Skipping already processed DOI: 10.1111/ecog.06697
Skipping already processed DOI: 10.1016/j.vetpar.2024.110172
Skipping already processed DOI: 10.22271/letters.2024.v4.i1b.85
Skipping already processed DOI: 10.1111/njb.04266
Skipping already processed DOI: 10.3390/su16051929
Skipping already processed DOI: 10.1007/s11258-024-01408-7
Skipping already processed DOI: 10.1002/ece3.11132
Skipping already processed DOI

Downloading and processing GBIF data:   5%|▍         | 169/3572 [00:01<00:18, 185.73it/s]

Failed to download data for key 0000129-150523225239109: 404 Not Found
Skipping already processed DOI: 10.1038/s41598-024-54735-0
Skipping already downloaded key: 0014439-240202131308920
Skipping already processed DOI: 10.5751/es-14793-290121
Skipping already processed DOI: 10.1111/eff.12771
Skipping already processed DOI: 10.1007/s10530-024-03270-0
Skipping already processed DOI: 10.1016/j.marpolbul.2024.116162
Skipping already processed DOI: 10.12976/jib/2024.46.1.2
Skipping already processed DOI: 10.12976/jib/2024.46.1.1
Skipping already downloaded key: 0178283-220831081235567
Skipping already downloaded key: 0008475-230530130749713
Skipping already downloaded key: 0015937-231120084113126
Skipping already processed DOI: 10.1016/j.gecco.2024.e02861
Skipping already processed DOI: 10.5597/lajam00321
Skipping already downloaded key: 0259755-220831081235567
Skipping already processed DOI: 10.1111/gcb.17205
Skipping already processed DOI: 10.1111/ele.14391
Skipping already processed DOI:

Downloading and processing GBIF data:   6%|▌         | 221/3572 [00:01<00:18, 179.27it/s]

Downloaded D:/gbif_downloads\0072734-210914110416597.zip
Saving downloaded key 0072734-210914110416597 to D:/gbif_skip_files/downloaded_keys.txt
Unzipping D:/gbif_downloads\0072734-210914110416597.zip
Extracted files: ['0072734-210914110416597.csv']
Processing D:/gbif_downloads\0072734-210914110416597.csv
Processed D:/gbif_downloads\0072734-210914110416597.csv
Deleted extracted file D:/gbif_downloads\0072734-210914110416597.csv
Deleted D:/gbif_downloads\0072734-210914110416597.zip
Saving DOI 10.1007/s42965-024-00329-w to D:/gbif_skip_files/processed_dois.txt
Saved DOI 10.1007/s42965-024-00329-w to skip file
Downloading https://api.gbif.org/v1/occurrence/download/request/0016949-231002084531237.zip


Downloading and processing GBIF data:   6%|▌         | 221/3572 [00:13<00:18, 179.27it/s]