This step is taken after the occurrence.txt file has been extracted from the dwca zip file, and saved as a reduced CSV file.

This notebook loads the occurrence CSV file, splits it by "acceptedTaxonKey", and saves each group as a separate CSV file.

In [1]:
import os
import pandas as pd
import numpy as np
import concurrent.futures
import logging
import sys

# Configure logging: messages at INFO level and above go to split_log.log
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] - %(message)s',
    filename='split_log.log',
)

In [2]:
# Parent directory of the current working directory.
home_dir = os.path.dirname(os.getcwd())

# Pick the machine-specific data root based on the operating system.
if sys.platform.startswith("linux"):
    data_dir = "/bask/projects/v/vjgo8416-amber/data/gbif-species-trainer-AMI-fork/"
elif sys.platform == "darwin":
    data_dir = "/Users/lbokeria/Documents/projects/gbif-species-trainer-data/"
else:
    # Fail fast with a clear message: without a data_dir every path below
    # would be undefined and the cell would die with a NameError instead.
    raise RuntimeError(f"Not linux or mac! Unsupported platform: {sys.platform}")

# Folder (inside data_dir) that receives one CSV per acceptedTaxonKey.
save_folder = "occurrence_dataframes"
save_dir = os.path.join(data_dir, save_folder)

# Create save_dir if needed; exist_ok makes this safe to re-run and avoids
# the gap between a separate existence check and the creation call.
os.makedirs(save_dir, exist_ok=True)

# Name of the dataset; used to build the input file name.
filename = "lepidoptera"

# Full path of the reduced occurrence CSV that the next cell loads.
occ_df_path = os.path.join(data_dir, "dwca_files", "occurrence_" + filename + ".csv")

In [3]:
# Options forwarded to pandas.read_csv.
# on_bad_lines="skip" makes the parser drop malformed lines instead of raising.
kwargs = {
    'parse_dates': True,
    'on_bad_lines': "skip",
}

# Load the whole occurrence table into memory.
occ_df = pd.read_csv(occ_df_path, **kwargs)

  occ_df = pd.read_csv(occ_df_path, **kwargs)


In [4]:
# Select only numeric acceptedTaxonKey rows
def is_number(x):
    """Return True if ``x`` is, or parses as, a finite number.

    Parameters
    ----------
    x : object
        A single cell value (int, float, str, None, ...).

    Returns
    -------
    bool
        True when ``float(x)`` succeeds and the result is finite.
        False for missing values, non-numeric strings, unconvertible types,
        and non-finite values such as ``"nan"``, ``"inf"`` or ``float("inf")``.
    """
    try:
        # Missing values (None, NaN, pd.NA, ...) are never numbers.
        if pd.isna(x):
            return False
        # float() accepts ints, floats and numeric strings. It also accepts
        # "nan" / "inf", which cannot be turned into an integer key later,
        # so only finite results count as numbers.
        return bool(np.isfinite(float(x)))
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric string (or an ambiguous array truth value).
        # TypeError: type that float() cannot convert.
        # OverflowError: integer too large to be represented as a float.
        return False

In [5]:
def custom_type(x):
    """Return a short label describing the kind of value ``x`` is.

    Missing values (as judged by ``pd.isna``) are labelled ``'missing'``,
    booleans ``'bool'``, and anything else gets the name of its Python type.
    """
    # Guard clause: missing values are reported first, whatever their type.
    if pd.isna(x):
        return 'missing'
    return 'bool' if isinstance(x, bool) else type(x).__name__

In [6]:
# Tally the kinds of values found in acceptedTaxonKey before any cleaning.
type_counts_custom_pre = (
    occ_df["acceptedTaxonKey"]
    .apply(custom_type)
    .value_counts()
)
print(type_counts_custom_pre)

acceptedTaxonKey
int        70060192
float      15957971
str          163836
missing          49
Name: count, dtype: int64


In [7]:
# Flag the rows whose acceptedTaxonKey can be read as a number ...
mask = occ_df["acceptedTaxonKey"].apply(is_number)

# ... and keep only those rows, as an independent copy of the data.
occ_df = occ_df.loc[mask].copy()

In [8]:
# Tally the value kinds again after dropping the non-numeric rows.
type_counts_custom_post = (
    occ_df["acceptedTaxonKey"]
    .apply(custom_type)
    .value_counts()
)
print(type_counts_custom_post)

acceptedTaxonKey
int      70060192
float    15957971
str        163831
Name: count, dtype: int64


In [9]:
# Convert all strings to floats
# Helper that turns numeric strings into floats and leaves everything else alone
def convert_str_to_float(x):
    """Return ``float(x)`` when ``x`` is a string that parses as a float.

    Non-string values are returned unchanged, and so are strings that
    ``float()`` rejects.
    """
    # Anything that is not a string passes straight through.
    if not isinstance(x, str):
        return x
    try:
        return float(x)
    except ValueError:
        # Not a numeric string: hand back the original text.
        return x

# Apply the converter to every value of the acceptedTaxonKey column
occ_df["acceptedTaxonKey"] = (
    occ_df["acceptedTaxonKey"].apply(convert_str_to_float)
)

# Show type counts after the conversion
(
    occ_df["acceptedTaxonKey"]
    .apply(custom_type)
    .value_counts()
)

acceptedTaxonKey
float    86181994
Name: count, dtype: int64

In [10]:
# Now check if any floats have non-0 fractions.
# float.is_integer() is used rather than comparing against int(x): int(x)
# raises on NaN and on infinity, whereas is_integer() simply reports such
# values as "not a whole number", so they are listed here too.
float_rows_with_fraction = occ_df[
    occ_df['acceptedTaxonKey'].apply(
        lambda x: isinstance(x, float) and not x.is_integer()
    )
]

In [11]:
# Remove the rows with non-0 fraction.
# float.is_integer() is used rather than comparing against int(x): int(x)
# raises on NaN and on infinity, whereas is_integer() returns False for them,
# so such rows are dropped here instead of crashing the cell.
mask = ~occ_df["acceptedTaxonKey"].apply(
    lambda x: isinstance(x, float) and not x.is_integer()
)

occ_df = occ_df[mask]

In [12]:
# Convert everything to integer.
# This relies on the preceding cells having left only whole-number values in
# the column; the cast is done in place on the existing frame.
occ_df["acceptedTaxonKey"] = occ_df["acceptedTaxonKey"].astype(int)

In [13]:
# Final check of the value kinds, after the integer cast
type_counts_custom_post2 = (
    occ_df["acceptedTaxonKey"]
    .apply(custom_type)
    .value_counts()
)
print(type_counts_custom_post2)

acceptedTaxonKey
int    86181993
Name: count, dtype: int64


In [14]:
# Group the occurrence rows by acceptedTaxonKey, as a first step towards
# saving each key's rows as its own dataframe.
groups = occ_df.groupby("acceptedTaxonKey")

In [15]:
# Number of distinct acceptedTaxonKey groups
groups.ngroups

272182

In [16]:
def save_group(group):
    """Write one group to ``<save_dir>/<group name>.csv`` and log the outcome.

    Parameters
    ----------
    group : tuple
        A ``(group_name, group_df)`` pair, as yielded when iterating over a
        pandas groupby object.

    Any exception raised while saving is caught and written to the log as an
    error, so one failed group does not interrupt the others.
    """
    key, frame = group
    csv_name = f"{key}.csv"

    try:
        target = os.path.join(save_dir, csv_name)
        frame.to_csv(target, index=False)
        logging.info(f"Saved {csv_name} with {len(frame)} rows")
    except Exception as exc:
        logging.error(f"Couldn't save {csv_name}: {str(exc)}")

# Materialise every (acceptedTaxonKey, sub-dataframe) pair into a list
groups = [key_and_frame for key_and_frame in occ_df.groupby("acceptedTaxonKey")]

In [17]:
# Save the groups using a pool of worker threads.
# Leaving the with-block waits for all submitted saves to finish.
with concurrent.futures.ThreadPoolExecutor() as pool:
    for group in groups:
        pool.submit(save_group, group)