Attempt to load the multimedia dataframe (`multimedia.txt`) and make it smaller by retaining only the necessary columns.

In [1]:
import os
import pandas as pd
import sys
from collections import Counter

from dwca.read import DwCAReader
from dwca.descriptors import shorten_term
from dwca.darwincore.utils import qualname as qn

In [2]:
# Parent of the current working directory (presumably the repository root — verify).
home_dir = os.path.dirname(os.getcwd())

# Root folder holding the downloaded Darwin Core Archive data.
# Set the GBIF_DATA_DIR environment variable to run the notebook against a copy
# stored elsewhere; when it is unset or empty, the original location is used.
data_dir = (
    os.environ.get("GBIF_DATA_DIR")
    or "/Users/lbokeria/Documents/projects/gbif-species-trainer-data/"
)

# Names of the two archives this notebook works with.
family_name_1 = "Sesiidae"
family_name_2 = "lepidoptera"

# Every input lives under "<data_dir>/dwca_files".
dwca_dir = os.path.join(data_dir, "dwca_files")

# Zipped archives, one per name above.
dwca_file_path_1     = os.path.join(dwca_dir, family_name_1 + ".zip")
dwca_file_path_2     = os.path.join(dwca_dir, family_name_2 + ".zip")
# multimedia.txt inside a folder named after the second archive
# (presumably the unzipped archive — verify).
multimedia_file_path = os.path.join(dwca_dir, family_name_2, "multimedia.txt")

In [3]:
# Columns retained when the multimedia file is read (passed to read_csv as `usecols`).
fields_to_keep = ["coreid", "identifier"]

In [4]:
# Fetch the "multimedia.txt" descriptor from the first archive. The load cell
# below reads the field delimiter, the number of lines to ignore, the short
# column headers and any per-field defaults from it when parsing the
# multimedia file of the second archive.
# NOTE(review): this assumes both archives describe multimedia.txt identically — confirm.
with DwCAReader(dwca_file_path_1) as dwca:
    datafile_descriptor = dwca.get_descriptor_for("multimedia.txt")

In [7]:
# Parse the larger multimedia file directly with pandas, restricted to the
# columns listed in `fields_to_keep`. The parsing options come from the
# archive's data-file descriptor.
kwargs = {
    'delimiter': datafile_descriptor.fields_terminated_by,
    'skiprows': datafile_descriptor.lines_to_ignore,
    'header': None,  # column names are supplied through `names` instead
    'names': datafile_descriptor.short_headers,
    'parse_dates': True,
    'on_bad_lines': "skip",  # malformed rows are dropped rather than raising
    'usecols': fields_to_keep,
}

media_df = pd.read_csv(multimedia_file_path, **kwargs)

# Fields for which the descriptor declares a default value become constant
# columns, named after the shortened term.
for field in datafile_descriptor.fields:
    default_value = field['default']
    if default_value is None:
        continue
    media_df[shorten_term(field['term'])] = default_value

In [8]:
# Report the size of the loaded frame as given by sys.getsizeof,
# converted from bytes to MiB (nothing is written to disk here).
print(sys.getsizeof(media_df) / (1024 * 1024))

2366.7391481399536


In [9]:
# Summarise the frame: index range, per-column dtypes and pandas' memory-usage estimate.
media_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18903772 entries, 0 to 18903771
Data columns (total 2 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   coreid      int64 
 1   identifier  object
dtypes: int64(1), object(1)
memory usage: 288.4+ MB


In [10]:
# Write the trimmed multimedia table to "<data_dir>/dwca_files" as CSV,
# leaving out the dataframe index.
media_df.to_csv(
    os.path.join(data_dir, "dwca_files", f"multimedia_{family_name_2}.csv"),
    index=False,
)