# Converting LiPD files to a `pandas.DataFrame`

This notebook demonstrates how to convert a collection of LiPD files to a `pandas.DataFrame`.

In [1]:
import pandas as pd
import lipd
import numpy as np
import os

In [None]:
# download and unzip the dataset in LiPD
!wget https://lipdverse.org/CoralHydro2k/current_version/CoralHydro2k0_5_4.zip
!unzip CoralHydro2k0_5_4.zip

Archive:  CoralHydro2k0_5_4.zip
replace AB08MEN01.lpd? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
# load LiPD files from the given directory
# note that this function has some bug on path switching;
# not any arbitray path can work
!cd ./lipdfiles
lipds = lipd.readLipd('.')
# extract timeseries from the list of LiDP objects
ts_list = lipd.extractTs(lipds)

zsh:cd:1: no such file or directory: ./lipdfiles
Disclaimer: LiPD files may be updated and modified to adhere to standards

Found: 179 LiPD file(s)
reading: KA17RYU01.lpd
reading: CH18YOA02.lpd
reading: FL17DTO02.lpd
reading: BO14HTI01.lpd
reading: AB20MEN01.lpd
reading: DR99ABR01.lpd
reading: OS13NLP01.lpd
reading: ZI04IFR01.lpd
reading: CH18YOA01.lpd
reading: FL17DTO01.lpd
reading: SA16CLA01.lpd
reading: AB20MEN03.lpd
reading: TU01DEP01.lpd
reading: BO14HTI02.lpd
reading: AB20MEN02.lpd
reading: CO00MAL01.lpd
reading: SW98STP01.lpd
reading: AB20MEN06.lpd
reading: AB08MEN01.lpd
reading: RE18CAY01.lpd
reading: HE13MIS02.lpd
reading: CA13SAP01.lpd
reading: MO06PED01.lpd
reading: AB20MEN07.lpd
reading: DR00NBB01.lpd
reading: ZI16ROD01.lpd
reading: DR00KSB01.lpd
reading: ZI15MER01.lpd
reading: NU09KIR01.lpd
reading: KU00NIN01.lpd
reading: PF19LAR01.lpd
reading: AB20MEN05.lpd
reading: HE13MIS01.lpd
reading: HE10GUA01.lpd
reading: EV18ROC01.lpd
reading: AB20MEN04.lpd
reading: CA07FLI01.lpd
r

In [7]:
# create a null DataFrame

col_str=[
    'datasetId',
    'dataSetName', 'archiveType',                                                                                
    'geo_meanElev', 'geo_meanLat', 'geo_meanLon',
    'year', 'yearUnits',                                                                                         
    'paleoData_variableName',
    'paleoData_units',                                                                                           
    'paleoData_values',
    'paleoData_notes',
]

df_tmp = pd.DataFrame(index=range(len(ts_list)), columns=col_str)

In [8]:
# check the keys
ts_list[0].keys()

dict_keys(['mode', 'time_id', 'context', 'additionalDataUrl', 'archiveType', 'dataSetName', 'datasetId', 'originalDataUrl', 'createdBy', 'changelog', 'lipdverseLink', 'maxYear', 'minYear', 'pub1_author', 'pub1_citation', 'pub1_doi', 'pub1_firstauthor', 'pub1_journal', 'pub1_title', 'pub1_year', 'pub2_author', 'pub2_citation', 'pub2_doi', 'pub2_firstauthor', 'pub2_journal', 'pub2_title', 'pub2_year', 'geo_meanLon', 'geo_meanLat', 'geo_meanElev', 'geo_type', 'geo_description', 'geo_ocean', 'geo_secondarySiteName', 'geo_siteName', '@context', 'lipdVersion', 'tableType', 'paleoData_tableName', 'paleoData_filename', 'paleoData_missingValue', 'year', 'yearUnits', 'paleoData_measurementTableName', 'paleoData_TSid', 'paleoData_analyticalError', 'paleoData_analyticalErrorUnits', 'paleoData_archiveSpecies', 'paleoData_ch2kCoreCode', 'paleoData_coralExtensionRate', 'paleoData_coralExtensionRateNotes', 'paleoData_coralHydro2kGroup', 'paleoData_isAnomaly', 'paleoData_isComposite', 'paleoData_jcpCor

In [9]:
# loop over the timeseries and pick those for global temperature analysis
i = 0                                                                                                                
for ts in ts_list:
    # need to filter these variables in the list
    if ts['paleoData_variableName'] not in ['year', 'd18OUncertainty', 'SrCaUncertainty']:
        for name in col_str:                                                                                         
            try:
                df_tmp.loc[i, name] = ts[name]                                                                       
            except:
                df_tmp.loc[i, name] = np.nan                                                                         
    
    i += 1 
        
# drop the rows with all NaNs (those not for global temperature analysis)
df = df_tmp.dropna(how='all')
df

Unnamed: 0,datasetId,dataSetName,archiveType,geo_meanElev,geo_meanLat,geo_meanLon,year,yearUnits,paleoData_variableName,paleoData_units,paleoData_values,paleoData_notes
0,ch2kKA17RYU01,KA17RYU01,coral,-3.5,28.3,130.0,"[1578.58, 1579.08, 1579.58, 1580.08, 1580.58, ...",AD,SrCa,mmol/mol,"[8.802, 9.472, 8.825, 9.355, 8.952, 9.297, 8.8...",Core data is a composite of overlapping indivi...
2,ch2kCH18YOA02,CH18YOA02,coral,,16.448,111.605,"[1987.92, 1988.085, 1988.25, 1988.42, 1988.585...",AD,SrCa,mmol/mol,"[8.58, 8.683, 8.609, 8.37, 8.38, 8.417, 8.584,...",Microatoll; coral rubble samples; data reporte...
4,ch2kFL17DTO02,FL17DTO02,coral,-3.0,24.699,-82.799,"[1837.04, 1837.13, 1837.21, 1837.29, 1837.38, ...",AD,SrCa,mmol/mol,"[9.159, 9.257, 9.245, 9.166, 9.045, 9.013, 8.9...",
6,ch2kFL17DTO02,FL17DTO02,coral,-3.0,24.699,-82.799,"[1837.0, 1838.0, 1839.0, 1840.0, 1841.0, 1842....",AD,SrCa_annual,mmol/mol,"[9.053, 9.01, 8.984, 9.062, 9.054, 9.017, 8.99...",
8,ch2kBO14HTI01,BO14HTI01,coral,-3.6,12.21,109.31,"[1977.37, 1977.45, 1977.54, 1977.62, 1977.7, 1...",AD,d18O,permil,"[-5.4206, -5.3477, -5.1354, -5.7119, -5.9058, ...",A composite of cores TN and BB (CoralHydro2k I...
...,...,...,...,...,...,...,...,...,...,...,...,...
598,ch2kZI08MAY01,ZI08MAY01,coral,-2.0,-12.65,45.1,"[1881.6247, 1881.791367, 1881.958033, 1882.124...",AD,SrCa,mmol/mol,"[8.947578, 8.797017, 8.784511, 8.751525, 8.778...","in situ d18o; for annual (sr/Ca, slope- -0.058..."
600,ch2kZI08MAY01,ZI08MAY01,coral,-2.0,-12.65,45.1,"[1881.6247, 1881.791367, 1881.958033, 1882.124...",AD,d18O_sw,permil,"[0.38, 0.642971429, 0.664339016, 0.632390476, ...","in situ d18o; for annual (sr/Ca, slope- -0.058..."
602,ch2kLI06FIJ01,LI06FIJ01,coral,-10.0,-16.82,179.23,"[1617.5, 1618.5, 1619.5, 1620.5, 1621.5, 1622....",AD,d18O,permil,"[-4.6922, -4.6266, -4.6018, -4.5486, -4.6102, ...",mm-scale drilling but available data is at ann...
604,ch2kSM06LKF02,SM06LKF02,coral,-4.0,24.56,-81.41,"[1960.97, 1961.03, 1961.09, 1961.15, 1961.21, ...",AD,d18O,permil,"[-3.85, -3.98, -4.21, -4.06, -3.97, -4.04, -3....",


In [10]:
# double check the variable names we have
set(df['paleoData_variableName'])

{'SrCa', 'SrCa_annual', 'd18O', 'd18O_annual', 'd18O_sw', 'd18O_sw_annual'}

In [13]:
# save to a pickle file
df.to_pickle('../ch2k.pkl')

In [12]:
!ls ch2k.pkl

ch2k.pkl
