# Bottle data in bats


In [20]:
import pandas as pd
import numpy as np
import xarray as xr
import cf_xarray.units  # noqa: F401
import pint_xarray  # noqa: F401

pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
# Name : [type, long_name, standard_name, units, dict[other_attrs]]
META = {
    "cruise_num": [int, "Cruise Number", None, None],
    "date": [int, "Date", None, None],
    "tow_num": [int, "Tow Number", None, None],
    "latitude_degrees": [int, "Latitude", "latitude", "degrees"],
    "latitude_minutes": [float, "Latitude", "latitude", "minutes"],
    "longitude_degrees": [int, "Longitude", "longitude", "degrees"],
    "longitude_minutes": [float, "Longitude", "longitude", "minutes"],
    "time_in_local": [int, "Time of Water Collection (Local Time)", None, None],
    "time_out_local": [int, "Time of End of Water Collection (Local Time)", None, None],
    "duration_minutes": [int, "Duration of Collection", None, "minutes"],
    "max_depth": [float, "Maximum Depth", None, "meter"],
    "volume_of_water_m3": [float, "Volume of Water Sample", None, "meter^3"],
    "sieve_size": [int, "Sieve Size", None, "micrometer"],
    "wet_weight_mg": [float, "Wet Weight", None, "milligram"],
    "dry_weight_mg": [float, "Dry Weight", None, "milligram"],
    "wet_weight_volume_of_water_mg_m3": [
        float,
        "Wet Weight per Volume of Water",
        None,
        "milligram/meter^3",
    ],
    "dry_weight_volume_of_water_mg_m3": [
        float,
        "Dry Weight per Volume of Water",
        None,
        "milligram/meter^3",
    ],
    "total_wet_weight_volume_all_size_fractions_mg_m3": [
        float,
        "Total Wet Weight per Volume for all Size Fractions",
        None,
        "milligram/meter^3",
    ],
    "total_dry_weight_volume_all_size_fractions_mg_m3": [
        float,
        "Total Dry Weight per Volume for all Size Fractions",
        None,
        "milligram/meter^3",
    ],
    "wet_weight_volume_of_water_normalized_200m_depth_mg_m3": [
        float,
        "Wet Weight per Volume of Water Normalized for 200m Depth",
        None,
        "milligram/meter^2",
    ],
    "dry_weight_volume_of_water_normalized_200m_depth_mg_m3": [
        float,
        "Dry Weight per Volume of Water Normalized for 200m Depth",
        None,
        "milligram/meter^2",
    ],
    "total_wet_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3": [
        float,
        "Total Wet Weight per Volume for all Size Fractions Normalized for 200m Depth",
        None,
        "milligram/meter^2",
    ],
    "total_dry_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3": [
        float,
        "Total Dry Weight per Volume for all Size Fractions Normalized for 200m Depth",
        None,
        "milligram/meter^2",
    ],
}

INDEX = [
    "cruise_num",
    "date",
    "tow_num",
    "latitude_degrees",
    "latitude_minutes",
    "longitude_degrees",
    "longitude_minutes",
    "time_in_local",
    "time_out_local",
    "sieve_size",
]

DATA = list(set(META.keys()) - set(INDEX))
HEADER = list(META.keys())
DTYPE = {k: v[0] for k, v in META.items()}

In [3]:
# Set the data type of each column manualy
raw_data = pd.read_csv(
    "../../1_raw/bats_zooplankton.txt",
    skiprows=36,
    sep="\t",
    names=HEADER,
    dtype=DTYPE,
)
raw_data.head()

Unnamed: 0,cruise_num,date,tow_num,latitude_degrees,latitude_minutes,longitude_degrees,longitude_minutes,time_in_local,time_out_local,duration_minutes,...,wet_weight_mg,dry_weight_mg,wet_weight_volume_of_water_mg_m3,dry_weight_volume_of_water_mg_m3,total_wet_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_mg_m3,wet_weight_volume_of_water_normalized_200m_depth_mg_m3,dry_weight_volume_of_water_normalized_200m_depth_mg_m3,total_wet_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,total_dry_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3
0,20066,19940406,1,31,33.89,63,52.45,1558,1626,28,...,2460.0,210.4,18.15,1.55,53.73,4.22,3630.98,310.55,10745.33,843.69
1,20066,19940406,1,31,33.89,63,52.45,1558,1626,28,...,1220.0,120.2,9.0,0.89,53.73,4.22,1800.73,177.42,10745.33,843.69
2,20066,19940406,1,31,33.89,63,52.45,1558,1626,28,...,2200.0,110.4,16.24,0.81,53.73,4.22,3247.22,162.95,10745.33,843.69
3,20066,19940406,1,31,33.89,63,52.45,1558,1626,28,...,940.0,94.4,6.94,0.7,53.73,4.22,1387.45,139.34,10745.33,843.69
4,20066,19940406,1,31,33.89,63,52.45,1558,1626,28,...,460.0,36.2,3.39,0.27,53.73,4.22,678.96,53.43,10745.33,843.69


In [4]:
# convert -999 to NaN
raw_data = raw_data.replace(-999, np.nan)

In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7299 entries, 0 to 7298
Data columns (total 23 columns):
 #   Column                                                                  Non-Null Count  Dtype  
---  ------                                                                  --------------  -----  
 0   cruise_num                                                              7299 non-null   int64  
 1   date                                                                    7299 non-null   int64  
 2   tow_num                                                                 7299 non-null   int64  
 3   latitude_degrees                                                        7299 non-null   int64  
 4   latitude_minutes                                                        7299 non-null   float64
 5   longitude_degrees                                                       7299 non-null   int64  
 6   longitude_minutes                                                       7299 non

We replace NaN values in time column with 0.


## Clean data

---


Remove the data when time and position are not known.


In [6]:
raw_data = raw_data.dropna(subset=INDEX)

We only select the data with a flag equal to 2 (verified/acceptable).


In [7]:
fig = (
    raw_data.isna()
    .sum()
    .plot(
        kind="bar",
        title=f"Missing values per column for a total of {len(raw_data)} entries",
        labels=dict(
            index="Column",
            value="Number of missing values",
        ),
    )
)
# Rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# multiply the figure ratio of height by 2
fig.update_layout(height=500)
# Remove legend
fig.update_layout(showlegend=False)
fig.show()

## Manage date

---


In [8]:
# Format is yearmonthday : yyyymmdd
raw_data["date"] = pd.to_datetime(raw_data["date"], format="%Y%m%d")
time_in = raw_data["time_in_local"].astype(int).apply(lambda x: str(x).zfill(4))
time_in = pd.to_timedelta(time_in.str[:2] + ":" + time_in.str[2:] + ":00")
raw_data["time"] = raw_data["date"] + time_in
condition_next_day = (raw_data["time_out_local"] - raw_data["time_in_local"]) < 0
# compute the duration
raw_data["time_out_local"] = raw_data["time_out_local"] + 2400 * condition_next_day
raw_data["duration"] = pd.to_timedelta(
    raw_data["time_out_local"] - raw_data["time_in_local"], unit="m"
)
raw_data.head()

Unnamed: 0,cruise_num,date,tow_num,latitude_degrees,latitude_minutes,longitude_degrees,longitude_minutes,time_in_local,time_out_local,duration_minutes,...,wet_weight_volume_of_water_mg_m3,dry_weight_volume_of_water_mg_m3,total_wet_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_mg_m3,wet_weight_volume_of_water_normalized_200m_depth_mg_m3,dry_weight_volume_of_water_normalized_200m_depth_mg_m3,total_wet_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,total_dry_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,time,duration
0,20066,1994-04-06,1,31,33.89,63,52.45,1558,1626,28,...,18.15,1.55,53.73,4.22,3630.98,310.55,10745.33,843.69,1994-04-06 15:58:00,0 days 01:08:00
1,20066,1994-04-06,1,31,33.89,63,52.45,1558,1626,28,...,9.0,0.89,53.73,4.22,1800.73,177.42,10745.33,843.69,1994-04-06 15:58:00,0 days 01:08:00
2,20066,1994-04-06,1,31,33.89,63,52.45,1558,1626,28,...,16.24,0.81,53.73,4.22,3247.22,162.95,10745.33,843.69,1994-04-06 15:58:00,0 days 01:08:00
3,20066,1994-04-06,1,31,33.89,63,52.45,1558,1626,28,...,6.94,0.7,53.73,4.22,1387.45,139.34,10745.33,843.69,1994-04-06 15:58:00,0 days 01:08:00
4,20066,1994-04-06,1,31,33.89,63,52.45,1558,1626,28,...,3.39,0.27,53.73,4.22,678.96,53.43,10745.33,843.69,1994-04-06 15:58:00,0 days 01:08:00


## Manage space

---


Standard longitude should be between -180 and 180 degrees and expressed as degrees_east.


In [9]:
# Convert columns ["latitude_degrees","latitude_minutes"] to decimal degrees
raw_data["latitude"] = raw_data["latitude_degrees"] + raw_data["latitude_minutes"] / 60
raw_data["latitude"]
# Same for longitude
raw_data["longitude"] = -(
    raw_data["longitude_degrees"] + raw_data["longitude_minutes"] / 60
)

In [10]:
raw_data[["latitude_degrees", "latitude_minutes", "latitude"]]

Unnamed: 0,latitude_degrees,latitude_minutes,latitude
0,31,33.890,31.564833
1,31,33.890,31.564833
2,31,33.890,31.564833
3,31,33.890,31.564833
4,31,33.890,31.564833
...,...,...,...
7294,31,38.242,31.637367
7295,31,38.242,31.637367
7296,31,38.242,31.637367
7297,31,38.242,31.637367


In [11]:
raw_data[["longitude_degrees", "longitude_minutes", "longitude"]]

Unnamed: 0,longitude_degrees,longitude_minutes,longitude
0,63,52.450,-63.874167
1,63,52.450,-63.874167
2,63,52.450,-63.874167
3,63,52.450,-63.874167
4,63,52.450,-63.874167
...,...,...,...
7294,64,12.072,-64.201200
7295,64,12.072,-64.201200
7296,64,12.072,-64.201200
7297,64,12.072,-64.201200


In [12]:
fig = raw_data["longitude"].plot(kind="hist", title="Longitude distribution", nbins=100)
# change the axis name
fig.update_xaxes(title_text="Longitude (degree_east)")
fig.update_yaxes(title_text="Count")
fig.show()

In [13]:
fig = raw_data["latitude"].plot(kind="hist", title="Latitude distribution", nbins=100)
# change the axis name
fig.update_xaxes(title_text="Latitude (degree_north)")
fig.update_yaxes(title_text="Count")
fig.show()

In [15]:
raw_data["max_depth"].astype(float).plot.hist(
    nbins=100, title="Max depth distribution", labels=dict(value="Max depth (m)")
)

## Produce preprocessed data

---


In [25]:
preprocessed_data = pd.DataFrame(
    {
        "time": raw_data["time"],
        "latitude": raw_data["latitude"],
        "longitude": raw_data["longitude"],
        "sieve_size": raw_data["sieve_size"],
        **{k: raw_data[k] for k in DATA},
    }
)
preprocessed_data.head()

Unnamed: 0,time,latitude,longitude,sieve_size,wet_weight_volume_of_water_mg_m3,dry_weight_volume_of_water_mg_m3,total_wet_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,wet_weight_volume_of_water_normalized_200m_depth_mg_m3,total_wet_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,max_depth,dry_weight_mg,dry_weight_volume_of_water_normalized_200m_depth_mg_m3,wet_weight_mg,duration_minutes,volume_of_water_m3
0,1994-04-06 15:58:00,31.564833,-63.874167,200.0,18.15,1.55,10745.33,3630.98,53.73,4.22,843.69,,210.4,310.55,2460.0,28,135.501
1,1994-04-06 15:58:00,31.564833,-63.874167,500.0,9.0,0.89,10745.33,1800.73,53.73,4.22,843.69,,120.2,177.42,1220.0,28,135.501
2,1994-04-06 15:58:00,31.564833,-63.874167,1000.0,16.24,0.81,10745.33,3247.22,53.73,4.22,843.69,,110.4,162.95,2200.0,28,135.501
3,1994-04-06 15:58:00,31.564833,-63.874167,2000.0,6.94,0.7,10745.33,1387.45,53.73,4.22,843.69,,94.4,139.34,940.0,28,135.501
4,1994-04-06 15:58:00,31.564833,-63.874167,5000.0,3.39,0.27,10745.33,678.96,53.73,4.22,843.69,,36.2,53.43,460.0,28,135.501


## Final plot

---


In [26]:
preprocessed_data.describe()

Unnamed: 0,time,latitude,longitude,sieve_size,wet_weight_volume_of_water_mg_m3,dry_weight_volume_of_water_mg_m3,total_wet_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,wet_weight_volume_of_water_normalized_200m_depth_mg_m3,total_wet_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_mg_m3,total_dry_weight_volume_all_size_fractions_normalized_200m_depth_mg_m3,max_depth,dry_weight_mg,dry_weight_volume_of_water_normalized_200m_depth_mg_m3,wet_weight_mg,duration_minutes,volume_of_water_m3
count,7272,7272.0,7272.0,7272.0,7231.0,7232.0,7205.0,7231.0,7205.0,7210.0,7210.0,6706.0,7232.0,7232.0,7231.0,7272.0,7272.0
mean,2008-03-31 00:22:06.105610496,31.663996,-64.169852,1741.075358,3.895471,0.605147,3860.85592,779.099026,19.304351,3.016734,603.333162,183.52353,376.440926,121.031502,2477.068259,31.858223,665.466877
min,1994-04-06 15:58:00,31.411317,-64.504833,200.0,0.0,0.0,131.28,0.0,0.66,0.19,38.35,40.0,0.0,0.0,0.0,12.0,71.77
25%,2001-03-09 14:50:00,31.653317,-64.183042,500.0,1.23,0.24,1853.93,245.69,9.27,1.64,328.07,162.0,144.75,48.3375,726.2,26.0,477.6
50%,2007-12-06 22:52:00,31.665817,-64.16745,1000.0,2.71,0.46,2965.97,542.05,14.83,2.48,496.68,187.2,293.0,92.865,1690.2,34.0,617.374
75%,2015-05-13 10:53:00,31.675517,-64.156617,2000.0,4.96,0.78,4860.15,991.985,24.3,3.78,756.7,207.5,485.35,156.1025,3165.5,37.0,825.023
max,2022-12-13 22:43:00,31.9745,-63.779333,5000.0,211.27,8.88,55146.31,42254.72,275.73,14.59,2917.95,306.2,7647.6,1775.62,181990.8,71.0,1825.68
std,,0.041134,0.050405,1741.41274,5.253114,0.571305,3289.735975,1050.630835,16.448575,1.964997,393.005221,35.556761,368.296287,114.251762,3598.616782,8.525664,267.627208


In [27]:
df_normalized = preprocessed_data.groupby(["time", "latitude", "longitude"]).mean()
df_normalized = (df_normalized - df_normalized.min()) / (
    df_normalized.max() - df_normalized.min()
)
fig = df_normalized.plot.box()
fig.update_xaxes(title_text="Variable")
fig.update_yaxes(title_text="Normalized values distribution")
# Rotate x-axis labels by 45 degrees
fig.update_xaxes(tickangle=-45)
# multiply the figure ratio of height by 2
fig.update_layout(height=800)
fig.show()

## Export preprocessed data

---


In [28]:
preprocessed_data.to_csv("../../2_preprocessed/bats_zooplankton.csv", index=False)

In [29]:
out_data = xr.Dataset.from_dataframe(preprocessed_data)
for k, v in META.items():
    if k not in out_data:
        continue
    if v[1] is not None:
        out_data[k].attrs["long_name"] = v[1]
    if v[2] is not None:
        out_data[k].attrs["standard_name"] = v[2]
    if v[3] is not None:
        out_data[k].attrs["units"] = v[3]
    if len(v) > 4:
        out_data[k].attrs.update(v[4])
out_data["time"].attrs = {"axis": "T"}
out_data["latitude"].attrs = {"axis": "Y", "units": "degrees_north"}
out_data["longitude"].attrs = {"axis": "X", "units": "degrees_east"}
out_data["max_depth"].attrs = {"axis": "Z", "units": "meters"}
out_data

In [30]:
try:
    out_data.pint.quantify()
except Exception as e:
    print(e)
    print("Some units cannot be quantified and are only here for information.")

In [32]:
out_data.to_netcdf("../../2_preprocessed/bats_zooplankton.nc")