# Bottle data from BATS (Bermuda Atlantic Time-series Study)


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Route every pandas `.plot` call in this notebook to the plotly backend.
pd.options.plotting.backend = "plotly"

## Load raw data

---


In [None]:
# Identification / coordinate columns and the type each one is parsed as.
INDEX = dict(
    id=int,
    date=int,
    date_decy=float,
    time=int,
    lat=float,
    lon=float,
    quality_flag=int,
    depth=float,
)
# Measurement columns, listed in file order; every one of them is parsed as float.
DATA = dict.fromkeys(
    (
        "temperature",
        "ctd salinity",
        "salinity",
        "sigma-theta",
        "oxygen",
        "oxygen fix temp",
        "oxy anomaly",
        "dissolved inorganic carbon",
        "alkalinity",
        "nitrate+nitrite",
        "nitrite",
        "phosphate",
        "silicate",
        "poc",
        "pon",
        "toc",
        "tn",
        "bacteria enumeration",
        "pop",
        "total dissolved phosphorus",
        "Low-level phosphorus",
        "particulate biogenic silica",
        "particulate lithogenic silica",
        "prochlorococcus",
        "synechococcus",
        "picoeukaryotes",
        "nanoeukaryotes",
    ),
    float,
)
# Full ordered list of column names: index columns first, then measurements.
HEADER = [*INDEX, *DATA]
# Column name -> type mapping covering every column in HEADER.
DTYPE = {**INDEX, **DATA}

In [None]:
# Read the tab-separated bottle file, skipping its first 59 lines and imposing
# the column names (HEADER) and per-column types (DTYPE) declared above.
raw_data = pd.read_csv(
    "../../1_raw/bats_bottle.txt",
    sep="\t",
    skiprows=59,
    names=HEADER,
    dtype=DTYPE,
)
# Force every measurement column to numeric; unparseable values become NaN.
measurement_columns = list(DATA)
raw_data[measurement_columns] = raw_data[measurement_columns].apply(
    pd.to_numeric, errors="coerce"
)
raw_data.head()

We convert the -999 missing-value sentinel to NaN, then replace the NaN values in the `time` column with 0.


In [None]:
# -999 is treated as the missing-value marker: turn it into NaN everywhere.
raw_data = raw_data.replace(to_replace=-999, value=np.nan)
# The time of day is not needed for the day/night cycle here, so a missing
# time is set to 0 rather than left as NaN.
time_of_day = raw_data["time"]
raw_data["time"] = time_of_day.fillna(value=0)

In [None]:
# Print a summary of the loaded frame to check how the columns were parsed.
raw_data.info()

## Clean data

---


Remove the rows in which any index column (identifier, date, time, position, quality flag or depth) is missing.


In [None]:
# Discard every row that has a missing value in at least one INDEX column.
raw_data = raw_data.dropna(subset=[*INDEX])

We keep only the rows whose quality flag equals 2 (verified/acceptable).


In [None]:
# Show how many rows carry each quality-flag value before filtering.
# Single quotes are used inside the f-string so it also parses on Python < 3.12,
# where reusing the enclosing quote character in a replacement field is a SyntaxError.
print(f"Count QF flag : {np.unique(raw_data['quality_flag'], return_counts=True)}")
# Drop when QF is not 2. The explicit copy makes the result an independent frame,
# so the column assignments performed later in the notebook do not trigger
# pandas' SettingWithCopyWarning.
raw_data = raw_data[raw_data["quality_flag"] == 2].copy()

In [None]:
# Number of NaN entries in each column.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
)
# Tilt the x-axis tick labels by 45 degrees.
fig.update_xaxes(tickangle=-45)
# Set the figure height to 500 and hide the legend.
fig.update_layout(height=500, showlegend=False)
fig.show()

## Manage date

---


In [None]:
# Dates are stored as yyyymmdd (year, month, day).
raw_data["date"] = pd.to_datetime(raw_data["date"], format="%Y%m%d")
# Times are stored as hhmm (hour, minute): left-pad to four digits, then split
# into "hh:mm:00" so the value can be parsed as a timedelta.
padded_time = raw_data["time"].astype(int).apply(lambda hhmm: f"{hhmm:04d}")
hours = padded_time.str[:2]
minutes = padded_time.str[2:]
raw_data["time"] = pd.to_timedelta(hours + ":" + minutes + ":00")
# Combine the day and the time of day into a single timestamp.
raw_data["date"] = raw_data["date"] + raw_data["time"]

## Manage space

---


Standard longitude should lie between -180 and 180 degrees and be expressed in degrees_east, so we flip the sign of the raw longitude values.


In [None]:
# Flip the sign of the longitude.
# NOTE(review): presumably the raw file stores degrees west — confirm against the data source.
raw_data["lon"] = -1 * raw_data["lon"]

In [None]:
# Histogram of the longitude values (nbins=100) with explicit axis titles.
fig = (
    raw_data["lon"]
    .plot(kind="hist", nbins=100, title="Longitude distribution")
    .update_xaxes(title_text="Longitude (degree_east)")
    .update_yaxes(title_text="Count")
)
fig.show()

In [None]:
# Histogram of the latitude values (nbins=100) with explicit axis titles.
fig = (
    raw_data["lat"]
    .plot(kind="hist", nbins=100, title="Latitude distribution")
    .update_xaxes(title_text="Latitude (degree_north)")
    .update_yaxes(title_text="Count")
)
fig.show()

In [None]:
# Summary statistics of the depth column, cast to float.
raw_data["depth"].astype(float).describe()

In [None]:
# Histogram of the depth values (nbins=100), cast to float.
raw_data["depth"].astype(float).plot.hist(nbins=100)

## Produce preprocessed data

---


In [None]:
# Coordinate columns: output name -> column of raw_data it is taken from.
# Note that the output "time" is the combined timestamp held in raw_data["date"].
coordinate_sources = {
    "time": "date",
    "latitude": "lat",
    "longitude": "lon",
    "depth": "depth",
}
output_columns = {
    new_name: raw_data[old_name] for new_name, old_name in coordinate_sources.items()
}
# Measurement columns keep their original names.
output_columns.update({name: raw_data[name] for name in DATA})
preprocessed_data = pd.DataFrame(output_columns)
preprocessed_data.head()

## Final plot

---


In [None]:
# Summary statistics of the preprocessed columns.
preprocessed_data.describe()

In [None]:
# Average the measurements sharing the same (time, latitude, longitude, depth).
sample_means = preprocessed_data.groupby(
    ["time", "latitude", "longitude", "depth"]
).mean()
# Min-max scale each column to [0, 1] so all variables fit on one axis.
column_min = sample_means.min()
column_range = sample_means.max() - sample_means.min()
df_normalized = (sample_means - column_min) / column_range
# One box per variable, with explicit axis titles.
fig = (
    df_normalized.plot.box()
    .update_xaxes(title_text="Variable")
    .update_yaxes(title_text="Normalized values distribution")
)
fig.show()

## Export preprocessed data

---


In [None]:
# Write the preprocessed table to CSV.
# NOTE(review): no `index` argument is passed, so the row index is written as
# an unnamed first column — confirm that downstream readers expect it.
preprocessed_data.to_csv("../../2_preprocessed/bats_bottle.csv")