# Bottle data in BATS (Bermuda Atlantic Time-series Study)


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

# Route every pandas `.plot` call in this notebook through the Plotly backend.
pd.options.plotting.backend = "plotly"

## Load raw data

---


In [2]:
# Columns that identify a sample (id, dates, times, position, quality flag, depth),
# mapped to the dtype each one is parsed with.
INDEX = {
    "id": int,
    "year month day - deploy": int,
    "year month day - recover": int,
    "dec_year deploy": float,
    "dec_year_recover": float,
    "time_deploy": int,
    "time_recover": int,
    "latitude of water collection - deploy": float,
    "latitude of water collection - recover": float,
    "longitude of water collection - deploy": float,
    "longitude of water collection - recover": float,
    "quality_flag": int,
    "collection depth": float,
}

# Measurement columns; every one of them is parsed as float.
DATA = dict.fromkeys(
    [
        "pressure",
        "ctd temp",
        "salinity from goflo bottle or ctd",
        "14c primary production light bottle #1",
        "14c primary production light bottle #2",
        "14c primary production light bottle #3",
        "14c primary production dark bottle",
        "14c primary production time zero",
        "primary production mean light values - dark value",
    ],
    float,
)

# Full column list in file order (index columns first, then measurements)
# and the combined name -> dtype mapping handed to the CSV reader.
HEADER = [*INDEX, *DATA]
DTYPE = INDEX | DATA

In [3]:
# Read the tab-separated file with explicit column names and dtypes.
# The first 39 lines are skipped (presumably the file's descriptive header — TODO confirm).
data_columns = list(DATA)
raw_data = pd.read_csv(
    "../../1_raw/bats_primary_production.txt",
    sep="\t",
    skiprows=39,
    names=HEADER,
    dtype=DTYPE,
)
# Force the measurement columns to numeric; anything unparsable becomes NaN.
raw_data[data_columns] = raw_data[data_columns].apply(pd.to_numeric, errors="coerce")
raw_data.head()

Unnamed: 0,id,year month day - deploy,year month day - recover,dec_year deploy,dec_year_recover,time_deploy,time_recover,latitude of water collection - deploy,latitude of water collection - recover,longitude of water collection - deploy,...,collection depth,pressure,ctd temp,salinity from goflo bottle or ctd,14c primary production light bottle #1,14c primary production light bottle #2,14c primary production light bottle #3,14c primary production dark bottle,14c primary production time zero,primary production mean light values - dark value
0,1000308101,19881218,19881218,1988.965,-999.0,-999,-999,31.669,-999.0,64.049,...,5.0,-999.0,-999.0,-999.0,7.21,6.59,-999.0,0.75,1.26,6.15
1,1000308102,19881218,19881218,1988.965,-999.0,-999,-999,31.669,-999.0,64.049,...,25.0,-999.0,-999.0,-999.0,6.0,-999.0,-999.0,-999.0,1.97,-999.0
2,1000308103,19881218,19881218,1988.965,-999.0,-999,-999,31.669,-999.0,64.049,...,50.0,-999.0,-999.0,-999.0,3.62,2.69,3.19,1.02,1.57,2.15
3,1000308104,19881218,19881218,1988.965,-999.0,-999,-999,31.669,-999.0,64.049,...,75.0,-999.0,-999.0,-999.0,2.21,1.4,1.55,1.43,1.47,0.29
4,1000308105,19881218,19881218,1988.965,-999.0,-999,-999,31.669,-999.0,64.049,...,100.0,-999.0,-999.0,-999.0,1.15,1.78,8.48,0.95,1.46,2.85


We convert the `-999` missing-value sentinel to NaN, then replace NaN values in the `time_deploy` and `time_recover` columns with 0.


In [4]:
# -999 is the file's missing-value marker: turn every occurrence into NaN.
raw_data = raw_data.replace(-999, np.nan)
# The time-of-day columns get 0 instead of NaN (the day/night cycle is not
# relevant here), which also lets them be cast back to integers.
time_columns = ["time_deploy", "time_recover"]
raw_data[time_columns] = raw_data[time_columns].fillna(0).astype(int)

In [5]:
# Column dtypes and non-null counts after the -999 -> NaN conversion.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 22 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   id                                                 3586 non-null   int64  
 1   year month day - deploy                            3586 non-null   int64  
 2   year month day - recover                           3586 non-null   int64  
 3   dec_year deploy                                    3586 non-null   float64
 4   dec_year_recover                                   1610 non-null   float64
 5   time_deploy                                        3586 non-null   int64  
 6   time_recover                                       3586 non-null   int64  
 7   latitude of water collection - deploy              3570 non-null   float64
 8   latitude of water collection - recover             1602 non-null   float64
 9   longitud

## Clean data

---


Remove the data when time and position are not known.


We only select the data with a flag equal to 2 (verified/acceptable).


In [6]:
# Show how many rows carry each quality flag before filtering.
# The inner quotes are single quotes so the f-string also parses on Python < 3.12
# (reusing the outer quote type inside an f-string is only valid from 3.12 on).
print(f"Count QF flag : {np.unique(raw_data['quality_flag'], return_counts=True)}")
# Keep only the rows flagged 2. `.copy()` makes the result an independent frame,
# so the column assignments done in later cells do not operate on a slice of the
# unfiltered data (avoids SettingWithCopyWarning).
raw_data = raw_data[raw_data["quality_flag"] == 2].copy()

Count QF flag : (array([-3,  2]), array([   8, 3578]))


In [7]:
# Number of NaN entries in each column, drawn as one bar per column.
missing_per_column = raw_data.isna().sum()
fig = missing_per_column.plot(
    kind="bar",
    title=f"Missing values per column for a total of {len(raw_data)} entries",
    labels={"index": "Column", "value": "Number of missing values"},
)
# Tilt the long column names so they stay readable.
fig.update_xaxes(tickangle=-45)
# Fixed figure height of 500 px and no legend (there is a single series).
fig.update_layout(height=500, showlegend=False)
fig.show()

In [8]:
# Per-row offset between the deploy and recover positions.
longitude_diff = (
    raw_data["longitude of water collection - deploy"]
    - raw_data["longitude of water collection - recover"]
)
latitude_diff = (
    raw_data["latitude of water collection - deploy"]
    - raw_data["latitude of water collection - recover"]
)

# Index the offsets by deploy date (stored as yyyymmdd integers) and take the
# median of each calendar year; years without any value are dropped.
deploy_date = pd.to_datetime(raw_data["year month day - deploy"], format="%Y%m%d")
position_diff = pd.DataFrame(
    {
        "date": deploy_date,
        "longitude_diff": longitude_diff,
        "latitude_diff": latitude_diff,
    }
)
yearly_median = (
    position_diff.set_index("date").resample("YE").median().reset_index().dropna()
)

fig = yearly_median.plot(
    x="date",
    y=["latitude_diff", "longitude_diff"],
    kind="bar",
)

# Axis titles and figure title; the returned figure is the cell's output.
fig.update_layout(
    title="Difference in latitude and longitude between deploy and recover",
    xaxis_title="Year",
    yaxis_title="Median difference in degrees",
)

## Manage date

---


In [9]:
def _combine_date_and_time(yyyymmdd: pd.Series, hhmm: pd.Series) -> pd.Series:
    """Build datetimes from an integer ``yyyymmdd`` date and an integer ``hhmm`` time.

    Parameters
    ----------
    yyyymmdd : pd.Series
        Dates encoded as year-month-day integers, e.g. ``19881218``.
    hhmm : pd.Series
        Times of day encoded as hour-minute integers, e.g. ``930`` for 09:30.
        A value of ``0`` maps to midnight.

    Returns
    -------
    pd.Series
        Datetimes equal to the parsed date plus the time of day, on the same index.
    """
    day = pd.to_datetime(yyyymmdd, format="%Y%m%d")
    # Left-pad to four digits so that e.g. 930 becomes "0930" before it is
    # split into hours (first two characters) and minutes (the rest).
    clock = hhmm.astype(str).str.zfill(4)
    return day + pd.to_timedelta(clock.str[:2] + ":" + clock.str[2:] + ":00")


# Replace the integer time-of-day columns with full timestamps, then drop the
# date columns that are no longer needed. Missing times were set to 0 earlier,
# so those rows end up at 00:00 of their date.
raw_data = raw_data.assign(
    time_deploy=_combine_date_and_time(
        raw_data["year month day - deploy"], raw_data["time_deploy"]
    ),
    time_recover=_combine_date_and_time(
        raw_data["year month day - recover"], raw_data["time_recover"]
    ),
).drop(
    columns=[
        "year month day - deploy",
        "year month day - recover",
        "dec_year deploy",
        "dec_year_recover",
    ],
)

In [10]:
# Minutes elapsed between deploy and recover for every row, tagged with the
# deploy year.
elapsed = raw_data["time_recover"] - raw_data["time_deploy"]
time_diff = pd.DataFrame(
    {
        "year": raw_data["time_deploy"].dt.year,
        "time_diff": elapsed.dt.total_seconds() / 60,  # in minutes
    }
)

# Only strictly positive durations are kept before taking the yearly median.
positive_only = time_diff.loc[time_diff["time_diff"] > 0]
yearly_median = positive_only.groupby("year").median().reset_index()
fig = yearly_median.plot(x="year", y=["time_diff"], kind="bar")

# Titles, and no legend since a single series is shown.
fig.update_layout(
    title="Difference in minutes between deploy and recover",
    xaxis_title="Year",
    yaxis_title="Median difference in minutes",
    showlegend=False,
)
fig.show()

## Manage space

---


Standard longitude should be between -180 and 180 degrees and expressed as degrees_east.


In [None]:
# Create the short spatial columns used by the rest of the notebook from the
# source columns (raw_data has no "lon", "lat" or "depth" column before this cell).
# - The deploy position is used; in the info() output above, the recover position
#   has far fewer non-null values.
# - Longitude is negated to express it as degrees_east; the file stores it as a
#   positive number (presumably degrees west — TODO confirm against the file header).
# Deriving from the source column (instead of negating "lon" in place) keeps the
# cell idempotent: re-running it does not flip the sign again.
raw_data = raw_data.assign(
    lon=-raw_data["longitude of water collection - deploy"],
    lat=raw_data["latitude of water collection - deploy"],
    depth=raw_data["collection depth"],
)

In [None]:
# Histogram of the longitude values over 100 bins.
fig = raw_data["lon"].plot(kind="hist", nbins=100, title="Longitude distribution")
# Name both axes in a single layout update.
fig.update_layout(xaxis_title="Longitude (degree_east)", yaxis_title="Count")
fig.show()

In [None]:
# Histogram of the latitude values over 100 bins.
fig = raw_data["lat"].plot(kind="hist", nbins=100, title="Latitude distribution")
# Name both axes in a single layout update.
fig.update_layout(xaxis_title="Latitude (degree_north)", yaxis_title="Count")
fig.show()

In [None]:
# Summary statistics (count, mean, quartiles, ...) of the depth values, cast to float.
raw_data["depth"].astype(float).describe()

In [None]:
# Histogram of the depth values over 100 bins, titled and labelled like the
# longitude and latitude histograms above so the figure stands on its own.
fig = raw_data["depth"].astype(float).plot(
    kind="hist", title="Depth distribution", nbins=100
)
fig.update_xaxes(title_text="Depth")
fig.update_yaxes(title_text="Count")
fig.show()

## Produce preprocessed data

---


In [None]:
# Assemble the table to export: one time, position and depth per row, followed by
# every measurement column listed in DATA.
preprocessed_data = pd.DataFrame(
    {
        # raw_data has no "date" column; the timestamps built in the
        # "Manage date" section are "time_deploy" and "time_recover".
        # The deploy timestamp is used here (assumed to be the intended
        # sample time — TODO confirm).
        "time": raw_data["time_deploy"],
        "latitude": raw_data["lat"],
        "longitude": raw_data["lon"],
        "depth": raw_data["depth"],
        **{name: raw_data[name] for name in DATA},
    }
)
preprocessed_data.head()

## Final plot

---


In [None]:
# Summary statistics of the assembled table.
preprocessed_data.describe()

In [None]:
# Average the rows that share the same (time, latitude, longitude, depth), then
# min-max scale each variable so that all of them fit on a common 0-1 axis.
grouped_mean = preprocessed_data.groupby(
    ["time", "latitude", "longitude", "depth"]
).mean()
column_min = grouped_mean.min()
df_normalized = (grouped_mean - column_min) / (grouped_mean.max() - column_min)

# One box per variable.
fig = df_normalized.plot.box()
fig.update_layout(
    xaxis_title="Variable",
    yaxis_title="Normalized values distribution",
)
fig.show()

## Export preprocessed data

---


In [None]:
# Write the preprocessed table to CSV. `index=False` is not passed, so the
# DataFrame index is written as the first column.
# NOTE(review): this path goes up one directory level, while the raw input was
# read from "../../1_raw/" (two levels up) — confirm the output location is intended.
preprocessed_data.to_csv("../2_preprocessed/bats_bottle.csv")