## Train-test split

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from sklearn.model_selection import train_test_split

# Fixed seed for NumPy's global RNG; also reused as the split's random_state.
random_seed = 2024
np.random.seed(random_seed)

# Allow wide frames to render up to 500 columns before truncating.
pd.options.display.max_columns = 500

In [None]:
# Directory (relative path) that the bottle CSV is read from.
bottle_data_path = '../data/bottle'

In [None]:
# Load the bottle data, using the first CSV column as the row index.
bottle_df = pd.read_csv(f'{bottle_data_path}/bottle_01.csv', index_col=0)

In [None]:
# Overview of the loaded frame: columns, dtypes and non-null counts.
bottle_df.info()

Perform a random train-test split of the data. R_SALINITY is the target feature.

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
bottle_df_train, bottle_df_test = train_test_split(
    bottle_df,
    test_size=0.2,
    random_state=random_seed,
)

# Sanity check: (rows, columns) of each split.
print(bottle_df_train.shape, bottle_df_test.shape)

## Exploring the data

Let's explore the distribution of the features, and investigate the missing values.

In [None]:
def hist_plot(dataset, f_size=(10, 5)):
    """Draw a 50-bin histogram of ``dataset`` on a new figure and show it.

    Parameters
    ----------
    dataset : pandas.Series or other data accepted by ``seaborn.histplot``
        Values to plot.
    f_size : tuple of (float, float), optional
        Figure size in inches as ``(width, height)``.
    """
    _, ax = plt.subplots(figsize=f_size)
    sns.histplot(data=dataset, kde=False, bins=50, ax=ax)

    # Title the figure with the series name when the input carries one.
    name = getattr(dataset, "name", None)
    if name is not None:
        ax.set_title(f"Distribution of {name}")

    # Render the figure now; the previous bare `plt.plot()` call drew nothing.
    plt.show()

def box_plot(dataset, f_size=(10, 5)):
    """Draw a box plot of ``dataset`` on a new figure and show it.

    Parameters
    ----------
    dataset : pandas.Series or pandas.DataFrame
        Data to plot; its ``.values`` array is handed to ``seaborn.boxplot``.
    f_size : tuple of (float, float), optional
        Figure size in inches as ``(width, height)``.
    """
    _, ax = plt.subplots(figsize=f_size)
    sns.boxplot(data=dataset.values, ax=ax)

    # `.values` drops the series name, so put it back as the title to keep
    # the plot identifiable on its own.
    name = getattr(dataset, "name", None)
    if name is not None:
        ax.set_title(f"Box plot of {name}")

    # Render the figure now; the previous bare `plt.plot()` call drew nothing.
    plt.show()

In [None]:
def missing_values_table(df):
    """Summarize the columns of ``df`` that contain missing values.

    Prints a short summary (total number of columns and how many of them
    have missing values) and returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Missing Values'`` (null
        count) and ``'% of Total Values'`` (null share of the rows, rounded
        to one decimal). Only columns with at least one missing value are
        kept, sorted by percentage in descending order.
    """
    # Count the nulls once and derive the percentage from that count.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    # Combine both series into one table with readable column names.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1).rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Filter on the raw count rather than the percentage: for a zero-row
    # frame the percentage is NaN (0 / 0), and `NaN != 0` would wrongly
    # report every column as having missing values.
    mis_val_table_ren_columns = (
        mis_val_table[mis_val_table['Missing Values'] > 0]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

#### R_Depth

Reported Depth (from pressure) in meters

In [None]:
# Missing R_Depth values in the train and test splits, reported like the other features.
print(bottle_df_train['R_Depth'].isnull().sum(), bottle_df_test['R_Depth'].isnull().sum())

In [None]:
# Histogram and box plot of R_Depth in the training split.
hist_plot(bottle_df_train['R_Depth'])
box_plot(bottle_df_train['R_Depth'])


#### R_TEMP

Reported Temperature

In [None]:
# Histogram and box plot of R_TEMP in the training split.
hist_plot(bottle_df_train['R_TEMP'])
box_plot(bottle_df_train['R_TEMP'])

In [None]:
# Missing R_TEMP values: train count, then test count.
print(bottle_df_train['R_TEMP'].isna().sum(), bottle_df_test['R_TEMP'].isna().sum())

In [None]:
# Summary statistics of R_TEMP in the training split.
bottle_df_train['R_TEMP'].describe()

In [None]:
# R_TEMP histogram again (same call as earlier in this section), shown next to the summary statistics.
hist_plot(bottle_df_train['R_TEMP'])

#### R_SALINITY - target feature

Reported Salinity

In [None]:
# Summary statistics of the target, R_SALINITY, in the training split.
bottle_df_train['R_SALINITY'].describe()

In [None]:
# Histogram and box plot of R_SALINITY in the training split.
hist_plot(bottle_df_train['R_SALINITY'])
box_plot(bottle_df_train['R_SALINITY'])

#### R_SIGMA

Reported Potential Density of water

In [None]:
# Missing R_SIGMA values: train count, then test count.
print(bottle_df_train['R_SIGMA'].isna().sum(), bottle_df_test['R_SIGMA'].isna().sum())

Even though this number might seem high, it is less than 1 percent.

In [None]:
# Histogram and box plot of R_SIGMA in the training split.
hist_plot(bottle_df_train['R_SIGMA'])
box_plot(bottle_df_train['R_SIGMA'])

#### R_SVA

Reported Specific Volume Anomaly

In [None]:
# Summary statistics of R_SVA in the training split.
bottle_df_train['R_SVA'].describe()

In [None]:
# Missing R_SVA values in the train and test splits, reported like the other features.
print(bottle_df_train['R_SVA'].isnull().sum(), bottle_df_test['R_SVA'].isnull().sum())

In [None]:
# Histogram and box plot of R_SVA in the training split.
hist_plot(bottle_df_train['R_SVA'])
box_plot(bottle_df_train['R_SVA'])

#### R_DYNHT

Reported Dynamic Height in units of dynamic meters (work per unit mass)

In [None]:
# Missing R_DYNHT values: train count, then test count.
print(bottle_df_train['R_DYNHT'].isna().sum(), bottle_df_test['R_DYNHT'].isna().sum())

The number of missing values is < 1%

In [None]:
# Histogram and box plot of R_DYNHT in the training split.
hist_plot(bottle_df_train['R_DYNHT'])
box_plot(bottle_df_train['R_DYNHT'])

#### R_O2

Here the multimodality is prominent. Also, some of the values are negative; replace them with NaN.

In [None]:
# Missing R_O2 values: train count, then test count.
print(bottle_df_train['R_O2'].isna().sum(), bottle_df_test['R_O2'].isna().sum())

In [None]:
# Shape of the rows with a negative R_O2 reading in each split, before cleaning.
print(bottle_df_train.loc[bottle_df_train['R_O2'] < 0].shape, bottle_df_test.loc[bottle_df_test['R_O2'] < 0].shape)

# Mark the negative readings as missing in both splits.
for split_df in (bottle_df_train, bottle_df_test):
    split_df.loc[split_df['R_O2'] < 0, 'R_O2'] = np.nan

# Same check after cleaning: no negative rows should remain.
print(bottle_df_train.loc[bottle_df_train['R_O2'] < 0].shape, bottle_df_test.loc[bottle_df_test['R_O2'] < 0].shape)

In [None]:
# Histogram and box plot of R_O2 in the training split.
hist_plot(bottle_df_train['R_O2'])
box_plot(bottle_df_train['R_O2'])

In [None]:
# Missing R_O2 values (train, test) once negative readings have been set to NaN.
print(bottle_df_train['R_O2'].isna().sum(), bottle_df_test['R_O2'].isna().sum())

#### R_O2Sat

Reported Oxygen Saturation

In [None]:
# Missing R_O2Sat values: train count, then test count.
print(bottle_df_train['R_O2Sat'].isna().sum(), bottle_df_test['R_O2Sat'].isna().sum())

In [None]:
# Histogram and box plot of R_O2Sat in the training split.
hist_plot(bottle_df_train['R_O2Sat'])
box_plot(bottle_df_train['R_O2Sat'])

#### R_SIO3

Reported Silicate Concentration

In [None]:
# Missing R_SIO3 values: train count, then test count.
print(bottle_df_train['R_SIO3'].isna().sum(), bottle_df_test['R_SIO3'].isna().sum())

In [None]:
# Histogram and box plot of R_SIO3 in the training split.
hist_plot(bottle_df_train['R_SIO3'])
box_plot(bottle_df_train['R_SIO3'])

#### R_PO4

Reported Phosphate Concentration

In [None]:
# Missing R_PO4 values: train count, then test count.
print(bottle_df_train['R_PO4'].isna().sum(), bottle_df_test['R_PO4'].isna().sum())

In [None]:
# Histogram and box plot of R_PO4 in the training split.
hist_plot(bottle_df_train['R_PO4'])
box_plot(bottle_df_train['R_PO4'])

#### R_NO3

Reported Nitrate Concentration

In [None]:
# Missing R_NO3 values: train count, then test count.
print(bottle_df_train['R_NO3'].isna().sum(), bottle_df_test['R_NO3'].isna().sum())

In [None]:
# Histogram and box plot of R_NO3 in the training split.
hist_plot(bottle_df_train['R_NO3'])
box_plot(bottle_df_train['R_NO3'])

#### R_NO2

Reported Nitrite Concentration

In [None]:
# Missing R_NO2 values: train count, then test count.
print(bottle_df_train['R_NO2'].isna().sum(), bottle_df_test['R_NO2'].isna().sum())

In [None]:
# Histogram and box plot of R_NO2 in the training split.
hist_plot(bottle_df_train['R_NO2'])
box_plot(bottle_df_train['R_NO2'])

#### R_NH4

Reported Ammonium Concentration

In [None]:
# Missing R_NH4 values: train count, then test count.
print(bottle_df_train['R_NH4'].isna().sum(), bottle_df_test['R_NH4'].isna().sum())

In [None]:
# Histogram and box plot of R_NH4 in the training split.
hist_plot(bottle_df_train['R_NH4'])
box_plot(bottle_df_train['R_NH4'])

#### R_CHLA

Reported Chlorophyll-a

In [None]:
# Missing R_CHLA values: train count, then test count.
print(bottle_df_train['R_CHLA'].isna().sum(), bottle_df_test['R_CHLA'].isna().sum())

In [None]:
# Histogram and box plot of R_CHLA in the training split.
hist_plot(bottle_df_train['R_CHLA'])
box_plot(bottle_df_train['R_CHLA'])

#### R_PHAEO

Reported Phaeophytin

Some of the values are negative. Replace them with NaN.

In [None]:
# Shape of the rows with a negative R_PHAEO reading in each split, before cleaning.
print(bottle_df_train.loc[bottle_df_train['R_PHAEO'] < 0].shape, bottle_df_test.loc[bottle_df_test['R_PHAEO'] < 0].shape)

# Mark the negative readings as missing in both splits.
for split_df in (bottle_df_train, bottle_df_test):
    split_df.loc[split_df['R_PHAEO'] < 0, 'R_PHAEO'] = np.nan

# Same check after cleaning: no negative rows should remain.
print(bottle_df_train.loc[bottle_df_train['R_PHAEO'] < 0].shape, bottle_df_test.loc[bottle_df_test['R_PHAEO'] < 0].shape)

In [None]:
# Missing R_PHAEO values (train, test) once negative readings have been set to NaN.
print(bottle_df_train['R_PHAEO'].isna().sum(), bottle_df_test['R_PHAEO'].isna().sum())

In [None]:
# Histogram and box plot of R_PHAEO in the training split.
hist_plot(bottle_df_train['R_PHAEO'])
box_plot(bottle_df_train['R_PHAEO'])

#### Lat_Dec
Latitude

In [None]:
# Missing Lat_Dec values: train count, then test count.
print(bottle_df_train['Lat_Dec'].isna().sum(), bottle_df_test['Lat_Dec'].isna().sum())

In [None]:
# Histogram and box plot of Lat_Dec in the training split.
hist_plot(bottle_df_train['Lat_Dec'])
box_plot(bottle_df_train['Lat_Dec'])

#### Lon_Dec
Longitude

In [None]:
# Missing Lon_Dec values: train count, then test count.
print(bottle_df_train['Lon_Dec'].isna().sum(), bottle_df_test['Lon_Dec'].isna().sum())

In [None]:
# Histogram and box plot of Lon_Dec in the training split.
hist_plot(bottle_df_train['Lon_Dec'])
box_plot(bottle_df_train['Lon_Dec'])

## Save the data

Save train and test split in separate files.

In [None]:
# Write the train and test splits (with negative R_O2 / R_PHAEO readings set to NaN) to CSV.
# Plain string literals: the paths contain no placeholders, so no f-string is needed.
bottle_df_train.to_csv('../data/train.csv')
bottle_df_test.to_csv('../data/test.csv')