## Train-test split

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Let pandas display up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)

# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed as `random_state` to the train/test split further down.
random_seed = 2024
np.random.seed(random_seed)

In [None]:
# Load the raw dataset; the first CSV column is used as the row index.
# (Plain string literal: the path has no placeholders, so no f-string is needed.)
df = pd.read_csv('../data/dataset.csv', index_col=0)

In [None]:
df.info()

Some of the feature values are negative even though they should be non-negative. We'll replace these negative values with NaN.

In [None]:
df.min()

Perform a random train-test split of the data. R_SALINITY is the target feature.

In [None]:
# Later cells write into these frames in place (negative values -> NaN via
# `.loc`), so take explicit copies: the splits are then fully independent of
# `df`, and the in-place writes cannot touch, or trigger copy warnings about,
# the original frame.
train_df, test_df = (
    part.copy()
    for part in train_test_split(df, test_size=0.2, random_state=random_seed)
)

# Sanity check: the two parts together must account for every row of `df`.
assert len(train_df) + len(test_df) == len(df)

print(train_df.shape, test_df.shape)

## Exploring the data

Let's explore the distribution of the features, and investigate the missing values.

In [None]:
def hist_plot(dataset, f_size=(10, 5)):
    """Draw a 50-bin histogram of ``dataset`` on a new figure.

    Parameters
    ----------
    dataset
        Data forwarded to ``seaborn.histplot`` (the notebook passes single
        DataFrame columns).
    f_size : tuple, default (10, 5)
        Figure size in inches, forwarded to matplotlib as ``figsize``.
    """
    fig, ax = plt.subplots(figsize=f_size)
    sns.histplot(data=dataset, kde=False, bins=50, ax=ax)
    # Render the figure now. A bare `plt.plot()` draws nothing, whereas
    # `plt.show()` displays the finished figure, so several plots produced
    # in one cell appear in call order.
    plt.show()

def box_plot(dataset, f_size=(10, 5)):
    """Draw a box plot of ``dataset`` on a new figure.

    Parameters
    ----------
    dataset
        Object exposing a ``.values`` array (the notebook passes single
        DataFrame columns); the raw array is what gets plotted.
    f_size : tuple, default (10, 5)
        Figure size in inches, forwarded to matplotlib as ``figsize``.
    """
    fig, ax = plt.subplots(figsize=f_size)
    sns.boxplot(data=dataset.values, ax=ax)
    # Render the figure now. A bare `plt.plot()` draws nothing, whereas
    # `plt.show()` displays the finished figure, so several plots produced
    # in one cell appear in call order.
    plt.show()

In [None]:
def missing_values_table(df):
    """Summarise the columns of ``df`` that contain missing values.

    Also prints the total number of columns in ``df`` and how many of them
    contain at least one missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that has at least one missing value,
        indexed by column name, with the columns ``'Missing Values'`` (count)
        and ``'% of Total Values'`` (share of rows, rounded to one decimal),
        sorted by percentage in descending order.
    """
    # Count the missing values once and reuse the result for the percentage.
    mis_val = df.isnull().sum()

    # Percentage of missing values. A zero-row frame would give 0/0 = NaN,
    # which is why the filter below is based on the count, not the percentage.
    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    # Make a table with the results, naming the two columns directly.
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Keep only columns that actually have missing values and sort them by
    # percentage of missing, descending.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

def replace_neg_with_nan(col):
    """Set negative entries of column ``col`` to NaN in both splits, in place.

    Operates on the notebook-level ``train_df`` and ``test_df`` frames. The
    shapes of the row subsets holding negative values are printed once before
    and once after the replacement (train first, then test), so the second
    line acts as a check that no negative values remain.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    """
    def _negative_subset_shapes():
        # Shape of the rows whose `col` value is below zero, per split.
        return [part[part[col] < 0].shape for part in (train_df, test_df)]

    print(*_negative_subset_shapes())

    for part in (train_df, test_df):
        negative_rows = part[col] < 0
        part.loc[negative_rows, col] = np.nan

    print(*_negative_subset_shapes())

#### R_Depth

Reported Depth in meters

In [None]:
train_df['R_Depth'].isnull().sum()

In [None]:
hist_plot(train_df['R_Depth'])
box_plot(train_df['R_Depth'])


#### R_TEMP

Reported Temperature

In [None]:
hist_plot(train_df['R_TEMP'])
box_plot(train_df['R_TEMP'])

In [None]:
print(train_df['R_TEMP'].isnull().sum(), test_df['R_TEMP'].isnull().sum())

In [None]:
train_df['R_TEMP'].describe()

In [None]:
hist_plot(train_df['R_TEMP'])

#### R_SALINITY - target feature

Reported Salinity

In [None]:
train_df['R_SALINITY'].describe()

In [None]:
hist_plot(train_df['R_SALINITY'])
box_plot(train_df['R_SALINITY'])

#### R_SIGMA

Reported Potential Density of water

In [None]:
print(train_df['R_SIGMA'].isnull().sum(), test_df['R_SIGMA'].isnull().sum())

Even though this number might seem high, it is less than 1 percent.

In [None]:
hist_plot(train_df['R_SIGMA'])
box_plot(train_df['R_SIGMA'])

#### R_SVA

Reported Specific Volume Anomaly

In [None]:
train_df['R_SVA'].describe()

In [None]:
train_df['R_SVA'].isnull().sum()

In [None]:
hist_plot(train_df['R_SVA'])
box_plot(train_df['R_SVA'])

#### R_DYNHT

Reported Dynamic Height in units of dynamic meters (work per unit mass)

In [None]:
print(train_df['R_DYNHT'].isnull().sum(), test_df['R_DYNHT'].isnull().sum())

Fewer than 1% of the values are missing.

In [None]:
hist_plot(train_df['R_DYNHT'])
box_plot(train_df['R_DYNHT'])

#### R_O2

Here the multimodality is prominent. Also, some of the values are negative; replace them with NaN.

In [None]:
print(train_df['R_O2'].isnull().sum(), test_df['R_O2'].isnull().sum())

In [None]:
replace_neg_with_nan('R_O2')

In [None]:
hist_plot(train_df['R_O2'])
box_plot(train_df['R_O2'])

In [None]:
print(train_df['R_O2'].isnull().sum(), test_df['R_O2'].isnull().sum())

#### R_O2Sat

Reported Oxygen Saturation

Here, too, some values are negative. Replace them with NaN.

In [None]:
print(train_df['R_O2Sat'].isnull().sum(), test_df['R_O2Sat'].isnull().sum())

In [None]:
replace_neg_with_nan('R_O2Sat')

In [None]:
hist_plot(train_df['R_O2Sat'])
box_plot(train_df['R_O2Sat'])

#### R_SIO3

Reported Silicate Concentration

In [None]:
print(train_df['R_SIO3'].isnull().sum(), test_df['R_SIO3'].isnull().sum())

In [None]:
hist_plot(train_df['R_SIO3'])
box_plot(train_df['R_SIO3'])

#### R_PO4

Reported Phosphate Concentration

In [None]:
print(train_df['R_PO4'].isnull().sum(), test_df['R_PO4'].isnull().sum())

In [None]:
hist_plot(train_df['R_PO4'])
box_plot(train_df['R_PO4'])

#### R_NO3

Reported Nitrate Concentration

In [None]:
print(train_df['R_NO3'].isnull().sum(), test_df['R_NO3'].isnull().sum())

In [None]:
replace_neg_with_nan('R_NO3')

In [None]:
hist_plot(train_df['R_NO3'])
box_plot(train_df['R_NO3'])

#### R_NO2

Reported Nitrite Concentration

In [None]:
print(train_df['R_NO2'].isnull().sum(), test_df['R_NO2'].isnull().sum())

In [None]:
hist_plot(train_df['R_NO2'])
box_plot(train_df['R_NO2'])

#### R_NH4

Reported Ammonium Concentration

In [None]:
print(train_df['R_NH4'].isnull().sum(), test_df['R_NH4'].isnull().sum())

In [None]:
hist_plot(train_df['R_NH4'])
box_plot(train_df['R_NH4'])

#### R_CHLA

Reported Chlorophyll-a

In [None]:
print(train_df['R_CHLA'].isnull().sum(), test_df['R_CHLA'].isnull().sum())

In [None]:
replace_neg_with_nan('R_CHLA')

In [None]:
hist_plot(train_df['R_CHLA'])
box_plot(train_df['R_CHLA'])

#### R_PHAEO

Reported Phaeophytin

Some of the values are negative. Replace them with NaN.

In [None]:
print(train_df['R_PHAEO'].isnull().sum(), test_df['R_PHAEO'].isnull().sum())

In [None]:
replace_neg_with_nan('R_PHAEO')

In [None]:
hist_plot(train_df['R_PHAEO'])
box_plot(train_df['R_PHAEO'])

#### Lat_Dec
Latitude

In [None]:
print(train_df['Lat_Dec'].isnull().sum(), test_df['Lat_Dec'].isnull().sum())

In [None]:
hist_plot(train_df['Lat_Dec'])
box_plot(train_df['Lat_Dec'])

#### Lon_Dec
Longitude

In [None]:
print(train_df['Lon_Dec'].isnull().sum(), test_df['Lon_Dec'].isnull().sum())

In [None]:
hist_plot(train_df['Lon_Dec'])
box_plot(train_df['Lon_Dec'])

Make sure that there are now no inappropriate negative values.

In [None]:
train_df.min()

In [None]:
test_df.min()

## Save the data

Save train and test split in separate files.

In [None]:
# Persist the two splits as separate CSV files (index included, so it can be
# restored with `index_col=0` on reload). Plain string literals: the paths
# contain no placeholders, so no f-string is needed.
train_df.to_csv('../data/barely_processed_train.csv')
test_df.to_csv('../data/barely_processed_test.csv')