In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Single seed reused below as the IterativeImputer's random_state.
random_seed = 2024
# Also seed NumPy's global random state with the same value.
np.random.seed(random_seed)

In [None]:
# Load the training split; the first CSV column is used as the DataFrame index.
initial_train_df = pd.read_csv(
    '../data/barely_processed_train.csv',
    index_col=0,
)

In [None]:
# Print a per-column summary (dtype, non-null count) to see how much is missing.
initial_train_df.info()

## Feature Imputation

We see that most of the columns are missing at least some values, and most of those are missing a large share of them, e.g. R_O2, R_O2Sat, R_SIO3, R_PO4, R_NO3, R_NO2, R_NH4, R_CHLA and R_PHAEO. Feature imputation is therefore required before modelling.

Let's try the multivariate feature imputation from scikit-learn.

We also constrain the imputed values: most columns must be non-negative, and the coordinates must stay within valid latitude and longitude ranges.

In [None]:
# Lower bound allowed for imputed values, keyed by column name.
# Only temperature is unbounded below; coordinates are limited to valid
# latitude / longitude ranges and every other column must be non-negative.
min_values_for_imputation = dict(
    R_Depth=0,
    R_TEMP=-np.inf,
    R_SALINITY=0,
    R_SIGMA=0,
    R_SVA=0,
    R_DYNHT=0,
    R_O2=0,
    R_O2Sat=0,
    R_SIO3=0,
    R_PO4=0,
    R_NO3=0,
    R_NO2=0,
    R_NH4=0,
    R_CHLA=0,
    R_PHAEO=0,
    R_PRES=0,
    Lat_Dec=-90,
    Lon_Dec=-180,
)

# Upper bound allowed for imputed values: same keys in the same order as the
# lower bounds. Everything is unbounded above except the coordinates.
max_values_for_imputation = dict.fromkeys(min_values_for_imputation, np.inf)
max_values_for_imputation.update(Lat_Dec=90, Lon_Dec=180)

In [None]:
# IterativeImputer applies min_value / max_value positionally, one entry per
# feature column. Look the bounds up by column name so they follow the actual
# column order of the training frame instead of relying on the bound dicts
# happening to be written in the same order as the CSV columns. A column with
# no configured bound fails loudly with a KeyError naming that column.
imputer = IterativeImputer(
    random_state=random_seed,
    min_value=[min_values_for_imputation[col] for col in initial_train_df.columns],
    max_value=[max_values_for_imputation[col] for col in initial_train_df.columns],
)

# Fit on the training data and fill its missing values; the result is a NumPy
# array, which is wrapped back into a DataFrame in the next cell.
imputed_train = imputer.fit_transform(initial_train_df)

In [None]:
# Wrap the imputed values in a DataFrame, restoring the original row index and
# column labels, then print the per-column summary.
train_df = pd.DataFrame(
    data=imputed_train,
    index=initial_train_df.index,
    columns=initial_train_df.columns,
)
train_df.info()

All the values have been imputed. Let's analyze the resulting distributions of features after the imputation.

In [None]:
# One row per feature: histogram before imputation (left) and after (right).
fig, axes = plt.subplots(
    nrows=train_df.shape[1],
    ncols=2,
    figsize=(25, 45),
    layout='constrained',
)
for feature, (initial_ax, imputed_ax) in zip(train_df.columns, axes):
    sns.histplot(initial_train_df[feature], ax=initial_ax, bins=50)
    initial_ax.set(title=f'Initial: {feature}', xlabel="")
    sns.histplot(train_df[feature], ax=imputed_ax, bins=50)
    imputed_ax.set(title=f'Imputed: {feature}', xlabel="")
plt.show()

It seems the shapes of the PDFs are mostly preserved, except for R_O2 and R_O2Sat, where a mean imputation is visible.

In [None]:
# One row per feature: the feature against R_SALINITY before imputation (left)
# and after imputation (right).
fig, axes = plt.subplots(
    nrows=train_df.shape[1],
    ncols=2,
    figsize=(25, 45),
    layout='constrained',
)
for feature, (initial_ax, imputed_ax) in zip(train_df.columns, axes):
    sns.scatterplot(x=initial_train_df[feature], y=initial_train_df['R_SALINITY'], ax=initial_ax)
    initial_ax.set(title=f'Initial: {feature}', xlabel="")
    sns.scatterplot(x=train_df[feature], y=train_df['R_SALINITY'], ax=imputed_ax)
    imputed_ax.set(title=f'Imputed: {feature}', xlabel="")
plt.show()

## Test Dataset

Apply the imputer fitted on the training dataset to the test dataset.

In [None]:
# Load the test split; the first CSV column is used as the DataFrame index.
initial_test_df = pd.read_csv(
    '../data/barely_processed_test.csv',
    index_col=0,
)

In [None]:
# Print a per-column summary (dtype, non-null count) of the raw test data.
initial_test_df.info()

In [None]:
# Reuse the imputer already fitted on the training data; transform() is called
# here rather than fit_transform(), so the imputer is not refitted on the test set.
imputed_test = imputer.transform(initial_test_df)

In [None]:
# Wrap the imputed values in a DataFrame, restoring the original row index and
# column labels, then print the per-column summary.
test_df = pd.DataFrame(
    data=imputed_test,
    index=initial_test_df.index,
    columns=initial_test_df.columns,
)
test_df.info()

#### The distributions of test imputations

In [None]:
# One row per feature: histogram before imputation (left) and after (right).
fig, axes = plt.subplots(
    nrows=test_df.shape[1],
    ncols=2,
    figsize=(25, 45),
    layout='constrained',
)
for feature, (initial_ax, imputed_ax) in zip(test_df.columns, axes):
    sns.histplot(initial_test_df[feature], ax=initial_ax, bins=50)
    initial_ax.set(title=f'Initial: {feature}', xlabel="")
    sns.histplot(test_df[feature], ax=imputed_ax, bins=50)
    imputed_ax.set(title=f'Imputed: {feature}', xlabel="")
plt.show()

The imputation mostly preserves the shapes of the PDFs.

In [None]:
# One row per test feature: the feature against R_SALINITY before imputation
# (left) and after imputation (right).
fig, axes = plt.subplots(nrows=test_df.shape[1], ncols=2, figsize=(25, 45), layout='constrained')
# Iterate over the test frame's own columns (not the training frame's), so the
# loop matches the frame that sized the grid and that is being plotted.
for col, ax in zip(test_df.columns, axes):
    sns.scatterplot(x=initial_test_df[col], y=initial_test_df['R_SALINITY'], ax=ax[0]).set(title=f'Initial: {col}', xlabel="")
    sns.scatterplot(x=test_df[col], y=test_df['R_SALINITY'], ax=ax[1]).set(title=f'Imputed: {col}', xlabel="")
plt.show()

## Saving the data

Save the imputed datasets and the fitted imputer itself for future use.

In [None]:
# Write both imputed frames, index included, next to the input data.
train_df.to_csv('../data/post_impute_train.csv')
test_df.to_csv('../data/post_impute_test.csv')

In [None]:
# Persist the fitted imputer to disk with joblib.
# NOTE(review): joblib files are pickle-based; only load this file from a trusted source.
joblib.dump(imputer, '../data/imputer.joblib')