In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from helpers import scatter_plot


# Fixed seed for NumPy's global RNG so reruns of the notebook are repeatable.
random_seed = 2024
np.random.seed(seed=random_seed)

In [None]:
# Load the post-imputation train/test CSVs; the first CSV column becomes the index.
# (Plain string literals: the paths contain no placeholders, so no f-string is needed.)
train_df = pd.read_csv('../data/post_impute_train.csv', index_col=0)
test_df = pd.read_csv('../data/post_impute_test.csv', index_col=0)

# Column plotted on the y-axis of every scatter plot below.
target_var = 'R_SALINITY'

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

## Feature engineering

Here I'll apply feature engineering techniques to selected variables with skewed distributions. Variables whose distributions are already roughly normal are left unchanged. Categorical variables were already created before imputation.

#### R_O2

Let's take the square root of `R_O2` and store it as a new feature, `R_O2_sqrt`.

In [None]:
# Raw R_O2 against the target, using the project's scatter_plot helper.
scatter_plot(train_df['R_O2'], train_df[target_var])

In [None]:
# Add the square-root-transformed column to both splits so their columns stay aligned.
for frame in (train_df, test_df):
    frame['R_O2_sqrt'] = np.sqrt(frame['R_O2'])

In [None]:
# Square-root-transformed R_O2 against the target, using the project's scatter_plot helper.
scatter_plot(train_df['R_O2_sqrt'], train_df[target_var])

#### Lat_Dec, Lon_Dec

Let's convert them to polar coordinates, treating longitude as x and latitude as y, and add the results as two new features: `Rho` (radius) and `Phi` (angle).

In [None]:
def cartesian_to_polar(x, y):
    """Convert Cartesian coordinates to polar coordinates.

    The computation uses only arithmetic and NumPy ufuncs, so it works
    elementwise on scalars, NumPy arrays and pandas Series alike.

    Parameters
    ----------
    x, y : scalar or array-like supporting ``*`` and ``+``
        Cartesian coordinates.

    Returns
    -------
    tuple
        ``(rho, phi)`` with ``rho = sqrt(x*x + y*y)`` and
        ``phi = arctan2(y, x)`` in radians.
    """
    rho = np.sqrt(x*x + y*y)
    phi = np.arctan2(y, x)

    return (rho, phi)


def v_cartesian_to_polar(x, y):
    """Array version of ``cartesian_to_polar``.

    Replaces the former ``np.vectorize`` wrapper: the underlying function is
    already elementwise, so a per-element Python loop is unnecessary, and
    ``np.vectorize`` raises on size-0 input unless ``otypes`` is set.

    Parameters
    ----------
    x, y : array-like of numbers
        Cartesian coordinates; coerced to float NumPy arrays.

    Returns
    -------
    tuple of numpy.ndarray
        ``(rho, phi)`` arrays, as returned by ``cartesian_to_polar``.
    """
    # Coerce to float ndarrays so the result is a pair of plain arrays
    # regardless of whether lists, arrays or Series were passed in.
    return cartesian_to_polar(np.asarray(x, dtype=float), np.asarray(y, dtype=float))

In [None]:
# Side-by-side scatter plots of the raw coordinates against the target.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), layout='constrained')
for ax, column in zip(axes, ('Lon_Dec', 'Lat_Dec')):
    sns.scatterplot(x=train_df[column], y=train_df[target_var], ax=ax)

plt.show()

In [None]:
# Derive the polar features for both splits, with longitude as x and latitude as y.
for frame in (train_df, test_df):
    frame['Rho'], frame['Phi'] = v_cartesian_to_polar(frame['Lon_Dec'], frame['Lat_Dec'])

In [None]:
# Side-by-side scatter plots of the polar features against the target.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), layout='constrained')
for ax, column in zip(axes, ('Rho', 'Phi')):
    sns.scatterplot(x=train_df[column], y=train_df[target_var], ax=ax)

plt.show()

In [None]:
# Sanity checks before saving: shapes, matching column order, unique column names.
print(train_df.shape, test_df.shape)
# Index.equals returns False when the column counts differ, whereas an
# elementwise `==` between Indexes of different lengths raises ValueError.
print(train_df.columns.equals(test_df.columns))
# True only when no column name is duplicated.
print(train_df.columns.is_unique)

## Save the data

In [None]:
# Write both feature-engineered frames to CSV (to_csv includes the index by default).
for frame, destination in (
    (train_df, '../data/post_fe_train.csv'),
    (test_df, '../data/post_fe_test.csv'),
):
    frame.to_csv(destination)