In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn import tree


# One seed for the whole notebook; it is pushed into NumPy's global RNG.
random_seed = 2024
np.random.seed(seed=random_seed)

In [None]:
def hist_plot(dataset, f_size=(10, 5)):
    """Draw a 50-bin histogram of ``dataset`` on a new figure and display it.

    Parameters
    ----------
    dataset : data accepted by ``seaborn.histplot`` as ``data``
        Values to plot.
    f_size : tuple, default (10, 5)
        Figure size passed to ``plt.figure(figsize=...)``.
    """
    plt.figure(figsize=f_size)
    sns.histplot(data=dataset, kde=False, bins=50)
    # plt.plot() with no arguments draws nothing; plt.show() is the call that
    # renders the figure (as hist_plot_compare already does).
    plt.show()

def scatter_plot(x, y):
    """Draw a scatter plot of ``y`` against ``x`` on a new 5x3 figure and display it.

    Parameters
    ----------
    x, y : vectors accepted by ``seaborn.scatterplot``
        Horizontal and vertical coordinates of the points.
    """
    plt.figure(figsize=(5, 3))
    sns.scatterplot(x=x, y=y)
    # plt.plot() with no arguments draws nothing; plt.show() is the call that
    # renders the figure (as hist_plot_compare already does).
    plt.show()

def hist_plot_compare(x1, x2, x1_title, x2_title):
    """Show 50-bin histograms of ``x1`` and ``x2`` side by side.

    Parameters
    ----------
    x1, x2 : data accepted by ``seaborn.histplot``
        Values for the left and right panel respectively.
    x1_title, x2_title : str
        Titles of the left and right panel; the x-axis labels are blanked.
    """
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(25, 45), layout='constrained')

    panels = ((x1, x1_title), (x2, x2_title))
    for panel_ax, (values, heading) in zip(axes, panels):
        drawn = sns.histplot(values, ax=panel_ax, bins=50)
        drawn.set(title=heading, xlabel="")

    plt.show()

In [None]:
def get_splits(tree_clf):
    """Return sorted bin edges built from a fitted tree's split thresholds.

    Parameters
    ----------
    tree_clf : fitted scikit-learn decision tree
        Only ``tree_clf.tree_.threshold`` and ``tree_clf.tree_.children_left``
        are read.

    Returns
    -------
    list of float
        The thresholds of all split nodes in ascending order, with ``-inf``
        as the first element and ``inf`` as the last.
    """
    fitted_tree = tree_clf.tree_
    thresholds = np.asarray(fitted_tree.threshold)

    # Leaves are the nodes without a left child (children_left == -1). Masking
    # on that, instead of comparing thresholds to -2, keeps a genuine split
    # that happens to sit at -2.0.
    is_split_node = np.asarray(fitted_tree.children_left) != -1

    # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
    return sorted([-np.inf, *thresholds[is_split_node].tolist(), np.inf])

def create_cut_and_dummies(dataset, col, splits, include_lowest=False):
    """Bin ``dataset[col]`` at ``splits`` and one-hot encode the bins.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; it is not modified.
    col : str
        Name of the numeric column to discretize.
    splits : sequence of scalars
        Bin edges handed to ``pd.cut``.
    include_lowest : bool, default False
        Forwarded to ``pd.cut``. With the default, values equal to the lowest
        edge fall outside every bin and get all-zero dummy columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus one ``<col>_cat_<interval>``
        dummy column per bin.
    """
    col_cat = f'{col}_cat'

    binned = pd.cut(dataset[col], splits, include_lowest=include_lowest)
    # assign() works on a copy, so the caller's frame does not silently gain
    # the intermediate '<col>_cat' column.
    with_bins = dataset.assign(**{col_cat: binned})
    return pd.get_dummies(with_bins, columns=[col_cat], prefix=[col_cat])


In [None]:
# Name of the column the engineered features are evaluated against.
target_var = 'R_SALINITY'

# Post-imputation splits; the first CSV column is used as the index.
train_df = pd.read_csv('../data/post_impute_train.csv', index_col=0)
test_df = pd.read_csv('../data/post_impute_test.csv', index_col=0)

In [None]:
# Summary of the training frame as loaded from post_impute_train.csv.
train_df.info()

#### Feature engineering

Here I'll apply feature engineering techniques to the selected variables with skewed distributions. I'll keep variables with "somewhat" normal distributions intact for now.

#### R_Depth

Let's try discretizing it: fit a decision tree regressor on this single feature and use the tree's split thresholds as bin edges.

In [None]:
# R_Depth against the target variable.
scatter_plot(x=train_df['R_Depth'], y=train_df[target_var])

In [None]:
# Depth-3 regression tree on R_Depth alone; features and target are passed
# as single-column 2-D arrays.
depth_clf = tree.DecisionTreeRegressor(max_depth=3).fit(
    train_df[['R_Depth']].to_numpy(),
    train_df[[target_var]].to_numpy(),
)

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(depth_clf, fontsize=8)
plt.show()

Here the decision tree regressor is able to separate surface water from the rest and to group the remaining depths into ranges by their "shallowness", based on the target variable. Let's create the corresponding features.

In [None]:
# Bin edges from the depth tree, with the leading -inf edge replaced by 0.
# NOTE(review): create_cut_and_dummies calls pd.cut with its defaults, so a row
# exactly equal to 0 would fall in no bin - confirm R_Depth has no zeros.
splits = [0, *get_splits(depth_clf)[1:]]
splits

In [None]:
# Add the R_Depth bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_Depth', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_O2

Let's take a square root.

In [None]:
# R_O2 against the target variable, before the transform.
scatter_plot(x=train_df['R_O2'], y=train_df[target_var])

In [None]:
# Square-root transform of R_O2, added as a new column to both splits.
train_df['R_O2_sqrt'], test_df['R_O2_sqrt'] = (
    np.sqrt(frame['R_O2']) for frame in (train_df, test_df)
)

In [None]:
# Transformed R_O2 against the target variable.
scatter_plot(x=train_df['R_O2_sqrt'], y=train_df[target_var])

The following features were missing quite a lot of values before imputation. Perhaps it will be easier for the models if these variables also have a categorical (binned) representation.

#### R_SIO3

Let's categorize this feature. We can see the effects of imputation here.

In [None]:
# R_SIO3 against the target variable.
scatter_plot(x=train_df['R_SIO3'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_SIO3 alone; features and target are passed
# as single-column 2-D arrays.
sio3_clf = tree.DecisionTreeRegressor(max_depth=2).fit(
    train_df[['R_SIO3']].to_numpy(),
    train_df[[target_var]].to_numpy(),
)

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(sio3_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the SIO3 tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_SIO3 has no zeros.
splits = [0, *get_splits(sio3_clf)[1:]]
splits

In [None]:
# Add the R_SIO3 bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_SIO3', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_PO4

Again, let's discretize it with a decision tree regressor.

In [None]:
# R_PO4 against the target variable.
scatter_plot(x=train_df['R_PO4'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_PO4 alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave po4_clf as a
# second name for the re-fitted SIO3 tree.
po4_clf = tree.DecisionTreeRegressor(max_depth=2)
po4_clf = po4_clf.fit(np.array(train_df['R_PO4']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(po4_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the PO4 tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_PO4 has no zeros.
splits = [0, *get_splits(po4_clf)[1:]]
splits

In [None]:
# Add the R_PO4 bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_PO4', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_NO3

Let's classify it with a tree.

In [None]:
# R_NO3 against the target variable.
scatter_plot(x=train_df['R_NO3'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_NO3 alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave no3_clf as a
# second name for the re-fitted SIO3 tree.
no3_clf = tree.DecisionTreeRegressor(max_depth=2)
no3_clf = no3_clf.fit(np.array(train_df['R_NO3']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(no3_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the NO3 tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_NO3 has no zeros.
splits = [0, *get_splits(no3_clf)[1:]]
splits

In [None]:
# Add the R_NO3 bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_NO3', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_NO2

Again, let's classify it with a tree.

In [None]:
# R_NO2 against the target variable.
scatter_plot(x=train_df['R_NO2'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_NO2 alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave no2_clf as a
# second name for the re-fitted SIO3 tree.
no2_clf = tree.DecisionTreeRegressor(max_depth=2)
no2_clf = no2_clf.fit(np.array(train_df['R_NO2']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(no2_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the NO2 tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_NO2 has no zeros.
splits = [0, *get_splits(no2_clf)[1:]]
splits

In [None]:
# Add the R_NO2 bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_NO2', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_NH4

Again, let's classify it with a tree.

In [None]:
# R_NH4 against the target variable.
scatter_plot(x=train_df['R_NH4'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_NH4 alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave nh4_clf as a
# second name for the re-fitted SIO3 tree.
nh4_clf = tree.DecisionTreeRegressor(max_depth=2)
nh4_clf = nh4_clf.fit(np.array(train_df['R_NH4']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(nh4_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the NH4 tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_NH4 has no zeros.
splits = [0, *get_splits(nh4_clf)[1:]]
splits

In [None]:
# Add the R_NH4 bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_NH4', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_CHLA

Again, let's classify it with a tree.

In [None]:
# R_CHLA against the target variable.
scatter_plot(x=train_df['R_CHLA'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_CHLA alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave chla_clf as a
# second name for the re-fitted SIO3 tree.
chla_clf = tree.DecisionTreeRegressor(max_depth=2)
chla_clf = chla_clf.fit(np.array(train_df['R_CHLA']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(chla_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the CHLA tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_CHLA has no zeros.
splits = [0, *get_splits(chla_clf)[1:]]
splits

In [None]:
# Add the R_CHLA bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_CHLA', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_PHAEO

Again, let's classify it with a tree.

In [None]:
# R_PHAEO against the target variable.
scatter_plot(x=train_df['R_PHAEO'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_PHAEO alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave phaeo_clf as a
# second name for the re-fitted SIO3 tree.
phaeo_clf = tree.DecisionTreeRegressor(max_depth=2)
phaeo_clf = phaeo_clf.fit(np.array(train_df['R_PHAEO']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(phaeo_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the PHAEO tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_PHAEO has no zeros.
splits = [0, *get_splits(phaeo_clf)[1:]]
splits

In [None]:
# Add the R_PHAEO bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_PHAEO', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### R_PRES

Again, let's classify it with a tree.

In [None]:
# R_PRES against the target variable.
scatter_plot(x=train_df['R_PRES'], y=train_df[target_var])

In [None]:
# Depth-2 regression tree on R_PRES alone. Fit the estimator created here:
# calling sio3_clf.fit(...) instead would discard it and leave pres_clf as a
# second name for the re-fitted SIO3 tree.
pres_clf = tree.DecisionTreeRegressor(max_depth=2)
pres_clf = pres_clf.fit(np.array(train_df['R_PRES']).reshape(-1, 1), np.array(train_df[target_var]).reshape(-1, 1))

# Draw the fitted tree on a fresh square figure.
plt.figure(figsize=(12, 12))
tree.plot_tree(pres_clf, fontsize=8)
plt.show()

In [None]:
# Bin edges from the PRES tree, with the leading -inf edge replaced by 0.
# NOTE(review): pd.cut defaults leave a row exactly equal to 0 in no bin -
# confirm R_PRES has no zeros.
splits = [0, *get_splits(pres_clf)[1:]]
splits

In [None]:
# Add the R_PRES bin dummies to both splits using the same edges.
train_df, test_df = (
    create_cut_and_dummies(frame, 'R_PRES', splits)
    for frame in (train_df, test_df)
)

# Sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

#### Lat_Dec, Lon_Dec

Let's convert them to polar coordinates (treating longitude as x and latitude as y) and add the result as two new features, `Rho` and `Phi`.

In [None]:
def cartesian_to_polar(x, y):
    """Convert the Cartesian pair ``(x, y)`` to polar coordinates.

    Returns
    -------
    tuple
        ``(rho, phi)``: the distance from the origin and the angle returned by
        ``np.arctan2(y, x)``, in radians.
    """
    squared_radius = x * x + y * y
    return (np.sqrt(squared_radius), np.arctan2(y, x))

# Element-wise wrapper that applies cartesian_to_polar across array inputs.
v_cartesian_to_polar = np.vectorize(cartesian_to_polar)

In [None]:
# Longitude (left) and latitude (right) against the target variable.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), layout='constrained')
sns.scatterplot(data=train_df, x='Lon_Dec', y=target_var, ax=axes[0])
sns.scatterplot(data=train_df, x='Lat_Dec', y=target_var, ax=axes[1])

plt.show()

In [None]:
# Polar features, with Lon_Dec as x and Lat_Dec as y, for both splits.
train_df['Rho'], train_df['Phi'] = v_cartesian_to_polar(
    train_df['Lon_Dec'].to_numpy(), train_df['Lat_Dec'].to_numpy()
)
test_df['Rho'], test_df['Phi'] = v_cartesian_to_polar(
    test_df['Lon_Dec'].to_numpy(), test_df['Lat_Dec'].to_numpy()
)

In [None]:
# Rho (left) and Phi (right) against the target variable.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5), layout='constrained')
sns.scatterplot(data=train_df, x='Rho', y=target_var, ax=axes[0])
sns.scatterplot(data=train_df, x='Phi', y=target_var, ax=axes[1])

plt.show()

In [None]:
# Final sanity check: shapes, and whether both frames carry identical columns.
print(train_df.shape, test_df.shape)
print((train_df.columns == test_df.columns).all())

## Save the data

In [None]:
# Write the engineered splits next to the imputed inputs (index included).
train_df.to_csv(path_or_buf='../data/post_fe_train.csv')
test_df.to_csv(path_or_buf='../data/post_fe_test.csv')