In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error, r2_score
from joblib import dump
import helpers

# Seed NumPy's global RNG so any later use of np.random is repeatable between runs.
random_seed = 2024
np.random.seed(random_seed)

# Let pandas show up to 500 columns when displaying wide frames.
pd.set_option("display.max_columns", 500)

In [None]:
# Load the train and test splits from CSV, using the first column of each file as the index.
# (The split into features and target happens in the next cell.)
train = pd.read_csv('../data/post_fs_train.csv', index_col=0)
test = pd.read_csv('../data/post_fs_test.csv', index_col=0)

In [None]:
# Column to predict; every other column of the frame is treated as a feature.
target_var = 'R_SALINITY'

# Features are the frame minus the target column; the target is that column on its own.
x_train, y_train = train.drop(columns=target_var), train[target_var]
x_test, y_test = test.drop(columns=target_var), test[target_var]

In [None]:
def cv_validation(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and print its mean MSE and mean R^2.

    Parameters
    ----------
    model : estimator
        Estimator handed as-is to ``cross_validate``.
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    cv : int or cross-validation splitter, default=5
        Forwarded unchanged to ``cross_validate``. The default keeps the
        5-fold behaviour that existing callers get.

    Returns
    -------
    None
        The two averaged scores are printed on one line: MSE first, then R^2.
    """
    scores = cross_validate(
        model, X, y, cv=cv, scoring=['neg_mean_squared_error', 'r2']
    )

    # The scorer yields the negated MSE, so report the magnitude of its mean.
    mean_mse = np.abs(np.mean(scores['test_neg_mean_squared_error']))
    # Named mean_r2 so the local does not shadow the imported `r2_score` function.
    mean_r2 = np.mean(scores['test_r2'])

    print(mean_mse, mean_r2)

### Cross-validation

In [None]:
# Baseline: ordinary least squares cross-validated on all columns of x_train.
model = LinearRegression()

cv_validation(model, X=x_train, y=y_train)

Let's compare this to the CV scores of a model that uses only the original continuous columns.

In [None]:
# Keep the columns returned by helpers.get_calcofi_original_cols() that are present in
# x_train. Filtering x_train.columns (instead of intersecting two sets) keeps the features
# in the frame's own order, so the selection is identical on every kernel restart rather
# than depending on the hash-driven iteration order of a set of strings.
calcofi_original_cols = set(helpers.get_calcofi_original_cols())
original_cols = [col for col in x_train.columns if col in calcofi_original_cols]

cv_validation(model, x_train[original_cols], y_train)

We see that feature engineering slightly improved the scores. However, we may be overfitting on the columns that originally had a lot of missing values. Let's remove them and see what happens; perhaps the new categorical columns will help instead.

In [None]:
# Re-run the same CV on the subset picked by helpers.get_without_nan_cols — presumably the
# features left after dropping the NaN-heavy columns (helper is defined elsewhere; verify).
without_nan_cols = helpers.get_without_nan_cols(x_train.columns.tolist())

cv_validation(model, X=x_train[without_nan_cols], y=y_train)

The result is the same as with the NaN-heavy columns included.

Finally, let's see what happens if we take the bare minimum of columns: only the original ones, excluding those with a lot of missing values.

In [None]:
# Smallest feature set: whatever helpers.get_originals_without_nan_cols() returns
# (presumably the original columns minus the NaN-heavy ones — helper is defined elsewhere).
originals_without_nan_cols = helpers.get_originals_without_nan_cols()

cv_validation(model, X=x_train[originals_without_nan_cols], y=y_train)

All in all, we can conclude that even the simplest OLS performs well on this dataset with minimal pre-processing. Still, the feature engineering did improve the result.

For the final model, let's choose the one trained on the columns without a lot of NaN values (the third one).

### Training the model

Train the model and save it for later.

In [None]:
# Fit on bare NumPy arrays (.values), so the estimator is trained without column names;
# the later prediction must therefore be fed .values with the same column subset and order.
model.fit(x_train[without_nan_cols].values, y_train.values)
# Persist the fitted estimator with joblib. As the cell's last expression, dump's return
# value is what the notebook displays.
dump(model, '../models/ols.sav')

### Check the prediction

In [None]:
# Use the same column subset and .values conversion as when the model was fitted.
y_predict = model.predict(x_test[without_nan_cols].values)

In [None]:
# Hold-out performance of the fitted model: MSE first, then R^2 (same order as the CV printout).
test_mse = mean_squared_error(y_test, y_predict)
test_r2 = r2_score(y_test, y_predict)
print(test_mse, test_r2)