# Final model comparison and summary

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score
import pickle
import helpers

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable between runs.
random_seed = 2024
np.random.seed(random_seed)

In [None]:
def plot_jointplot(y_true, y_predict):
    """Draw a regression joint plot of predicted against true target values.

    Args:
        y_true: 1-D sequence of observed target values (x axis).
        y_predict: 1-D sequence of predicted values (y axis), with the same
            length as ``y_true``.
    """
    pairs = pd.DataFrame({
        'y_true': y_true,
        'y_predict': y_predict,
    })

    # ``sns.jointplot`` is a figure-level function: it always creates its own
    # figure, so calling ``plt.figure(...)`` beforehand only leaves a blank
    # extra figure in the output. ``data`` is passed by keyword so the frame
    # is bound to the right parameter regardless of the seaborn version.
    sns.jointplot(data=pairs, x='y_true', y='y_predict', kind='reg', truncate=False)

In [None]:
def plot_score_comparison(scores, score_name):
    """Draw a bar chart comparing one score across models.

    Args:
        scores: pandas object of score values whose index holds the model
            names; the index is used for the x tick labels.
        score_name: Name of the score, shown in the chart title.
    """
    # NOTE: this switches the global matplotlib style, so it also affects
    # every figure created later in the same session.
    plt.style.use('ggplot')
    ax = scores.plot(
        kind='bar',
        figsize=(10, 5),
        title=f"{score_name} comparison on test set",
        width=0.2,
    )

    # Configure the ticks on the axes that were just drawn instead of relying
    # on pyplot's implicit "current axes".
    ax.tick_params(
        axis='x',
        which='both',
        bottom=True,
        top=False,
        labelbottom=True,
    )
    # Keep the model names horizontal so they stay readable.
    ax.set_xticklabels(scores.index, rotation=0, fontsize=12)
    ax.set_xlabel("Models", fontsize=13)

In [None]:
# Toggle for the optional diagnostics: when True, the per-model cells below
# call plot_jointplot on the test targets and predictions.
plot_jointplots = False

In [None]:
# Load the train and test tables; the first CSV column is used as the index.
# NOTE(review): "post_fs" presumably means "after feature selection" - confirm.
train = pd.read_csv('../data/post_fs_train.csv', index_col=0)
test = pd.read_csv('../data/post_fs_test.csv', index_col=0)

In [None]:
# Name of the column the models predict.
target_var = 'R_SALINITY'

# Training split: every other column is a feature, the target column is
# extracted as its raw values.
x_train = train.drop(columns=target_var)
y_train = train[target_var].values

# Test split, separated the same way.
x_test = test.drop(columns=target_var)
y_test = test[target_var].values

In [None]:
# Column selection used below to index x_test before predicting with OLS.
# NOTE(review): presumably the subset of training feature names that carry no
# NaN values - confirm against helpers.get_without_nan_cols.
without_nan_cols = helpers.get_without_nan_cols(x_train.columns.to_list())

In [None]:
# Score table filled in by the model cells below: one row per model, one
# column per test-set metric. The columns are declared as float up front so
# they stay numeric (an empty frame otherwise defaults to object dtype),
# which the bar-chart comparison at the end relies on.
final_scores = pd.DataFrame(columns=['MSE', 'R2'], dtype=float)

### OLS

In [None]:
# Restore the fitted OLS estimator from disk.
# NOTE: unpickling executes arbitrary code; only load model files you trust.
with open('../models/ols.pkl', 'rb') as f:
    ols_model = pickle.load(f)

# Predict on the test rows, restricted to the without_nan_cols columns.
y_predict = ols_model.predict(x_test[without_nan_cols].values)

# Record both test-set metrics under the 'OLS' row of the score table.
ols_mse = mean_squared_error(y_test, y_predict)
ols_r2 = r2_score(y_test, y_predict)
final_scores.loc['OLS'] = (ols_mse, ols_r2)

# Last expression of the cell, so the notebook displays the table.
final_scores

In [None]:
# Optional diagnostic: true vs. predicted values for the most recent
# y_predict, drawn only when the plot_jointplots toggle is enabled.
if plot_jointplots:
    plot_jointplot(y_test, y_predict)

## Final comparison

In [None]:
# Bar chart of the test-set mean squared error recorded for each model.
plot_score_comparison(final_scores['MSE'], 'MSE')

In [None]:
# Bar chart of the test-set R2 score recorded for each model.
plot_score_comparison(final_scores['R2'], 'R2')