In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression

# Seed shared by NumPy's global RNG and the estimators further down.
random_seed = 2024
np.random.seed(seed=random_seed)

In [None]:
# Switch for the (slow) mutual-information computation further down:
#   True  -> recompute the scores and write them to ../scores/mutual_info_score.xlsx
#   False -> load the scores from that Excel file instead
compute_mutual_info = False

In [None]:
# Load the train/test CSVs, using the first column of each file as the index.
train_df = pd.read_csv('../data/post_fe_train.csv', index_col=0)
test_df = pd.read_csv('../data/post_fe_test.csv', index_col=0)

# Target column; every other column of the training frame is a feature.
y_col = 'R_SALINITY'
x_cols = train_df.columns.drop(y_col)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train_df.info()

In [None]:
# int64 columns are the ones treated as categorical (discrete) below.
# The selection runs on the whole frame, so an int64 target would be included too.
categorical_cols = train_df.select_dtypes(include=np.int64).columns
categorical_cols

## Feature selection

Here I'll calculate some statistics for the dataset to assess the relevance of each feature. Based on this information, further feature selection may be conducted later while constructing specific models.

In [None]:
# Empty frame indexed by feature name.
# NOTE(review): not referenced again in the visible cells -- confirm it is still needed.
general_ranking = pd.DataFrame(index=x_cols)

### Variance

Get low-variance features.

In [None]:
# Fit a variance filter with threshold 0.01 on the feature columns.
var_threshold = VarianceThreshold(threshold=0.01)

# fit_transform returns the feature matrix restricted to the retained columns.
var_selector = var_threshold.fit_transform(train_df[x_cols])

# Names of the retained features; everything else is flagged as redundant.
variable_cols = train_df[x_cols].columns[var_threshold.get_support(indices=True)]
redundant_cols = set(train_df[x_cols].columns) - set(variable_cols)

redundant_cols

### Mutual info

In [None]:
# Mutual information between every feature and the target.
# Warning: a long computation -- it only runs when `compute_mutual_info` is
# True; otherwise the scores are loaded from the Excel file.
if compute_mutual_info:
    # Boolean mask over the feature columns flagging the discrete ones.
    # A mask is used instead of `get_indexer`, which returns -1 for any
    # categorical column that is not a feature (e.g. an int64 target); that
    # -1 would silently mark the *last* feature as discrete.
    discrete_mask = train_df[x_cols].columns.isin(categorical_cols)

    mutual_info = mutual_info_regression(
        train_df[x_cols],
        train_df[y_col],
        discrete_features=discrete_mask,
        random_state=random_seed,
    )

    # One score per feature, highest first; saved so later runs can skip
    # the computation.
    mutual_info_df = pd.DataFrame({
        'mutual_info_score': mutual_info
    }, index=x_cols).sort_values('mutual_info_score', ascending=False)
    mutual_info_df.to_excel('../scores/mutual_info_score.xlsx')
else:
    mutual_info_df = pd.read_excel('../scores/mutual_info_score.xlsx', index_col=0)

In [None]:
# First ten rows of the mutual-information table.
mutual_info_df.head(10)

In [None]:
# Mutual-information scores of the categorical columns only, highest first.
mutual_info_df.loc[categorical_cols].sort_values('mutual_info_score', ascending=False)

We see that at least some of the new categorical features have non-negligible mutual information with the target variable.

### F-score

In [None]:
# Univariate F-test of every feature against the target.
f_stats, f_pvalues = f_regression(train_df[x_cols], train_df[y_col])

# One row per feature, ordered by decreasing F-statistic.
f_stats_df = pd.DataFrame(
    {'f_score': f_stats, 'f_pvalues': f_pvalues},
    index=x_cols,
)
f_stats_df = f_stats_df.sort_values('f_score', ascending=False)

f_stats_df.to_excel('../scores/f_stats_score.xlsx')

In [None]:
# Features whose F-test p-value exceeds 0.05.
f_stats_df[f_stats_df['f_pvalues'].gt(0.05)]

Here we see that $H_0$ fails to be rejected (at the $0.05$ significance level) for only a few features, which are already eliminated by the variance threshold.

## Conclusion of feature selection

I therefore remove the features rejected by the variance threshold, as well as the features with a mutual information score below $0.1$.

In [None]:
# Features whose mutual-information score is below the 0.1 cut-off.
redundant_mutual_cols = mutual_info_df.loc[mutual_info_df['mutual_info_score'] < 0.1].index.values

# Union with the columns already flagged as redundant. Sorted so the list
# (and its display) has a stable order instead of arbitrary set order.
redundant_cols = sorted(set(redundant_cols) | set(redundant_mutual_cols))
redundant_cols

In [None]:
# Remove the rejected features from both splits (KeyError if a split lacks one).
train_df = train_df.drop(columns=redundant_cols)
test_df = test_df.drop(columns=redundant_cols)

print(train_df.shape, test_df.shape)
# Index.equals reports False on any mismatch, whereas an element-wise `==`
# raises ValueError when the two frames have a different number of columns.
print(train_df.columns.equals(test_df.columns))

In [None]:
# Write the reduced train/test frames to the data directory.
train_df.to_csv(path_or_buf='../data/post_fs_train.csv')
test_df.to_csv(path_or_buf='../data/post_fs_test.csv')