In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from helpers import random_seed, target_feature

np.random.seed(random_seed)

## 6. Feature Selection

In [2]:
# we will use mututal information to evaluate how usefull some features are
compute_mutual_info = True

In [3]:
train_df = pd.read_csv(f'../data/post_fe_train.csv', index_col=0)
test_df = pd.read_csv(f'../data/post_fe_test.csv', index_col=0)

y_col = target_feature
x_cols = train_df.drop(y_col, axis=1).columns

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 370736 entries, 442244 to 320663
Data columns (total 69 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   R_NH4                       370736 non-null  float64
 1   R_PHAEO                     370736 non-null  float64
 2   R_SIGMA                     370736 non-null  float64
 3   R_O2                        370736 non-null  float64
 4   R_Depth                     370736 non-null  float64
 5   R_TEMP                      370736 non-null  float64
 6   R_SIO3                      370736 non-null  float64
 7   R_SVA                       370736 non-null  float64
 8   Lon_Dec                     370736 non-null  float64
 9   Lat_Dec                     370736 non-null  float64
 10  R_SALINITY                  370736 non-null  float64
 11  R_PO4                       370736 non-null  float64
 12  R_DYNHT                     370736 non-null  float64
 13  R_CHLA        

In [5]:
categorical_cols = train_df.select_dtypes(np.int64).columns
categorical_cols

Index(['R_Depth_cat_(0.0, 15.5]', 'R_Depth_cat_(15.5, 47.5]',
       'R_Depth_cat_(47.5, 68.5]', 'R_Depth_cat_(68.5, 117.5]',
       'R_Depth_cat_(117.5, 167.5]', 'R_Depth_cat_(167.5, 315.5]',
       'R_Depth_cat_(315.5, 671.5]', 'R_Depth_cat_(671.5, inf]',
       'R_Depth_missing', 'R_SIO3_cat_(0.0, 3.55]', 'R_SIO3_cat_(3.55, 10.05]',
       'R_SIO3_cat_(10.05, 42.05]', 'R_SIO3_cat_(42.05, inf]',
       'R_SIO3_missing', 'R_PO4_cat_(0.0, 0.505]', 'R_PO4_cat_(0.505, 1.025]',
       'R_PO4_cat_(1.025, 2.505]', 'R_PO4_cat_(2.505, inf]', 'R_PO4_missing',
       'R_NO2_cat_(0.0, 0.005]', 'R_NO2_cat_(0.005, 0.045]',
       'R_NO2_cat_(0.045, 1.665]', 'R_NO2_cat_(1.665, inf]', 'R_NO2_missing',
       'R_NO3_cat_(0.0, 0.55]', 'R_NO3_cat_(0.55, 10.85]',
       'R_NO3_cat_(10.85, 30.05]', 'R_NO3_cat_(30.05, inf]', 'R_NO3_missing',
       'R_NH4_cat_(0.0, 0.005]', 'R_NH4_cat_(0.005, 0.045]',
       'R_NH4_cat_(0.045, inf]', 'R_NH4_missing', 'R_CHLA_cat_(0.0, 0.015]',
       'R_CHLA_cat_(0.015, 0

## Feature selection

Here we calculated some statistics for the dataset to see the relevance for features. Based on this information, more of feature selection may be conducted later while constructing specific models.

In [6]:
general_ranking = pd.DataFrame(index=x_cols)

### Variance

Get low-variance features.

In [7]:
var_threshold = VarianceThreshold(0.01)

var_selector = var_threshold.fit_transform(train_df[x_cols])

variable_cols = train_df[train_df[x_cols].columns[var_threshold.get_support(indices=True)]].columns
redundant_cols = set(train_df[x_cols].columns.tolist()) - set(variable_cols)

redundant_cols

{'Phi',
 'R_CHLA_cat_(0.055, 0.065]',
 'R_Depth_missing',
 'R_NH4',
 'R_NH4_cat_(0.0, 0.005]',
 'R_NO2',
 'R_NO2_cat_(0.0, 0.005]',
 'R_NO2_cat_(1.665, inf]',
 'R_PHAEO_cat_(0.0, 0.005]',
 'R_PHAEO_cat_(0.005, 0.015]',
 'R_PRES_missing'}

### Mutual info

In [8]:
mutual_info_df = {}

# Warning: a long computation
if compute_mutual_info:
    mutual_info = mutual_info_regression(train_df[x_cols], train_df[y_col], discrete_features=train_df[x_cols].columns.get_indexer(categorical_cols), random_state=random_seed)

    mutual_info_df = pd.DataFrame({
        'mutual_info_score': mutual_info
    }, index=x_cols).sort_values('mutual_info_score', ascending=False)
    mutual_info_df.to_excel('../scores/mutual_info_score.xlsx')
else:
    mutual_info_df = pd.read_excel('../scores/mutual_info_score.xlsx', index_col=0)

In [9]:
mutual_info_df.head(10)

Unnamed: 0,mutual_info_score
R_SVA,1.610244
R_SIGMA,1.597569
R_PRES,1.099207
R_Depth,1.09047
R_DYNHT,0.848832
R_O2Sat,0.731166
R_SALINITY,0.686451
R_O2,0.672186
R_O2_sqrt,0.669059
R_SIO3,0.641994


In [10]:
mutual_info_df.loc[categorical_cols].sort_values(by='mutual_info_score', ascending=False)

Unnamed: 0,mutual_info_score
"R_PRES_cat_(317.5, inf]",0.410539
"R_Depth_cat_(315.5, 671.5]",0.321691
"R_PRES_cat_(118.5, 317.5]",0.265463
"R_PRES_cat_(0.0, 47.5]",0.218518
"R_Depth_cat_(167.5, 315.5]",0.201098
"R_PO4_cat_(0.0, 0.505]",0.165354
"R_Depth_cat_(671.5, inf]",0.164187
"R_CHLA_cat_(0.065, inf]",0.141655
"R_SIO3_cat_(0.0, 3.55]",0.138285
"R_SIO3_cat_(10.05, 42.05]",0.136786


We see that at least some of new categorical features have non-negligible mutual info with a target variable.

### F-score

In [11]:
(f_stats, f_pvalues) = f_regression(train_df[x_cols], train_df[y_col])

f_stats_df = pd.DataFrame({
    'f_score': f_stats,
    'f_pvalues': f_pvalues
}, index=x_cols).sort_values('f_score', ascending=False)

f_stats_df.to_excel('../scores/f_stats_score.xlsx')

In [12]:
f_stats_df.loc[f_stats_df['f_pvalues'] > 0.05]

Unnamed: 0,f_score,f_pvalues
"R_PHAEO_cat_(0.0, 0.005]",0.0,1.0
R_Depth_missing,0.0,1.0
"R_NH4_cat_(0.0, 0.005]",0.0,1.0
R_PRES_missing,0.0,1.0
"R_NO2_cat_(0.0, 0.005]",0.0,1.0


Here we see that the $H_0$ is failed to be rejected only for several features, which are already eliminated by variance threshold.

## Conclusion of feature selection

We hereby decide to remove features rejected by variance threshold elimination and features with mutual information less than $0.1$.

In [13]:
redundant_mutual_cols = mutual_info_df.loc[mutual_info_df['mutual_info_score'] < 0.1].index.values

redundant_cols = list(set(redundant_cols) | set(redundant_mutual_cols))
redundant_cols

['R_NH4_cat_(0.0, 0.005]',
 'R_SIO3_cat_(3.55, 10.05]',
 'R_NO3_cat_(0.55, 10.85]',
 'R_Depth_cat_(68.5, 117.5]',
 'R_CHLA_cat_(0.055, 0.065]',
 'R_PO4_missing',
 'R_NO2_cat_(0.005, 0.045]',
 'R_NO2_cat_(0.0, 0.005]',
 'R_CHLA_cat_(0.015, 0.055]',
 'R_NH4_missing',
 'R_PO4_cat_(0.505, 1.025]',
 'R_Depth_cat_(47.5, 68.5]',
 'Phi',
 'R_NO2',
 'R_CHLA_cat_(0.0, 0.015]',
 'R_NO3_cat_(0.0, 0.55]',
 'R_SIO3_missing',
 'R_PHAEO_cat_(0.0, 0.005]',
 'R_Depth_missing',
 'R_NO2_cat_(1.665, inf]',
 'R_NO3_missing',
 'R_PHAEO_cat_(0.175, inf]',
 'R_NH4_cat_(0.005, 0.045]',
 'R_NO2_missing',
 'R_PHAEO_cat_(0.015, 0.175]',
 'R_PHAEO_cat_(0.005, 0.015]',
 'R_NH4_cat_(0.045, inf]',
 'R_NH4',
 'R_PRES_missing',
 'R_Depth_cat_(117.5, 167.5]',
 'R_NO2_cat_(0.045, 1.665]',
 'R_Depth_cat_(0.0, 15.5]']

In [14]:
train_df = train_df.drop(redundant_cols, axis=1)
test_df = test_df.drop(redundant_cols, axis=1)

print(train_df.shape, test_df.shape)
print(all(train_df.columns == test_df.columns))

(370736, 37) (158887, 37)
True


In [15]:
train_df.to_csv('../data/post_fs_train.csv')
test_df.to_csv('../data/post_fs_test.csv')