# Feature Engineering
---
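Describe: This notebook loads the 2022 training workbook, separates the six `sensor_point*_i_value` target columns from the feature columns, runs the feature-engineering pipeline configured in `feature_engineering_config`, and pickles the resulting `train_x` / `train_y`. The sections after the pipeline keep commented-out examples of each individual step.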

---
## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing

# outlier detection
from pyod.models.ecod import ECOD
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from sklearn.covariance import EllipticEnvelope

# feature selection
from xgboost import XGBRegressor
from sklearn.svm import LinearSVC

# feature engineering
from feature_engineering import DealWithMissingValue, VariableTransformation, OutlierDetection,\
FeatureSelection, CreateGroupFeature, GeneratePolynomialFeatures, ReduceDimensionPCA, Standardization

import feature_engineering_config as fe_config

pd.set_option('display.max_columns', 150)

## 2. Read xlsx File

In [2]:
# Load the raw training table from the Excel workbook; the path is relative to the notebook's directory.
train_data = pd.read_excel('../Data/2022-train-v2.xlsx')

## 3. Data Preparation

In [3]:
# The six sensor readings are the prediction targets; every other column is a feature.
target_columns = [f'sensor_point{point}_i_value' for point in range(5, 11)]

train_y = train_data[target_columns]
train_x = train_data.drop(target_columns, axis=1)

# The raw frame is no longer needed once it has been split.
del train_data

In [4]:
# Preview the first rows of the feature frame.
train_x.head()

Unnamed: 0,clean_temp,clean_ec,clean_ph4,clean_ph5,clean_ph7,clean_ph8,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,clean_pressure23,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure42,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure62,clean_pressure71,clean_pressure72,clean_pressure81,clean_pressure82,clean_pressure91,clean_pressure92,clean_pressure101,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,oven_pb2,oven_a1,oven_a2,oven_a3,oven_b1,oven_b2,oven_b3,painting_g1_act_a_air,painting_g1_act_f_air,painting_g1_act_t_air,painting_g1_act_hvv,painting_g1_act_hvc,painting_g2_act_a_air,painting_g2_act_f_air,painting_g2_act_t_air,painting_g2_act_hvv,painting_g2_act_hvc,painting_g3_act_a_air,painting_g3_act_f_air,painting_g3_act_t_air,painting_g3_act_hvv,painting_g3_act_hvc,painting_g4_act_a_air,painting_g4_act_f_air,painting_g4_act_t_air,painting_g4_act_hvv,painting_g4_act_hvc,painting_g5_act_a_air,painting_g5_act_f_air,painting_g5_act_t_air,painting_g5_act_hvv,painting_g5_act_hvc,painting_g6_act_a_air,painting_g6_act_f_air,painting_g6_act_t_air,painting_g6_act_hvv,painting_g6_act_hvc,painting_g7_act_a_air,painting_g7_act_f_air,painting_g7_act_t_air,painting_g7_act_hvv,painting_g7_act_hvc,painting_g8_act_a_air,painting_g8_act_f_air,painting_g8_act_t_air,painting_g8_act_hvv,painting_g8_act_hvc,painting_g9_act_a_air,painting_g9_act_f_air,painting_g9_act_t_air,painting_g9_act_hvv,painting_g9_act_hvc,painting_g10_act_a_air,painting_g10_act_f_air,painting_g10_act_t_air,painting_g10_act_hvv,painting_g10_act_hvc,painting_g11_act_a_air,painting_g11_act_f_air,painting_g11_act_t_air,painting_g11_act_hvv,painting_g11_act_hvc,painting_g12_act_a_air,painting_g12_act_f_air,painting_g12_act_t_air,painting_g12_act_hvv,painting_g12_act_hvc,env_rpi05_hum,env_rpi05_pm1,env_rpi05_pm10,env_rpi05_pm25,env_rpi05_temp,env_rpi07_hum,env_rpi07_pm1,env_rpi07_pm10,env_rpi07_pm25,env_rpi07_temp,env_rpi09_hum,env_rpi09_lux,env_rpi09_pm1,env_rpi09_p
m10,env_rpi09_pm25,env_rpi09_temp,env_rpi14_hum,env_rpi14_lux,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_pm25,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm10,env_rpi15_pm25,env_rpi15_temp
0,41.1,12.4,,,,6.9,820.24,1262.82,883.46,630.74,640.79,509.75,716.48,1065.79,817.12,833.01,604.2,1139.1,648.61,707.22,1196.38,903.51,825.09,905.59,742.77,414.85,455.39,175.85,203.06,207.45,198.49,174.87,207.38,211.92,190.48,215.01,207.86,7.65,140.93,258.92,56.37,18.78,7.21,150.62,261.26,52.5,23.83,7.03,141.84,255.23,51.51,25.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.71,137.76,253.8,56.69,11.2,7.15,137.1,253.0,53.26,23.84,7.23,137.79,254.08,56.6,18.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
1,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
2,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
3,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
4,41.1,12.4,,,,6.9,820.78,1264.0,883.31,630.74,640.69,508.91,715.64,1065.94,816.86,833.03,604.26,1138.81,648.16,705.34,1196.08,902.72,824.85,905.0,742.62,413.86,454.2,176.2,202.81,205.96,197.1,174.86,207.41,212.0,190.9,215.23,207.72,7.66,130.16,261.21,54.59,23.59,7.29,154.83,260.25,52.31,26.19,7.07,147.8,265.26,48.99,28.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,127.7,245.01,52.56,12.39,6.95,132.45,244.33,49.81,24.82,7.15,131.71,242.71,50.92,20.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.02,60.71,75.85,69.71,29.12,49.96,13.42,20.14,17.0,26.42,0.0,0.0,0.0,0.0,0.0,0.0,48.78,0.0,26.71,48.85,38.14,34.48,43.94,0.42,29.71,51.85,42.42,34.82


In [5]:
# Preview the first rows of the target frame.
train_y.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,42,56,82,41,73,45
1,37,60,58,40,64,42
2,35,66,62,39,86,45
3,37,58,58,42,52,39
4,57,78,62,51,64,65


## 4. Pipeline

In [6]:
# Pull the two settings that nearly every pipeline step below receives:
# the `train=` flag value and the shared random seed.
train_process, random_state = fe_config.train_process, fe_config.random_state

In [7]:
# define nan: optionally turn every literal 0 in the features into NaN
if fe_config.replace_zero_to_nan:
    train_x = train_x.replace({0: np.nan})

# missing values: apply the configured strategy; any other setting leaves
# train_x unchanged
missing_value_strategy = fe_config.deal_with_missing_value
if missing_value_strategy == 'drop_na':
    train_x = DealWithMissingValue(train_x).drop_na()
elif missing_value_strategy == 'imputation':
    train_x = DealWithMissingValue(train_x).imputation(
        strategy='most_frequent',
        train=train_process
    )
elif missing_value_strategy == 'k_neighbors_regressor':
    train_x = DealWithMissingValue(train_x).iterative_imputer(
        method='k_neighbors_regressor',
        train=train_process
    )

# outlier detection (the detectors are run on the targets, train_y)
# `outlier_flags` stays None when fe_config.outlier_detection names none of the
# detectors below, so the row filtering is skipped instead of failing with a
# NameError (or silently reusing a stale value left in the kernel).
outlier_flags = None
if fe_config.outlier_detection == 'ecod':
    outlier_flags = OutlierDetection(train_y).ecod(
        contamination=fe_config.ecod_contamination,
        threshold=fe_config.ecod_threshold,
        train=train_process
    )
elif fe_config.outlier_detection == 'suod':
    # initialized a group of outlier detectors for acceleration
    detector_list = [
        LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=35),
        COPOD(),
        IForest(n_estimators=100), IForest(n_estimators=200)
    ]
    outlier_flags = OutlierDetection(train_y).suod(
        detector_list,
        threshold=fe_config.suod_threshold,
        train=train_process,
        verbose=False,
        n_jobs=-1
    )
elif fe_config.outlier_detection == 'elliptic_envelope':
    outlier_flags = OutlierDetection(train_y).elliptic_envelope(
        random_state=random_state,
        train=train_process
    )

if outlier_flags is not None:
    # presumably one boolean per row, True meaning "outlier" — TODO confirm
    # against OutlierDetection.
    # Attach the flags as a helper column, keep only the rows flagged False,
    # then drop the helper column again. `assign` builds a new frame, so
    # neither train_x nor train_y is mutated in place.
    train_x = (
        train_x
        .assign(outlier_detection_from_y=outlier_flags)
        .query("outlier_detection_from_y==False")
        .drop('outlier_detection_from_y', axis=1)
    )
    train_y = (
        train_y
        .assign(outlier_detection_from_y=outlier_flags)
        .query("outlier_detection_from_y==False")
        .drop('outlier_detection_from_y', axis=1)
    )

# variable transformation
# Transform every numeric column whose absolute skewness exceeds the configured
# threshold. The column labels come straight from the skewness Series, so the
# selection no longer relies on the private `_get_numeric_data()` lining up
# positionally with the result of `skew()`.
abs_skew = train_x.skew(numeric_only=True).abs()
large_skew_columns = abs_skew[abs_skew > fe_config.skew_threshold].index
for col in large_skew_columns:
    train_x[col] = VariableTransformation(train_x, col=col).transform(
        method=fe_config.variable_transformation,
        train=train_process
    )

# Keyword arguments shared by every kmeans_with_auto_k call in this section.
kmeans_kwargs = dict(
    standardization=fe_config.kmeans_standardization,
    k_range=fe_config.kmeans_k_range,
    random_state=random_state,
    parallel=True,
    parallel_verbose=0,
    train=train_process,
)

# create group feature: computed from the whole feature frame and stored as
# str (not as a pandas 'category' dtype)
train_x['group'] = CreateGroupFeature(train_x).kmeans_with_auto_k(**kmeans_kwargs).astype(str)

# convert each numeric data to category data: add one "<column>_group" feature
# per column that existed before this loop, skipping the overall 'group' column
for col in train_x.columns:
    if col == 'group':
        continue
    single_column = train_x[[col]]
    train_x[col + '_group'] = (
        CreateGroupFeature(single_column)
        .kmeans_with_auto_k(**kmeans_kwargs)
        .astype(str)
    )
        
        
# generate polynomial and interaction features (only when enabled in the config)
if fe_config.generate_polynomial_features:
    polynomial_builder = GeneratePolynomialFeatures(train_x)
    train_x = polynomial_builder.get_dataframe(
        degree=fe_config.pf_degree,
        interaction_only=fe_config.pf_interaction_only,
        train=train_process
    )

# reduce dimension with PCA (only when enabled in the config)
if fe_config.reduce_dimension:
    pca_reducer = ReduceDimensionPCA(train_x)
    train_x = pca_reducer.pca(n_components=0.99, train=train_process)

# standardization: any value other than the two names below leaves train_x as is
scaler_name = fe_config.standardization
if scaler_name == 'standard_scaler':
    train_x = Standardization(train_x).standard_scaler(train=train_process)
elif scaler_name == 'min_max_scaler':
    train_x = Standardization(train_x).min_max_scaler(train=train_process)


    
# Feature Selection
# Each method runs only if its name is contained in fe_config.feature_selection;
# the methods are applied in the order written here.
selection_methods = fe_config.feature_selection

if 'variance' in selection_methods:
    # removing features with zero variance
    train_x = FeatureSelection(train_x).variance(threshold=0, train=train_process)

# mutual information
# NOTE(review): only 'sensor_point5_i_value' is passed as the target here,
# while the selectors below receive the whole train_y — confirm this is intended.
if 'mutual_information' in selection_methods:
    mi_target = train_y['sensor_point5_i_value']
    train_x = FeatureSelection(train_x).mutual_information(mi_target, k=50, train=train_process)

# importance weight for feature selection
if 'importance_weight' in selection_methods:
    # Author's note (translated): a tree-structured model must be used, because
    # only then can the importance of the categorical variables be known; the
    # cluster variables (int) are not encoded, so a tree-based model avoids
    # problems.
    estimator = XGBRegressor(n_jobs=-1)
    train_x = FeatureSelection(train_x).importance_weight(
        y=train_y,
        estimator=estimator,
        train=train_process
    )

# recursive feature elimination with cross-validation to select features
if 'recursive_feature_elimination' in selection_methods:
    # Author's note (translated): a tree-structured model must be used, because
    # only then can the importance of the categorical variables be known; the
    # cluster variables (int) are not encoded, so a tree-based model avoids
    # problems.
    estimator = XGBRegressor(n_jobs=-1)
    train_x = FeatureSelection(train_x).recursive_feature_elimination(
        y=train_y,
        estimator=estimator,
        cv=fe_config.feature_selection_cv,
        min_features_to_select=fe_config.min_features_to_select,
        train=train_process,
        n_jobs=-1
    )






[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    6.6s remaining:    6.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    9.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    6.4s remaining:    6.4s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.3s finished





[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    6.3s remaining:    6.3s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    8.3s finished


In [8]:
# Preview the first rows of the feature frame after the pipeline cell has run.
train_x.head()

Unnamed: 0,clean_pressure11,clean_pressure23,clean_pressure31,clean_pressure41,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure62,clean_pressure72,clean_pressure81,clean_pressure82,clean_pressure91,clean_pressure92,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,oven_pb2,oven_a2,oven_a3,oven_b1,oven_b2,painting_g1_act_f_air,painting_g1_act_t_air,painting_g1_act_hvc,painting_g2_act_hvv,painting_g2_act_hvc,painting_g4_act_f_air,painting_g4_act_hvv,painting_g5_act_a_air,painting_g5_act_hvc,painting_g6_act_a_air,painting_g7_act_f_air,painting_g7_act_hvv,painting_g10_act_hvc,painting_g11_act_f_air,painting_g11_act_hvv,painting_g11_act_hvc,painting_g12_act_f_air,env_rpi05_hum,env_rpi05_pm1,env_rpi05_temp,env_rpi07_hum,env_rpi07_pm10,env_rpi07_temp,env_rpi09_hum,env_rpi09_lux,env_rpi09_pm1,env_rpi09_temp,env_rpi14_hum,env_rpi14_lux,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm25,env_rpi15_temp,clean_pressure72_group,oven_b3_group,painting_g4_act_hvv_group,painting_g7_act_hvv_group,painting_g8_act_a_air_group,painting_g8_act_t_air_group,painting_g9_act_t_air_group,painting_g10_act_hvc_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_hum_group,env_rpi15_pm1_group
0,0.996495,-2.393281,-0.839655,2.154155,2.111229,-2.887918,1.884733,-0.765608,2.854976,1.918546,-2.361587,-0.407225,-3.248157,0.825747,1.84,1.35458,3.660967,3.895985,-1.47739,-0.935668,1.284624,1.461166,0.541063,0.535398,0.742549,0.437166,1.416318,-2.670532,-2.583064,-1.465319,-1.292196,-0.327083,0.450766,0.815776,-2.567351,-1.722017,-1.697447,-1.56633,-0.333035,1.605079,0.863365,-0.7032,0.697631,0.544292,-1.330636,-3.348721,-0.308607,-1.462104,-3.484714,0.742585,-1.253593,1.529796,1.373728,0.520444,0.385747,-0.707349,1.569287,1.568502,-0.359653,4,1,2,3,0,4,1,3,1,1,3,2
1,0.994979,-2.393054,-0.840277,2.154155,2.12008,-2.887918,1.884733,-0.765809,2.854976,1.933153,-2.356928,-0.450656,-3.247793,0.825553,1.873648,1.344208,3.641095,3.838903,-1.478266,-0.938145,1.26735,1.465831,0.408525,0.415089,0.58567,0.190399,1.327206,-2.648057,-2.567668,-1.465319,-1.292196,-0.327083,0.328558,0.627313,-2.567351,-1.722017,-1.697447,-1.56633,-0.333035,1.605079,0.863365,-0.7032,0.697631,0.544292,-1.330636,-3.348721,-0.308607,-1.462104,-3.484714,0.742585,-1.253593,1.529796,1.373728,0.520444,0.385747,-0.707349,1.569287,1.568502,-0.359653,4,1,2,3,0,1,1,3,1,1,3,2
2,0.994979,-2.393054,-0.840277,2.154155,2.12008,-2.887918,1.884733,-0.765809,2.854976,1.933153,-2.356928,-0.450656,-3.247793,0.825553,1.873648,1.344208,3.641095,3.838903,-1.478266,-0.938145,1.26735,1.465831,0.408525,0.415089,0.58567,0.190399,1.327206,-2.648057,-2.567668,-1.465319,-1.292196,-0.327083,0.328558,0.627313,-2.567351,-1.722017,-1.697447,-1.56633,-0.333035,1.605079,0.863365,-0.7032,0.697631,0.544292,-1.330636,-3.348721,-0.308607,-1.462104,-3.484714,0.742585,-1.253593,1.529796,1.373728,0.520444,0.385747,-0.707349,1.569287,1.568502,-0.359653,4,1,2,3,0,1,1,3,1,1,3,2
3,0.994979,-2.393054,-0.840277,2.154155,2.12008,-2.887918,1.884733,-0.765809,2.854976,1.933153,-2.356928,-0.450656,-3.247793,0.825553,1.873648,1.344208,3.641095,3.838903,-1.478266,-0.938145,1.26735,1.465831,0.408525,0.415089,0.58567,0.190399,1.327206,-2.648057,-2.567668,-1.465319,-1.292196,-0.327083,0.328558,0.627313,-2.567351,-1.722017,-1.697447,-1.56633,-0.333035,1.605079,0.863365,-0.7032,0.697631,0.544292,-1.330636,-3.348721,-0.308607,-1.462104,-3.484714,0.742585,-1.253593,1.529796,1.373728,0.520444,0.385747,-0.707349,1.569287,1.568502,-0.359653,4,1,2,3,0,1,1,3,1,1,3,2
4,1.005596,-2.395557,-0.850103,2.154155,2.115652,-2.886876,1.884733,-0.774647,2.849485,1.792013,-2.380047,-0.540575,-3.249964,0.824092,1.918842,1.302967,3.092476,3.348366,-1.474754,-0.925713,1.407794,1.565041,0.446926,0.545306,1.172545,0.41182,1.980536,-2.670532,-2.583064,-1.465319,-1.292196,-0.327083,0.364135,0.696399,-2.567351,-1.722017,-1.697447,-1.56633,-0.333035,1.570258,0.943809,-0.662609,0.890605,1.025468,-1.075124,-3.348721,-0.308607,-1.462104,-3.484714,0.724599,-1.253593,1.585154,1.466447,0.529684,0.420025,-0.820966,1.606755,1.614279,-0.346717,4,1,2,3,0,1,1,3,1,1,3,2


In [9]:
# Persist the engineered feature frame as a pickle for later use.
with open('../Data/meta/train_x_after_feature_engineering.pickle', 'wb') as pickle_file:
    pickle.dump(train_x, pickle_file)

In [10]:
# Persist the target frame (after outlier-row removal, if any) as a pickle.
with open('../Data/meta/train_y_after_feature_engineering.pickle', 'wb') as pickle_file:
    pickle.dump(train_y, pickle_file)

## 5. Missing values
Imputation describes filling in missing data with estimates based on the rest of the data set.

In [None]:
# define missing value
# train_x.replace({0: np.nan})

* Drop all rows which have na values

In [None]:
# DealWithMissingValue(train_x).drop_na()

* Imputation

In [None]:
# DealWithMissingValue(train_x).imputation(strategy='most_frequent', train=True)

* Iterative imputation of the missing values

A strategy for imputing missing values by modeling each feature with missing values as a function of other features in a round-robin fashion.

In [None]:
# DealWithMissingValue(train_x).iterative_imputer(method='k_neighbors_regressor', train=True)

* Datawig (could not be installed)

[Reference](https://github.com/awslabs/datawig)

Datawig is a library that learns ML models using deep neural networks to impute missing values in a DataFrame.

### Variable transformation
* Box-Cox Transformation
* Yeo-Johnson Transformation

Box-Cox requires input data to be strictly positive, while Yeo-Johnson supports both positive and negative data.

In [None]:
# train_x.columns

In [None]:
# VariableTransformation(train_x, col='clean_temp').transform(method='yeo-johnson', train=True)

### Drop outlier

* ### ECOD

In [None]:
# OutlierDetection(train_y).ecod(contamination=0.03, threshold=20, train=True)

* ### SUOD

In [None]:
# # initialized a group of outlier detectors for acceleration
# detector_list = [
#     LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=35),
#     COPOD(),
#     IForest(n_estimators=100), IForest(n_estimators=200)
# ]
# OutlierDetection(train_y).suod(detector_list, threshold=2, train=True, verbose=False, n_jobs=-1)

* ### Elliptic Envelope

In [None]:
# OutlierDetection(train_y).elliptic_envelope(random_state=0, train=True)

### Feature Selection

* ### Removing features with low variance

In [None]:
# _, t = FeatureSelection(train_x).variance(threshold=1, train=True)

* ### Mutual Information

In [None]:
# _, t = FeatureSelection(train_x.fillna(0)).mutual_information(train_y['sensor_point5_i_value'], k=50, train=True)

* ### Recursive feature elimination with cross-validation to select features.

[Reference](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html#sklearn.feature_selection.RFECV)

In [None]:
# _, t = FeatureSelection(train_x).recursive_feature_elimination(
#     y=train_y['sensor_point5_i_value'],
#     estimator=XGBRegressor(),
#     cv=3,
#     min_features_to_select=50,
#     train=True,
#     n_jobs=-1
# )

* ### importance weight for feature selection
    * L1-based model: can only handle one Y (target) at a time
    * tree-based model: supports a multi-dimensional Y (several targets at once)

In [None]:
# estimator = XGBRegressor()
# # estimator = LinearSVC(C=0.01, penalty="l1", dual=False)

# _, t = FeatureSelection(train_x.fillna(0)).importance_weight(
#     y=train_y,
#     estimator=estimator,
#     train=True
# )

### Create group feature

In [None]:
# CreateGroupFeature(train_x.fillna(0)).kmeans_with_auto_k(standardization='min_max', k_range=range(2,20), random_state=0, parallel=True, train=True)

### Convert each numeric data to category data

In [None]:
# CreateGroupFeature(train_x[['clean_temp']].fillna(0)).kmeans_with_auto_k(standardization='min_max', k_range=range(2,20), random_state=0, parallel=True, train=True)

### Generate polynomial and interaction features.

In [None]:
# GeneratePolynomialFeatures(train_x.fillna(0)).get_dataframe(degree=2, interaction_only=False, train=True)

### Reduce dimension

In [None]:
# ReduceDimensionPCA(train_x.fillna(0)).pca(n_components=0.99, train=True)