# Feature Engineering
---
Description:

Feature engineering is the process of selecting, manipulating, and transforming raw data into features that can be used in supervised learning. 
This notebook contains our feature engineering pipeline. The settings in `feature_engineering_config.py` control which steps run and how they are parameterized.

---
## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing

# outlier detection
from pyod.models.ecod import ECOD
from pyod.models.suod import SUOD
from pyod.models.lof import LOF
from pyod.models.iforest import IForest
from pyod.models.copod import COPOD
from sklearn.covariance import EllipticEnvelope

# feature selection
from xgboost import XGBRegressor
from sklearn.svm import LinearSVC

# feature engineering
from feature_engineering import DealWithMissingValue, VariableTransformation, OutlierDetection,\
FeatureSelection, CreateGroupFeatureFromEachCol, CreateGroupFeatureFromAllCol, GeneratePolynomialFeatures, ReduceDimensionPCA, Standardization

import feature_engineering_config as fe_config

import warnings

pd.set_option('display.max_columns', 150)

## 2. Read xlsx File

In [2]:
# Load the raw training table (feature columns and target columns together)
# from the project's Data directory, addressed relative to this notebook.
train_data = pd.read_excel('../Data/2022-train-v2.xlsx')

## 3. Data Preparation

In [3]:
# The six sensor readings are the regression targets; every other column is a feature.
target_columns = [
    'sensor_point5_i_value',
    'sensor_point6_i_value',
    'sensor_point7_i_value',
    'sensor_point8_i_value',
    'sensor_point9_i_value',
    'sensor_point10_i_value',
]
train_y = train_data[target_columns]
train_x = train_data.drop(columns=target_columns)
# The combined frame is no longer needed once it has been split.
del train_data

In [4]:
# Preview the first rows of the raw feature matrix.
train_x.head()

Unnamed: 0,clean_temp,clean_ec,clean_ph4,clean_ph5,clean_ph7,clean_ph8,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,clean_pressure23,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure42,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure62,clean_pressure71,clean_pressure72,clean_pressure81,clean_pressure82,clean_pressure91,clean_pressure92,clean_pressure101,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,oven_pb2,oven_a1,oven_a2,oven_a3,oven_b1,oven_b2,oven_b3,painting_g1_act_a_air,painting_g1_act_f_air,painting_g1_act_t_air,painting_g1_act_hvv,painting_g1_act_hvc,painting_g2_act_a_air,painting_g2_act_f_air,painting_g2_act_t_air,painting_g2_act_hvv,painting_g2_act_hvc,painting_g3_act_a_air,painting_g3_act_f_air,painting_g3_act_t_air,painting_g3_act_hvv,painting_g3_act_hvc,painting_g4_act_a_air,painting_g4_act_f_air,painting_g4_act_t_air,painting_g4_act_hvv,painting_g4_act_hvc,painting_g5_act_a_air,painting_g5_act_f_air,painting_g5_act_t_air,painting_g5_act_hvv,painting_g5_act_hvc,painting_g6_act_a_air,painting_g6_act_f_air,painting_g6_act_t_air,painting_g6_act_hvv,painting_g6_act_hvc,painting_g7_act_a_air,painting_g7_act_f_air,painting_g7_act_t_air,painting_g7_act_hvv,painting_g7_act_hvc,painting_g8_act_a_air,painting_g8_act_f_air,painting_g8_act_t_air,painting_g8_act_hvv,painting_g8_act_hvc,painting_g9_act_a_air,painting_g9_act_f_air,painting_g9_act_t_air,painting_g9_act_hvv,painting_g9_act_hvc,painting_g10_act_a_air,painting_g10_act_f_air,painting_g10_act_t_air,painting_g10_act_hvv,painting_g10_act_hvc,painting_g11_act_a_air,painting_g11_act_f_air,painting_g11_act_t_air,painting_g11_act_hvv,painting_g11_act_hvc,painting_g12_act_a_air,painting_g12_act_f_air,painting_g12_act_t_air,painting_g12_act_hvv,painting_g12_act_hvc,env_rpi05_hum,env_rpi05_pm1,env_rpi05_pm10,env_rpi05_pm25,env_rpi05_temp,env_rpi07_hum,env_rpi07_pm1,env_rpi07_pm10,env_rpi07_pm25,env_rpi07_temp,env_rpi09_hum,env_rpi09_lux,env_rpi09_pm1,env_rpi09_p
m10,env_rpi09_pm25,env_rpi09_temp,env_rpi14_hum,env_rpi14_lux,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_pm25,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm10,env_rpi15_pm25,env_rpi15_temp
0,41.1,12.4,,,,6.9,820.24,1262.82,883.46,630.74,640.79,509.75,716.48,1065.79,817.12,833.01,604.2,1139.1,648.61,707.22,1196.38,903.51,825.09,905.59,742.77,414.85,455.39,175.85,203.06,207.45,198.49,174.87,207.38,211.92,190.48,215.01,207.86,7.65,140.93,258.92,56.37,18.78,7.21,150.62,261.26,52.5,23.83,7.03,141.84,255.23,51.51,25.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.71,137.76,253.8,56.69,11.2,7.15,137.1,253.0,53.26,23.84,7.23,137.79,254.08,56.6,18.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
1,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
2,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
3,41.1,12.4,,,,6.9,820.15,1263.0,883.6,630.7,640.8,509.7,716.5,1065.9,817.05,833.05,604.2,1139.05,648.6,706.95,1196.6,903.6,825.15,905.4,742.8,414.65,455.25,176.0,203.01,207.4,198.35,174.85,207.37,211.9,190.42,215.02,207.85,7.09,125.88,231.92,51.55,17.1,7.04,146.06,253.31,50.61,23.45,7.13,137.42,247.36,49.82,24.54,0.4,5.41,9.48,1.9,1.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.73,123.67,227.93,50.22,9.93,6.41,128.27,236.76,50.1,21.9,6.4,128.93,237.57,53.6,17.22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.24,60.33,75.83,69.5,29.05,48.98,11.33,17.16,14.16,26.21,0.0,0.0,0.0,0.0,0.0,0.0,48.96,0.0,26.33,47.5,37.33,34.41,43.73,0.5,29.5,51.5,42.0,34.78
4,41.1,12.4,,,,6.9,820.78,1264.0,883.31,630.74,640.69,508.91,715.64,1065.94,816.86,833.03,604.26,1138.81,648.16,705.34,1196.08,902.72,824.85,905.0,742.62,413.86,454.2,176.2,202.81,205.96,197.1,174.86,207.41,212.0,190.9,215.23,207.72,7.66,130.16,261.21,54.59,23.59,7.29,154.83,260.25,52.31,26.19,7.07,147.8,265.26,48.99,28.31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.3,127.7,245.01,52.56,12.39,6.95,132.45,244.33,49.81,24.82,7.15,131.71,242.71,50.92,20.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.02,60.71,75.85,69.71,29.12,49.96,13.42,20.14,17.0,26.42,0.0,0.0,0.0,0.0,0.0,0.0,48.78,0.0,26.71,48.85,38.14,34.48,43.94,0.42,29.71,51.85,42.42,34.82


In [5]:
# Preview the first rows of the six target columns.
train_y.head()

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,42,56,82,41,73,45
1,37,60,58,40,64,42
2,35,66,62,39,86,45
3,37,58,58,42,52,39
4,57,78,62,51,64,65


## 4. Pipeline

In [6]:
# Short aliases for the two config values that are passed to almost every step below.
# `train_process` is forwarded as the `train=` argument of the feature_engineering helpers.
train_process = fe_config.train_process
# Seed forwarded to the steps that accept a `random_state` argument.
random_state = fe_config.random_state

In [7]:
# define nan
# When enabled in the config, every exact 0 in the features is treated as missing.
if fe_config.replace_zero_to_nan:
    train_x = train_x.replace({0: np.nan})

# missing values
# NOTE(review): only train_x is passed to the handler; if 'drop_na' removes rows,
# confirm that train_y is re-aligned with train_x somewhere downstream.
missing_value_strategy = fe_config.deal_with_missing_value
if missing_value_strategy == 'drop_na':
    missing_value_handler = DealWithMissingValue(train_x)
    train_x = missing_value_handler.drop_na()
elif missing_value_strategy == 'imputation':
    missing_value_handler = DealWithMissingValue(train_x)
    train_x = missing_value_handler.imputation(strategy='most_frequent', train=train_process)
elif missing_value_strategy == 'k_neighbors_regressor':
    missing_value_handler = DealWithMissingValue(train_x)
    train_x = missing_value_handler.iterative_imputer(method='k_neighbors_regressor', train=train_process)
# Any other setting leaves train_x untouched.

# outlier detection on y
# Run the configured detector on the targets. `outlier_mask` stays None when no
# (or an unrecognised) detector is configured, so the removal step below can be
# skipped instead of failing on an undefined variable.
outlier_method = fe_config.outlier_detection
outlier_mask = None
if outlier_method == 'ecod':
    outlier_mask = OutlierDetection(train_y).ecod(
        contamination=fe_config.ecod_contamination,
        threshold=fe_config.ecod_threshold,
        train=train_process
    )
elif outlier_method == 'suod':
    # initialized a group of outlier detectors for acceleration
    detector_list = [
        LOF(n_neighbors=15), LOF(n_neighbors=20), LOF(n_neighbors=25), LOF(n_neighbors=35),
        COPOD(),
        IForest(n_estimators=100), IForest(n_estimators=200)
    ]
    outlier_mask = OutlierDetection(train_y).suod(
        detector_list,
        threshold=fe_config.suod_threshold,
        train=train_process,
        verbose=False,
        n_jobs=4  # set to None on macOS
    )
elif outlier_method == 'elliptic_envelope':
    outlier_mask = OutlierDetection(train_y).elliptic_envelope(
        random_state=random_state,
        train=train_process
    )

# remove outlier and drop outlier_detection_from_y
# The flag is attached as a temporary column on *copies* of the frames (`assign`),
# rows flagged as outliers are filtered out, and the helper column is dropped again,
# so neither train_x nor train_y is mutated in place.
# NOTE(review): assumes the detector returns one boolean-like flag per row of train_y
# (True = outlier) — confirm against feature_engineering.OutlierDetection.
if outlier_mask is not None:
    train_x = (
        train_x
        .assign(outlier_detection_from_y=outlier_mask)
        .query("outlier_detection_from_y==False")
        .drop(columns='outlier_detection_from_y')
    )
    train_y = (
        train_y
        .assign(outlier_detection_from_y=outlier_mask)
        .query("outlier_detection_from_y==False")
        .drop(columns='outlier_detection_from_y')
    )

# variable transformation
# Flag every numeric column whose absolute skewness exceeds the configured threshold.
# `numeric_only=True` keeps this working if a non-numeric column is present
# (pandas >= 2 raises otherwise) and replaces the private `_get_numeric_data()` call.
large_skew_bool = train_x.skew(numeric_only=True).abs() > fe_config.skew_threshold
# Persist the per-column flags next to the models.
# NOTE(review): presumably read back at inference time to transform the same columns — confirm.
with open('../Model/variable_transformation.pickle', 'wb') as f:
    pickle.dump(large_skew_bool, f)
# Select the flagged columns by label (the flags are indexed by column name),
# so the selection cannot drift out of positional alignment with train_x.
for col in large_skew_bool[large_skew_bool].index:
    train_x[col] = VariableTransformation(train_x, col=col).transform(method=fe_config.variable_transformation, train=train_process)

    
    
    
    
# create group feature
# Both clustering passes use identical k-means settings, so collect them once.
kmeans_options = dict(
    standardization=fe_config.kmeans_standardization,
    k_range=fe_config.kmeans_k_range,
    random_state=random_state,
    parallel=True,
    parallel_verbose=0,
    train=train_process,
)

# One cluster label per row derived from the whole feature frame, stored as a string.
all_col_clusters = CreateGroupFeatureFromAllCol(train_x).kmeans_with_auto_k(**kmeans_options)
train_x['group'] = all_col_clusters.astype(str)


# convert each numeric data to category data
# Take the column list up front: the loop appends a '<col>_group' column for every
# existing column except the 'group' label created above.
columns_to_cluster = [name for name in train_x.columns if name != 'group']
for col in columns_to_cluster:
    with warnings.catch_warnings():
        # A single column may have fewer distinct values than k, in which case
        # k-means emits a ConvergenceWarning; silence warnings for this call only.
        warnings.simplefilter('ignore')
        per_col_clusters = CreateGroupFeatureFromEachCol(train_x[[col]]).kmeans_with_auto_k(**kmeans_options)
        train_x[col+'_group'] = per_col_clusters.astype(str)
        
        
# generate polynomial and interaction features
if fe_config.generate_polynomial_features:
    polynomial_builder = GeneratePolynomialFeatures(train_x)
    train_x = polynomial_builder.get_dataframe(
        degree=fe_config.pf_degree,
        interaction_only=fe_config.pf_interaction_only,
        train=train_process
    )

# optional dimensionality reduction; 0.99 is passed through as `n_components`
if fe_config.reduce_dimension:
    pca_reducer = ReduceDimensionPCA(train_x)
    train_x = pca_reducer.pca(n_components=0.99, train=train_process)

# feature scaling; any other config value leaves train_x unscaled
x_scaling = fe_config.standardization
if x_scaling == 'standard_scaler':
    x_scaler = Standardization(train_x, prefix='x')
    train_x = x_scaler.standard_scaler(train=train_process)
elif x_scaling == 'min_max_scaler':
    x_scaler = Standardization(train_x, prefix='x')
    train_x = x_scaler.min_max_scaler(train=train_process)


    
# Feature Selection
if 'variance' in fe_config.feature_selection:
    # removing features with zero variance
    train_x = FeatureSelection(train_x).variance(threshold=0, train=train_process)

# mutual information
if 'mutual_information' in fe_config.feature_selection:
    # Score the features against each target column separately and keep the union
    # of the features selected for any target.
    # NOTE(review): assumes `mutual_information` returns a DataFrame restricted to the
    # selected feature columns (the original code read `.columns` from it) — confirm.
    selected_features = set()
    for target_col in train_y.columns:
        top_k = FeatureSelection(train_x).mutual_information(k=50, train=train_process, y=train_y[target_col])
        selected_features.update(top_k.columns)
    # Filter in the existing column order so the result is deterministic
    # (iterating a set of strings is not).
    train_x = train_x[[name for name in train_x.columns if name in selected_features]]
    
# Both model-based selectors below must use a tree-based estimator: that is the only
# way to get an importance for the categorical variables, and the cluster variables
# are not encoded, which a tree-based model can handle without problems.
selection_steps = fe_config.feature_selection

# importance weight for feature selection
if 'importance_weight' in selection_steps:
    estimator = XGBRegressor(n_jobs=-1)
    importance_selector = FeatureSelection(train_x)
    train_x = importance_selector.importance_weight(y=train_y, estimator=estimator, train=train_process)

# recursive feature elimination with cross-validation to select features
if 'recursive_feature_elimination' in selection_steps:
    estimator = XGBRegressor(n_jobs=-1)
    rfe_options = dict(
        y=train_y,
        estimator=estimator,
        cv=fe_config.feature_selection_cv,
        min_features_to_select=fe_config.min_features_to_select,
        train=train_process,
        n_jobs=-1,
    )
    train_x = FeatureSelection(train_x).recursive_feature_elimination(**rfe_options)




[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.7s remaining:    4.7s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.9s remaining:    4.9s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.3s finished





[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   4 | elapsed:    4.6s remaining:    4.6s
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    6.1s finished


In [8]:
# Preview the feature matrix after the full pipeline has run.
train_x.head()

Unnamed: 0,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure51,clean_pressure52,clean_pressure61,clean_pressure71,clean_pressure72,clean_pressure81,clean_pressure91,clean_pressure102,oven_pa2,oven_pb1,oven_a2,oven_a3,oven_b1,oven_b2,painting_g1_act_f_air,painting_g1_act_t_air,painting_g1_act_hvc,painting_g4_act_f_air,painting_g4_act_hvv,painting_g5_act_a_air,painting_g5_act_f_air,painting_g5_act_hvc,painting_g6_act_a_air,painting_g7_act_f_air,painting_g7_act_hvv,painting_g10_act_t_air,painting_g10_act_hvc,painting_g12_act_a_air,env_rpi05_hum,env_rpi05_pm1,env_rpi05_temp,env_rpi07_hum,env_rpi07_pm1,env_rpi07_temp,env_rpi09_hum,env_rpi09_lux,env_rpi09_pm1,env_rpi09_temp,env_rpi14_lux,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm25,env_rpi15_temp,clean_pressure82_group,oven_pb1_group,oven_pb2_group,oven_a1_group,oven_b1_group,oven_b3_group,painting_g3_act_hvv_group,painting_g8_act_f_air_group,painting_g9_act_hvc_group,env_rpi05_temp_group
0,-0.845269,-2.274242,2.327716,2.163639,-2.96591,1.903804,0.434963,2.902725,1.936083,-0.427078,0.829675,1.354979,3.810291,-1.481363,-0.931837,1.288105,1.470054,0.544434,0.538954,0.745927,-2.702191,-2.607632,-1.468674,-1.536139,-1.293654,-0.328227,0.455272,0.818923,-2.767157,-2.608646,-0.334091,1.61189,0.861211,-0.701633,0.701028,0.479899,-1.334382,-3.397109,-0.309701,-1.46511,-3.540811,-1.259066,1.538469,1.379251,0.524158,0.386009,-0.705534,1.5865,1.582081,-0.356139,5,5,3,2,2,1,3,0,4,1
1,-0.845888,-2.273734,2.327716,2.172853,-2.96591,1.903804,0.42707,2.906072,1.950879,-0.471062,0.829483,1.344666,3.789385,-1.482244,-0.934316,1.270825,1.474749,0.413131,0.41973,0.590505,-2.679287,-2.592024,-1.468674,-1.536139,-1.293654,-0.328227,0.33411,0.632247,-2.767157,-2.608646,-0.334091,1.61189,0.861211,-0.701633,0.701028,0.479899,-1.334382,-3.397109,-0.309701,-1.46511,-3.540811,-1.259066,1.538469,1.379251,0.524158,0.386009,-0.705534,1.5865,1.582081,-0.356139,5,5,3,2,2,1,3,0,4,1
4,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,0.828042,1.30366,3.213134,-1.478717,-0.921874,1.411308,1.574606,0.451183,0.54877,1.171348,-2.702191,-2.607632,-1.468674,-1.536139,-1.293654,-0.328227,0.36939,0.700696,-2.767157,-2.608646,-0.334091,1.576919,0.942138,-0.660935,0.893311,0.993093,-1.076045,-3.397109,-0.309701,-1.46511,-3.540811,-1.259066,1.594165,1.472165,0.533389,0.420164,-0.819231,1.624504,1.628188,-0.343186,5,5,3,2,2,1,3,0,1,1
5,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,0.828042,1.30366,3.213134,-1.478717,-0.921874,1.411308,1.574606,0.451183,0.54877,1.171348,-2.702191,-2.607632,-1.468674,-1.536139,-1.293654,-0.328227,0.36939,0.700696,-2.767157,-2.608646,-0.334091,1.576919,0.942138,-0.660935,0.893311,0.993093,-1.076045,-3.397109,-0.309701,-1.46511,-3.540811,-1.259066,1.594165,1.472165,0.533389,0.420164,-0.819231,1.624504,1.628188,-0.343186,5,5,3,2,2,1,3,0,1,1
6,-0.855656,-2.29559,2.327716,2.168243,-2.964846,1.903804,0.380329,2.902725,1.807915,-0.562123,0.828042,1.30366,3.213134,-1.478717,-0.921874,1.411308,1.574606,0.451183,0.54877,1.171348,-2.702191,-2.607632,-1.468674,-1.536139,-1.293654,-0.328227,0.36939,0.700696,-2.767157,-2.608646,-0.334091,1.576919,0.942138,-0.660935,0.893311,0.993093,-1.076045,-3.397109,-0.309701,-1.46511,-3.540811,-1.259066,1.594165,1.472165,0.533389,0.420164,-0.819231,1.624504,1.628188,-0.343186,5,5,3,2,2,1,3,0,1,1


In [9]:
# Persist the engineered feature matrix for the modelling notebooks.
train_x_path = '../Data/meta/train_x_after_feature_engineering.pickle'
with open(train_x_path, 'wb') as out_file:
    pickle.dump(train_x, out_file)

In [10]:
# Persist the targets in their original units, before any scaling is applied.
non_scaled_y_path = '../Data/meta/non_scaled_train_y_after_feature_engineering.pickle'
with open(non_scaled_y_path, 'wb') as out_file:
    pickle.dump(train_y, out_file)

In [11]:
# Scale the targets with the method configured in fe_config.standardization;
# any other config value leaves train_y unscaled.
# NOTE: this cell rebinds train_y to its scaled version, so running it twice
# scales the already-scaled targets again.
y_scaling = fe_config.standardization
if y_scaling == 'standard_scaler':
    y_scaler = Standardization(train_y, prefix='y')
    train_y = y_scaler.standard_scaler(train=fe_config.train_process)
elif y_scaling == 'min_max_scaler':
    y_scaler = Standardization(train_y, prefix='y')
    train_y = y_scaler.min_max_scaler(train=fe_config.train_process)

In [12]:
# Persist the targets as they stand after the standardization cell above.
train_y_path = '../Data/meta/train_y_after_feature_engineering.pickle'
with open(train_y_path, 'wb') as out_file:
    pickle.dump(train_y, out_file)