# Model Predict
---
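Describe: Load the 2022 test set, run it through the feature-engineering pipeline configured in `model_predict_config`, and save the engineered features to `../Data/meta/test_x_after_feature_engineering.pickle`.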

---
## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

import model_predict_config as mp_config
import feature_engineering_config as fe_config

# feature engineering
from feature_engineering import DealWithMissingValue, VariableTransformation, OutlierDetection,\
FeatureSelection, CreateGroupFeatureFromEachCol, CreateGroupFeatureFromAllCol, GeneratePolynomialFeatures, ReduceDimensionPCA, Standardization


## 2. Read xlsx File

In [2]:
# Raw test-set features, read from the v1 test workbook.
test_x = pd.read_excel(io='../Data/2022-test-v1.xlsx')

In [3]:
# Preview the first five raw rows.
test_x.head(n=5)

Unnamed: 0,clean_temp,clean_ec,clean_ph4,clean_ph5,clean_ph7,clean_ph8,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,...,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_pm25,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm10,env_rpi15_pm25,env_rpi15_temp
0,41.9,12.5,,,,7.0,820.26,1266.29,882.58,630.71,...,26.87,48.62,38.0,34.57,43.95,0.5,30.37,53.0,43.37,34.91
1,41.1,10.67,,,,6.75,810.2,1266.53,885.96,630.76,...,17.71,30.14,24.42,36.16,41.48,0.42,13.71,21.42,18.14,37.81
2,41.1,10.67,,,,6.75,809.84,1266.31,885.65,630.65,...,17.71,30.14,24.42,36.16,41.48,0.42,13.71,21.42,18.14,37.81
3,50.2,12.67,8.6,7.8,7.3,7.13,848.06,1454.1,1000.42,899.14,...,26.5,49.37,39.37,34.71,48.44,1.25,27.12,45.5,38.0,34.79
4,50.2,12.67,8.6,7.8,7.3,7.13,848.06,1454.1,1000.42,899.14,...,26.5,49.37,39.37,34.71,48.44,1.25,27.12,45.5,38.0,34.79


In [4]:
# (rows, columns) of the raw test frame before any feature engineering.
test_x.shape

(100, 125)

## 3. Pipeline

In [5]:
# variable transformation
# Recompute, from the training features, which columns are strongly skewed.
# This value is only used for the inspection cell that follows: the pipeline
# cell later rebinds `large_skew_bool` to the mask stored in
# ../Model/variable_transformation.pickle.

# Columns removed from the training workbook to obtain the feature frame
# (presumably the prediction targets - confirm against the training notebook).
non_feature_cols = [f'sensor_point{i}_i_value' for i in range(5, 11)]

# Read and drop in one expression so no second full-size frame stays alive.
train_x = pd.read_excel('../Data/2022-train-v2.xlsx').drop(columns=non_feature_cols)

# numeric_only=True keeps this working on pandas >= 2.0, where the default
# became False and skew() raises TypeError on non-numeric columns. It also
# matches the numeric-column selection the pipeline applies to test_x.
large_skew_bool = train_x.skew(numeric_only=True).abs() > fe_config.skew_threshold


In [6]:
# Inspect the skew flags of the first ten columns.
large_skew_bool.iloc[:10]

clean_temp           True
clean_ec             True
clean_ph4           False
clean_ph5            True
clean_ph7           False
clean_ph8            True
clean_pressure11     True
clean_pressure12     True
clean_pressure21     True
clean_pressure22     True
dtype: bool

In [7]:
# Shared pipeline settings, read once from the prediction config.
# NOTE(review): `train_process` is forwarded as `train=` to every
# feature-engineering step below; presumably False at prediction time so the
# steps reuse fitted state - confirm in model_predict_config.
train_process, random_state = mp_config.train_process, mp_config.random_state

In [8]:
# Step 1 - define NaN: optionally treat literal zeros as missing readings.
if mp_config.replace_zero_to_nan:
    test_x = test_x.replace({0: np.nan})


# Step 2 - missing values: the strategy comes from the config; any other
# setting leaves the frame untouched.
missing_value_mode = mp_config.deal_with_missing_value
if missing_value_mode == 'drop_na':
    test_x = DealWithMissingValue(test_x).drop_na()
elif missing_value_mode == 'imputation':
    test_x = DealWithMissingValue(test_x).imputation(strategy='most_frequent', train=train_process)
elif missing_value_mode == 'k_neighbors_regressor':
    test_x = DealWithMissingValue(test_x).iterative_imputer(method='k_neighbors_regressor', train=train_process)


# Step 3 - variable transformation: the pickled mask replaces any
# `large_skew_bool` computed earlier in the notebook.
# NOTE(review): pickle.load can execute arbitrary code - only load artifacts
# written by this project's own training run.
with open('../Model/variable_transformation.pickle', 'rb') as mask_file:
    large_skew_bool = pickle.load(mask_file)

# Presumably a boolean mask aligned with the numeric columns of the frame -
# confirm against the notebook that writes the pickle.
skewed_cols = test_x._get_numeric_data().columns[large_skew_bool]
for col in skewed_cols:
    test_x[col] = VariableTransformation(test_x, col=col).transform(
        method=mp_config.variable_transformation,
        train=train_process,
    )


# Step 4 - group features: both k-means calls share the same options.
kmeans_options = dict(
    standardization=mp_config.kmeans_standardization,
    k_range=mp_config.kmeans_k_range,
    random_state=random_state,
    parallel=True,
    parallel_verbose=0,
    train=train_process,
)

# One cluster label computed from all columns together, stored as text.
all_col_groups = CreateGroupFeatureFromAllCol(test_x).kmeans_with_auto_k(**kmeans_options)
test_x['group'] = all_col_groups.astype(str)

# One cluster label per existing column (everything except 'group'); the
# column list is fixed before the loop, so the new '*_group' columns are not
# themselves clustered.
for col in [name for name in test_x.columns if name != 'group']:
    each_col_groups = CreateGroupFeatureFromEachCol(test_x[[col]]).kmeans_with_auto_k(**kmeans_options)
    test_x[col + '_group'] = each_col_groups.astype(str)


# Step 5 - polynomial and interaction features (optional).
if mp_config.generate_polynomial_features:
    test_x = GeneratePolynomialFeatures(test_x).get_dataframe(
        degree=mp_config.pf_degree,
        interaction_only=mp_config.pf_interaction_only,
        train=train_process,
    )


# Step 6 - dimensionality reduction (optional).
# NOTE(review): 0.99 is presumably the explained-variance ratio to keep, as in
# scikit-learn's PCA - confirm in ReduceDimensionPCA.
if mp_config.reduce_dimension:
    test_x = ReduceDimensionPCA(test_x).pca(n_components=0.99, train=train_process)


# Step 7 - standardization: any other config value skips scaling.
scaler_name = mp_config.standardization
if scaler_name == 'standard_scaler':
    test_x = Standardization(test_x, prefix='x').standard_scaler(train=train_process)
elif scaler_name == 'min_max_scaler':
    test_x = Standardization(test_x, prefix='x').min_max_scaler(train=train_process)


# Step 8 - Feature Selection: keep only the columns listed in the config.
test_x = test_x[mp_config.feature_select]

In [9]:
# Preview the first five rows after feature engineering and selection.
test_x.head(n=5)

Unnamed: 0,clean_pressure31,clean_pressure41,clean_pressure72,clean_pressure81,clean_pressure91,clean_pressure92,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,...,painting_g9_act_hvc_group,painting_g10_act_hvv_group,painting_g10_act_hvc_group,painting_g11_act_a_air_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_pm10_group,env_rpi07_pm25_group,env_rpi14_pm1_group,env_rpi15_pm1_group
0,-0.870386,2.370398,2.908424,1.730852,-0.536805,-3.369914,0.830347,1.887407,1.523932,3.588046,...,4,4,4,4,4,4,4,4,4,4
1,-3.14683,2.370398,2.762858,1.471067,-1.004266,-3.362543,0.831111,1.901039,1.495959,2.018955,...,4,4,4,4,4,4,4,4,4,4
2,-3.180098,2.370398,2.762858,1.493588,-1.029611,-3.363904,0.831315,1.889676,1.491671,2.00929,...,4,4,4,4,4,4,4,4,4,4
3,-0.611778,2.464574,1.150083,1.410034,-0.480847,1.107103,0.815411,0.878227,2.036491,1.393034,...,4,4,4,4,4,4,4,4,4,4
4,-0.611778,2.464574,1.150083,1.410034,-0.480847,1.107103,0.815411,0.878227,2.036491,1.393034,...,4,4,4,4,4,4,4,4,4,4


In [10]:
# Persist the engineered test features as a pickle under ../Data/meta.
engineered_path = '../Data/meta/test_x_after_feature_engineering.pickle'
with open(engineered_path, mode='wb') as out_file:
    pickle.dump(test_x, out_file)