# Model Prediction
---
Description:

This notebook walks through the processing of the test data. At the end, you get two sets of predictions, each produced by a different method.
The data flow is controlled by the settings in `model_predict_config.py`.

---
## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

import model_predict_config as mp_config
import feature_engineering_config as fe_config

# feature engineering
from feature_engineering import DealWithMissingValue, VariableTransformation, OutlierDetection,\
FeatureSelection, CreateGroupFeatureFromEachCol, CreateGroupFeatureFromAllCol, GeneratePolynomialFeatures, ReduceDimensionPCA, Standardization

## 2. Read xlsx File

In [2]:
# Load the test features from the Excel workbook into a DataFrame.
test_x = pd.read_excel('../Data/2022-test-v1.xlsx')

In [3]:
test_x.head()

Unnamed: 0,clean_temp,clean_ec,clean_ph4,clean_ph5,clean_ph7,clean_ph8,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,...,env_rpi14_pm1,env_rpi14_pm10,env_rpi14_pm25,env_rpi14_temp,env_rpi15_hum,env_rpi15_lux,env_rpi15_pm1,env_rpi15_pm10,env_rpi15_pm25,env_rpi15_temp
0,41.9,12.5,,,,7.0,820.26,1266.29,882.58,630.71,...,26.87,48.62,38.0,34.57,43.95,0.5,30.37,53.0,43.37,34.91
1,41.1,10.67,,,,6.75,810.2,1266.53,885.96,630.76,...,17.71,30.14,24.42,36.16,41.48,0.42,13.71,21.42,18.14,37.81
2,41.1,10.67,,,,6.75,809.84,1266.31,885.65,630.65,...,17.71,30.14,24.42,36.16,41.48,0.42,13.71,21.42,18.14,37.81
3,50.2,12.67,8.6,7.8,7.3,7.13,848.06,1454.1,1000.42,899.14,...,26.5,49.37,39.37,34.71,48.44,1.25,27.12,45.5,38.0,34.79
4,50.2,12.67,8.6,7.8,7.3,7.13,848.06,1454.1,1000.42,899.14,...,26.5,49.37,39.37,34.71,48.44,1.25,27.12,45.5,38.0,34.79


In [4]:
test_x.shape

(100, 125)

## 3. Pipeline

In [5]:
# Bind the run-wide settings from the prediction config to short names;
# both are passed to the feature-engineering helpers in the pipeline cell.
random_state = mp_config.random_state
train_process = mp_config.train_process

In [6]:
# Feature-engineering pipeline for the test set.
# Each step is switched by a setting in `mp_config` and is called with
# `train=train_process`.
# NOTE(review): presumably `train_process` is False here so the helpers reuse
# artifacts fitted during training rather than refitting on test data --
# confirm against feature_engineering.py.

# define nan
if mp_config.replace_zero_to_nan:
    test_x = test_x.replace({0: np.nan})

# missing values
# Any setting other than the three handled below leaves `test_x` unchanged.
if mp_config.deal_with_missing_value == 'drop_na':
    test_x = DealWithMissingValue(test_x).drop_na()
elif mp_config.deal_with_missing_value == 'imputation':
    test_x = DealWithMissingValue(test_x).imputation(
        strategy='most_frequent',
        train=train_process,
    )
elif mp_config.deal_with_missing_value == 'k_neighbors_regressor':
    test_x = DealWithMissingValue(test_x).iterative_imputer(
        method='k_neighbors_regressor',
        train=train_process,
    )

# variable transformation
# NOTE: pickle.load can execute arbitrary code; only load artifacts that were
# written by this project's own training run.
with open('../Model/variable_transformation.pickle', 'rb') as f:
    large_skew_bool = pickle.load(f)

# `large_skew_bool` indexes the numeric columns to pick the ones to transform.
# NOTE(review): `_get_numeric_data()` is a private pandas API. It is kept
# because the pickled selector presumably lines up with exactly this column
# selection -- confirm before replacing it with `select_dtypes`.
for col in test_x._get_numeric_data().columns[large_skew_bool]:
    test_x[col] = VariableTransformation(test_x, col=col).transform(
        method=mp_config.variable_transformation,
        train=train_process,
    )

# Keyword arguments shared by both k-means grouping steps below.
kmeans_kwargs = dict(
    standardization=mp_config.kmeans_standardization,
    k_range=mp_config.kmeans_k_range,
    random_state=random_state,
    parallel=True,
    parallel_verbose=0,
    train=train_process,
)

# create group feature
test_x['group'] = CreateGroupFeatureFromAllCol(test_x).kmeans_with_auto_k(
    **kmeans_kwargs
).astype(str)

# convert each numeric data to category data
# Iterate over a snapshot of the columns: the loop body adds a new
# `<col>_group` column to `test_x` on every pass.
for col in list(test_x.columns):
    if col == 'group':
        continue
    test_x[col + '_group'] = CreateGroupFeatureFromEachCol(
        test_x[[col]]
    ).kmeans_with_auto_k(**kmeans_kwargs).astype(str)

# generate polynomial and interaction features
if mp_config.generate_polynomial_features:
    test_x = GeneratePolynomialFeatures(test_x).get_dataframe(
        degree=mp_config.pf_degree,
        interaction_only=mp_config.pf_interaction_only,
        train=train_process,
    )

# dimensionality reduction
if mp_config.reduce_dimension:
    test_x = ReduceDimensionPCA(test_x).pca(n_components=0.99, train=train_process)

# standardization
# Any other setting leaves the features unscaled.
if mp_config.standardization == 'standard_scaler':
    test_x = Standardization(test_x, prefix='x').standard_scaler(train=train_process)
elif mp_config.standardization == 'min_max_scaler':
    test_x = Standardization(test_x, prefix='x').min_max_scaler(train=train_process)

# Feature Selection
# Keep only what `mp_config.feature_select` selects (presumably the list of
# feature columns chosen at training time -- confirm against the config).
test_x = test_x[mp_config.feature_select]

In [7]:
test_x.head()

Unnamed: 0,clean_pressure11,clean_pressure12,clean_pressure21,clean_pressure22,clean_pressure23,clean_pressure31,clean_pressure33,clean_pressure41,clean_pressure42,clean_pressure51,...,painting_g8_act_hvv_group,painting_g9_act_a_air_group,painting_g9_act_t_air_group,painting_g9_act_hvv_group,painting_g10_act_hvc_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_pm25_group,env_rpi15_lux_group,env_rpi15_pm25_group
0,0.994488,-1.505209,-1.857353,-2.289067,-2.387218,-0.8643,-2.302365,2.291633,-2.614809,1.967087,...,4,4,4,4,4,4,4,4,4,0
1,0.827452,-1.5042,-1.83713,-2.288962,-2.344703,-3.152051,-2.31176,2.339435,-2.609028,1.136114,...,4,4,4,4,4,4,4,4,4,4
2,0.821558,-1.505125,-1.839048,-2.289193,-2.346529,-3.185647,-2.311506,2.339435,-2.607978,1.061318,...,4,4,4,4,4,4,4,4,4,4
3,1.47982,0.8004,1.168644,1.741143,-0.13777,-0.605238,0.176959,2.387237,2.56899,0.694179,...,4,4,4,4,4,4,4,4,4,3
4,1.47982,0.8004,1.168644,1.741143,-0.13777,-0.605238,0.176959,2.387237,2.56899,0.694179,...,4,4,4,4,4,4,4,4,4,3


## 4. Save test data after feature engineering

In [8]:
# Persist the engineered test features so later steps can reload them
# without rerunning the pipeline.
engineered_path = '../Data/meta/test_x_after_feature_engineering.pickle'
with open(engineered_path, 'wb') as out_file:
    pickle.dump(test_x, out_file)

## 5. Multi-output Model and Single Output Model

* Multi-output Model

In [11]:
# Restore the pickled multi-output model (one model predicting all targets).
# NOTE: only unpickle model files produced by this project's training run.
multi_model_path = '../Model/catboost_all_20220913134806.pickle'
with open(multi_model_path, 'rb') as model_file:
    muti_output_model = pickle.load(model_file)

* Single output Model

In [12]:
# Restore the pickled collection of single-output models; it is indexed by
# target column name further down, with the estimator under the 'model' key.
single_models_path = '../Model/catboost_single_20220913134806.pickle'
with open(single_models_path, 'rb') as model_file:
    single_output_model_dic = pickle.load(model_file)

## 6. Load y Scaler Model

In [13]:
# Restore the pickled target scaler; its `inverse_transform` is applied to
# the model outputs in the prediction cells.
scaler_path = '../Model/standard_scaler_y.pickle'
with open(scaler_path, 'rb') as scaler_file:
    scaler_y = pickle.load(scaler_file)

## 7. Prediction

* Multi-output Model

In [14]:
# Predict all targets at once, then pass the result through the y scaler's
# inverse transform (presumably back to the original target units).
prediction_muti = scaler_y.inverse_transform(
    muti_output_model.predict(test_x)
)

* Single output Model

In [15]:
# Restore the pickled list of target column names; it fixes the order in
# which the single-output predictions are stacked.
target_cols_path = '../Model/target_col_list.pickle'
with open(target_cols_path, 'rb') as cols_file:
    test_y_col = pickle.load(cols_file)

In [16]:
# Run each target's own model on the test features and reshape every result
# to a single column, so the outputs can be joined side by side in the
# order given by `test_y_col`.
array_li = [
    single_output_model_dic[target]['model'].predict(test_x).reshape(-1, 1)
    for target in test_y_col
]
# One row per sample, one column per target.
all_res = np.hstack(array_li)

In [17]:
# Pass the stacked single-model predictions through the y scaler's inverse
# transform, as is done for the multi-output predictions.
prediction_single = scaler_y.inverse_transform(all_res)

In [18]:
prediction_muti

array([[ 53.65818671,  64.10623683,  76.82469694,  47.33872365,
         63.97590392,  58.30788958],
       [ 64.01180614,  85.01512415, 100.70627372,  51.65203596,
         79.71400649,  78.30908306],
       [ 64.04388225,  85.05094042, 100.39763656,  51.57262284,
         79.21756163,  78.40302331],
       [ 73.60210625,  59.61104616,  80.06147895,  48.93761145,
         65.94845948,  69.71019712],
       [ 73.60210625,  59.61104616,  80.06147895,  48.93761145,
         65.94845948,  69.71019712],
       [ 74.2672145 ,  59.01809718,  81.32219743,  48.76188285,
         67.47084586,  70.17041115],
       [ 68.31813388,  63.71162735,  86.55829562,  53.03003534,
         68.54925177,  71.34434843],
       [ 78.60262299,  89.39787883, 108.15822501,  74.01345807,
         86.50157816,  90.65908064],
       [ 76.36041911,  82.63683567, 115.99728992,  75.02003001,
         81.53846445,  86.15541539],
       [ 77.56564945,  82.88993236, 117.64547865,  78.28241725,
         82.36872281,  86.9

In [19]:
prediction_single

array([[ 52.14704294,  66.321226  ,  90.84475956,  54.76267482,
         73.94940352,  71.69725534],
       [ 61.70683541,  75.15891026,  95.22505696,  47.00660349,
         78.18630536,  79.24943191],
       [ 60.49783199,  75.85050379,  95.08817897,  47.61481468,
         79.77926102,  79.4061594 ],
       [ 79.11604713,  59.30716052,  90.14095329,  53.22403907,
         68.10063989,  79.0113036 ],
       [ 79.11604713,  59.30716052,  90.14095329,  53.22403907,
         68.10063989,  79.0113036 ],
       [ 78.50487646,  58.03349348,  89.69281892,  52.7288354 ,
         68.9032474 ,  79.33230987],
       [ 71.89947357,  59.9299992 ,  91.17231753,  50.59462904,
         70.23812837,  77.87327267],
       [ 80.39341835,  75.27167392,  97.28060274,  64.60740844,
         83.72870641,  90.17570555],
       [ 80.97445075,  67.08701065, 105.09629117,  70.02913368,
         78.22199183,  86.18300026],
       [ 80.67512126,  67.77160726, 104.58070545,  83.29538053,
         79.12404623,  85.9