In [1]:
import config
import ast
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import statistics

from utils.pandas_dataframe import grid_display
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from itertools import permutations
import gc

# validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import xgboost as xgb
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor


from IPython.display import display
%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [2]:
import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Show floats with two decimals and never truncate rows when a frame is rendered.
pd.options.display.float_format = lambda x: '%.2f' % x
pd.options.display.max_rows = None

In [3]:
def Col_types(Data):
    """Return a table of the columns of ``Data`` and their dtypes.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame whose column dtypes are listed.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``ColumnName`` and ``Type``, one row per column of
        ``Data``, sorted by ``Type``. The row labels are the original
        column positions.
    """
    dtype_table = Data.dtypes.to_frame().reset_index()
    dtype_table.columns = ['ColumnName', 'Type']
    return dtype_table.sort_values(by='Type')

def Missing_Counts(Data):
    """Summarise the null values per column of ``Data``.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, ordered by ascending
        null count, with columns ``ColumnName``, ``MissingCount`` and
        ``Percentage(%)`` (share of rows that are null, rounded to two
        decimals).
    """
    null_counts = Data.isnull().sum()
    # Keep only the columns that actually contain nulls, smallest count first.
    null_counts = null_counts[null_counts > 0].sort_values()
    n_rows = Data.shape[0]

    summary = pd.DataFrame({
        'ColumnName': null_counts.index,
        'MissingCount': null_counts.values,
    })
    summary['Percentage(%)'] = summary['MissingCount'].apply(
        lambda count: round(count / n_rows * 100, 2)
    )
    return summary

In [4]:
# Load the pickled object stored in test_x_after_feature_engineering.pickle into data_x.
# NOTE(review): pickle.load can execute arbitrary code; only open files from a trusted source.
with open('../Data/meta/test_x_after_feature_engineering.pickle', 'rb') as f:
    data_x = pickle.load(f)
    
# Bare last expression so the notebook renders the loaded object.
data_x

Unnamed: 0,clean_pressure31,clean_pressure41,clean_pressure72,clean_pressure81,clean_pressure91,clean_pressure92,clean_pressure102,oven_pa1,oven_pa2,oven_pb1,...,painting_g9_act_hvc_group,painting_g10_act_hvv_group,painting_g10_act_hvc_group,painting_g11_act_a_air_group,painting_g11_act_hvc_group,env_rpi05_temp_group,env_rpi07_pm10_group,env_rpi07_pm25_group,env_rpi14_pm1_group,env_rpi15_pm1_group
0,-0.87,2.37,2.91,1.73,-0.54,-3.37,0.83,1.89,1.52,3.59,...,4,4,4,4,4,4,4,4,4,4
1,-3.15,2.37,2.76,1.47,-1.0,-3.36,0.83,1.9,1.5,2.02,...,4,4,4,4,4,4,4,4,4,4
2,-3.18,2.37,2.76,1.49,-1.03,-3.36,0.83,1.89,1.49,2.01,...,4,4,4,4,4,4,4,4,4,4
3,-0.61,2.46,1.15,1.41,-0.48,1.11,0.82,0.88,2.04,1.39,...,4,4,4,4,4,4,4,4,4,4
4,-0.61,2.46,1.15,1.41,-0.48,1.11,0.82,0.88,2.04,1.39,...,4,4,4,4,4,4,4,4,4,4
5,-0.6,2.46,1.11,1.46,-0.48,1.05,0.82,0.85,2.0,0.84,...,4,4,4,4,4,4,4,4,4,4
6,-0.65,2.46,0.96,1.0,-0.9,0.75,0.81,1.13,1.84,0.18,...,4,4,4,4,4,4,4,4,4,4
7,-1.06,-2.15,-1.89,-3.07,-4.34,-3.49,0.78,-0.52,1.77,2.15,...,4,4,4,4,4,4,4,4,4,4
8,-0.47,-0.27,0.96,-0.53,0.48,0.38,-1.25,0.61,0.5,-0.2,...,4,4,4,4,4,4,4,4,4,4
9,-0.47,-0.27,0.94,-0.52,0.53,0.41,-1.25,0.56,0.49,-0.3,...,4,4,4,4,4,4,4,4,4,4


In [51]:
# Show the dtype of every column in data_x, sorted by dtype.
display(Col_types(data_x))

Unnamed: 0,ColumnName,Type
0,clean_pressure31,float64
29,painting_g7_act_hvv,float64
30,painting_g9_act_hvv,float64
32,painting_g11_act_hvv,float64
33,painting_g12_act_a_air,float64
34,env_rpi05_hum,float64
35,env_rpi05_pm1,float64
36,env_rpi07_hum,float64
37,env_rpi07_pm10,float64
38,env_rpi07_temp,float64


In [52]:
# One-hot encode every object-dtype column in a single pass.
# This replaces the per-column join/drop loop (which copied the whole frame for
# each encoded column and carried an unused enumerate index). Dummy columns are
# still named "<column>_<level>" and placed after the untouched columns.
object_cols = [col for col in data_x.columns if data_x[col].dtype == 'object']
if object_cols:
    data_x = pd.get_dummies(data_x, columns=object_cols)

In [53]:
# After one-hot encoding, some training columns do not exist in this data set;
# add each missing one as an all-zero uint8 column.
missing_train_cols = [col for col in config.x_train_columns if col not in data_x.columns]
for col in missing_train_cols:
    # Size the filler from the frame itself; a hard-coded length of 100 fails
    # for any data set that does not have exactly 100 rows.
    data_x[col] = np.zeros(len(data_x), dtype='uint8')

# Reorder the columns to match the training layout.
data_x = data_x[config.x_train_columns]

In [54]:
# Show the dtype of every column in data_x again, now that it has been aligned to config.x_train_columns.
display(Col_types(data_x))

Unnamed: 0,ColumnName,Type
54,painting_g8_act_a_air_group_1,uint8
78,painting_g11_act_a_air_group_3,uint8
77,painting_g11_act_a_air_group_2,uint8
76,painting_g11_act_a_air_group_1,uint8
75,painting_g11_act_a_air_group_0,uint8
74,painting_g10_act_hvc_group_5,uint8
73,painting_g10_act_hvc_group_4,uint8
72,painting_g10_act_hvc_group_3,uint8
71,painting_g10_act_hvc_group_2,uint8
70,painting_g10_act_hvc_group_1,uint8


In [55]:
# Standardize the X variables.
# NOTE(review): the scaler is fitted on the very data being scored rather than
# reusing one fitted on the training set, and the prediction cell below reads
# data_x, not data_x_S. Confirm whether this step is still needed.
scaler = StandardScaler()
data_x_S = scaler.fit_transform(data_x)

In [57]:
# Chained prediction: the per-target models are applied in config.order_list
# order, and each prediction is appended to the feature matrix before the next
# model is run.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# from a trusted source.
X_PT = data_x.copy()
predict_y_test = pd.DataFrame({})

# One model per target column (replaces the former hard-coded count of 6).
n_targets = len(config.data_y_col)

for k in range(n_targets):
    target_idx = config.order_list[k]
    target_col = config.data_y_col[target_idx]

    with open('../model/model_Y' + str(target_idx) + '(Albert).pickle', 'rb') as f:
        model = pickle.load(f)

    # Predict once and reuse the result; the model used to be run twice per
    # target on the same input.
    target_pred = model.predict(X_PT)

    # Store the prediction result.
    predict_y_test[target_col] = target_pred
    # Add the prediction to X as an input variable for the following models.
    X_PT[target_col] = target_pred

    # Release the model before the next one is loaded.
    del model

# Put the columns back into config.data_y_col order.
predict_y_test = predict_y_test[config.data_y_col]
display(predict_y_test)

with open('../Output/predict_y_test(Albert).pickle', 'wb') as f:
    pickle.dump(predict_y_test, f)

Unnamed: 0,sensor_point5_i_value,sensor_point6_i_value,sensor_point7_i_value,sensor_point8_i_value,sensor_point9_i_value,sensor_point10_i_value
0,-1.42,-0.72,-0.91,-1.27,-1.04,-0.98
1,-1.24,-0.57,-1.02,-1.73,-0.61,-0.81
2,-1.27,-0.58,-1.02,-1.73,-0.61,-0.82
3,-0.04,-1.08,-1.24,-1.53,-1.03,-0.6
4,-0.04,-1.08,-1.24,-1.53,-1.03,-0.6
5,-0.21,-1.08,-1.14,-1.51,-0.98,-0.51
6,-0.48,-1.02,-1.24,-1.53,-1.04,-0.65
7,0.38,-0.25,-0.73,-0.48,-0.3,-0.08
8,0.27,-0.42,0.14,0.11,-0.48,-0.38
9,0.14,-0.42,0.14,0.11,-0.48,-0.41
