# Remove Unimportant Variables and Ensemble
- catboost 결과 중요하지 않은 상위 99개 변수를 제거한 모델
- 필요없는 열을 제거하자, score가 소폭 향상

### Packages Loading

In [1]:
# basic packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rc('font', family = 'Malgun Gothic')

# catboostregressor
from catboost import CatBoostRegressor, Pool

# RMSE
from sklearn.metrics import mean_squared_error

# r2_score
from sklearn.metrics import r2_score

# Seed Python's built-in `random` module with a fixed value; the same number
# is kept in `random_seed` for reuse.
import random

random_seed = 2020
random.seed(random_seed)

### preprocessing

#### preprocessing train set

In [2]:
# Load the raw training table and forward-fill missing values in file order.
# NOTE(review): the fill runs before the sort by "datetime" below (same order
# as before) - confirm the CSV is already chronological.
train = pd.read_csv("C:/dust/data/train.csv")
train = train.ffill()  # replaces the deprecated fillna(method='ffill')
train = train.drop(columns=["Unnamed: 0", "PM25"])
train = train.sort_values(by=["datetime"])

# Target frame: PM10 per timestamp, with the timestamp exposed as "forecast".
# rename() returns a new frame, so nothing is written in place onto a slice
# of `train` (the in-place rename is what raised SettingWithCopyWarning).
train_y = train[["datetime", "PM10"]].rename(columns={"datetime": "forecast"})

# Feature frame: every column moved down one row, so each row carries the
# previous row's values; PM10 is renamed to "actual_PM10_before1".
train_x = train.rename(columns={"PM10": "actual_PM10_before1"}).shift(1)

# Put features and target side by side, then drop rows containing NaN
# (at least the first row, which has no predecessor after the shift).
train = pd.concat([train_x, train_y], axis=1).dropna().reset_index(drop=True)
train["datetime"] = pd.to_datetime(train["datetime"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_y.rename(columns = {"datetime":"forecast"}, inplace = True)


#### preprocessing test set

In [3]:
# Load the raw test table and forward-fill missing values in file order.
# NOTE(review): the fill runs before the sort by "datetime" below (same order
# as before) - confirm the CSV is already chronological.
test = pd.read_csv("C:/dust/data/test.csv")
test = test.ffill()  # replaces the deprecated fillna(method='ffill')
test = test.drop(columns=["Unnamed: 0", "PM25"])
test = test.sort_values(by=["datetime"])

# Target frame: PM10 per timestamp, with the timestamp exposed as "forecast".
# rename() returns a new frame, so nothing is written in place onto a slice
# of `test` (the in-place rename is what raised SettingWithCopyWarning).
test_y = test[["datetime", "PM10"]].rename(columns={"datetime": "forecast"})

# Feature frame: every column moved down one row, so each row carries the
# previous row's values; PM10 is renamed to "actual_PM10_before1".
test_x = test.rename(columns={"PM10": "actual_PM10_before1"}).shift(1)

# Put features and target side by side, then drop rows containing NaN
# (at least the first row, which has no predecessor after the shift).
test = pd.concat([test_x, test_y], axis=1).dropna().reset_index(drop=True)
test["datetime"] = pd.to_datetime(test["datetime"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_y.rename(columns = {"datetime":"forecast"}, inplace = True)


#### shifting 
- 1단위 차분시 3차시 correlation을 features로 사용

In [None]:
def _add_pm10_lags(frame, max_lag=3):
    """Append extra lagged PM10 columns to `frame` and drop incomplete rows.

    For each lag k in 2..max_lag a column "actual_PM10_before<k>" is added,
    holding "actual_PM10_before1" moved down by k - 1 rows. Rows that contain
    any NaN after the join (at least the first max_lag - 1 rows) are dropped.

    Parameters:
        frame: DataFrame with an "actual_PM10_before1" column; the lag columns
            are joined on the index after resetting theirs to 0..n-1.
        max_lag: highest lag number to create (default 3, i.e. before2 and
            before3).

    Returns:
        A new DataFrame: `frame` plus the lag columns, NaN rows removed.
    """
    base = frame["actual_PM10_before1"]
    lagged = [
        base.shift(lag - 1)
        .rename("actual_PM10_before" + str(lag))
        .reset_index(drop=True)
        for lag in range(2, max_lag + 1)
    ]
    return pd.concat([frame] + lagged, axis=1).dropna()


def _suffix_feature_columns(frame, suffix="_1"):
    """Return "datetime" plus every feature column renamed with `suffix`.

    "forecast" and "PM10" are left out. The suffix keeps these column names
    from colliding with other tables merged on "datetime" later.
    """
    features = frame.drop(columns=["forecast", "PM10", "datetime"])
    features.columns = [name + suffix for name in features.columns]
    return pd.concat([frame["datetime"], features], axis=1)


# Lagged PM10 values (before2, before3) for both splits. The previously
# computed but unused fourth shift has been removed.
train = _add_pm10_lags(train)
test = _add_pm10_lags(test)

# Feature tables for the modeling data set: every column except the target
# gets a "_1" suffix; "datetime" stays as the merge key.
concat_actual_train = _suffix_feature_columns(train)
concat_actual_test = _suffix_feature_columns(test)

# Timestamp + target, used as the base of the later merges.
train_for_concat = train[["datetime", "PM10"]]
test_for_concat = test[["datetime", "PM10"]]

#### Resid, Trend, Seasonal
- 앞전에 시계열 분해에서 얻었던 columns 별 Residual, Trend, Seasonal을 Modeling Data와 결합

In [13]:
# Read the residual, trend and seasonal tables for the train and test splits.
resid_train = pd.read_csv("C:/dust/model/resid_train.csv")
resid_test = pd.read_csv("C:/dust/model/resid_test.csv")
trend_train = pd.read_csv("C:/dust/model/trend_train.csv")
trend_test = pd.read_csv("C:/dust/model/trend_test.csv")
seasonal_train = pd.read_csv("C:/dust/model/seasonal_train.csv")
seasonal_test = pd.read_csv("C:/dust/model/seasonal_test.csv")

# Parse the merge key of every table into datetime values.
for component in (
    resid_train, resid_test,
    trend_train, trend_test,
    seasonal_train, seasonal_test,
):
    component["datetime"] = pd.to_datetime(component["datetime"])

# Start from timestamp + target and inner-join each component in front of
# the running table: residual first, then trend, then seasonal.
train, test = train_for_concat, test_for_concat
for part_train, part_test in (
    (resid_train, resid_test),
    (trend_train, trend_test),
    (seasonal_train, seasonal_test),
):
    train = part_train.merge(train, on="datetime", how="inner")
    test = part_test.merge(test, on="datetime", how="inner")

# Finally attach the suffixed feature tables on the same key.
train = train.merge(concat_actual_train, on="datetime", how="inner")
test = test.merge(concat_actual_test, on="datetime", how="inner")

#### Datetime Features
- Datetime type에서 각 시간에 해당하는 월,일,시간을 추출

In [17]:
# Add month, day and hour columns taken from "datetime" to both splits
# (train first, then test).
for frame in (train, test):
    stamps = frame["datetime"].dt
    frame["month"] = stamps.month
    frame["day"] = stamps.day
    frame["hour"] = stamps.hour

#### Log Transformation to Target
- Target값을 로그 변환을 취해줘서 outlier와 변동에 모델이 견고하도록 만듦

In [18]:
# Targets: PM10 as a one-column frame, transformed with log(1 + x).
train_y = np.log1p(train[["PM10"]])
test_y = np.log1p(test[["PM10"]])

# Features: everything except the timestamp and the target.
train_x = train.drop(columns=["datetime", "PM10"])
test_x = test.drop(columns=["datetime", "PM10"])

#### Categorical Pool To Learning Catboost
- Catboost 모델이 범주형 자료를 인식하도록 하기 위해선 features들을 CatPool시켜줘야함
- 범주형 변수에 해당하는 16방위와 10분위를 CatPool시킴

In [22]:
def _categorical_columns(columns):
    """Return the names in `columns` that contain "16방위" or "10분위".

    These substrings mark the columns this notebook treats as categorical.
    """
    return [name for name in columns if "16방위" in name or "10분위" in name]


# Detect the categorical columns of each split separately.
cart_train = _categorical_columns(train_x.columns)
cart_test = _categorical_columns(test_x.columns)

# Convert the categorical values to strings. Each split is converted using
# its own column list; previously the test frame was converted using the
# train list, which only works when both lists are identical.
for name in cart_train:
    train_x[name] = train_x[name].astype("str")
for name in cart_test:
    test_x[name] = test_x[name].astype("str")

# CatBoost pools with the categorical columns declared.
# NOTE(review): these pools are built before the feature selection further
# down and are not referenced by the visible modeling code - confirm whether
# they are still needed.
pool_train = Pool(train_x, train_y, cat_features = cart_train)
pool_test = Pool(test_x, cat_features = cart_test)

#### Remove unimportant variables
- 중요하지 않은 변수들은 과감하게 분석에서 제외한다.

In [None]:
# Restrict both feature frames to the column names listed in the first 99
# rows of the saved feature table.
# NOTE(review): the section heading talks about removing variables, while
# this keeps the first 99 listed names - confirm the row order of feature.csv.
feature = pd.read_csv("C:/dust/model/feature.csv")
col = feature["columns"].iloc[:99].tolist()
train_x, test_x = train_x[col], test_x[col]

### Modeling
- iterations = 2000, learning_rate = 0.1, depth = 10으로 catboost를 학습시켰으며, 나머지 파라미터는 기본값을 사용
- GPU로 학습

In [None]:
# Columns this notebook treats as categorical (names containing "16방위" or
# "10분위"). They were converted to strings earlier, so they must be declared
# to CatBoost; previously fit() received no cat_features and the model was
# never told these columns are categorical.
cat_cols = [name for name in train_x.columns if "16방위" in name or "10분위" in name]

# Modeling: fixed hyper-parameters, trained on GPU.
model = CatBoostRegressor(
    iterations = 2000,
    learning_rate = 0.1,
    depth = 10,
    random_state = 42,
    loss_function = 'RMSE',
    task_type = 'GPU',
)
model.fit(train_x, train_y, cat_features = cat_cols)

# Prediction (still on the log1p scale of the target).
predict_y = model.predict(test_x)

# Save the fitted model.
# NOTE(review): confirm this path is not shared with the all-features model.
model.save_model("C:/dust/model/catboost_model.cbm")

# Undo the log1p transform so the scores are computed on real PM10 values.
test_y = np.expm1(test_y)
predict_y = np.expm1(predict_y)

#### Variable importance
- 변수 중요도를 시각화하고, 어떠한 변수가 가장 영향을 많이 미치는지 파악

In [None]:
# Horizontal bar chart of the model's feature importances, with the features
# ordered by the ascending argsort of their scores.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)

ordered_names = train_x.columns[sorted_idx]
ordered_scores = feature_importance[sorted_idx]

fig = plt.figure(figsize = (10,10))
plt.barh(ordered_names, ordered_scores)
plt.xlabel("CatBoost Feature Importance")
plt.show()

In [30]:
# Collect feature names and scores into a table, order it by descending
# score, renumber the rows and write it to disk.
# NOTE(review): this writes to the relative path "feature.csv", while the
# selection cell reads "C:/dust/model/feature.csv" - confirm they are meant
# to be the same file.
feature_df = pd.DataFrame(
    {
        "columns": list(train_x.columns[sorted_idx]),
        "score": list(feature_importance[sorted_idx]),
    }
)
feature_df = feature_df.sort_values(by = "score", ascending = False)
feature_df = feature_df.reset_index(drop = True)
feature_df.to_csv("feature.csv", index = False)

#### result

In [None]:
# Print RMSE and r2 score.
# The RMSE is taken as the square root of the MSE rather than via
# mean_squared_error(..., squared=False), which newer scikit-learn releases
# deprecate and then remove.
print(np.sqrt(mean_squared_error(test_y, predict_y)))
print(r2_score(test_y, predict_y))

# Save this model's predictions under their own file name. Writing them to
# "predict_y_not_remove.csv" would overwrite the file the ensemble step loads
# as the baseline, making the ensemble an average of these predictions with
# themselves.
pd.DataFrame(predict_y).to_csv("C:/dust/model/predict_y_remove.csv", index = False)

In [39]:
# Plot the actual target and the model prediction on the same axes.
for series, label in ((test_y, "실제값"), (predict_y, "예측값")):
    plt.plot(series, label = label)
plt.legend()

### Ensemble
- 모든 변수를 사용한 모형과 중요하지 않은 변수를 삭제조치한 모형 2개를 앙상블
- 성능이 소폭 개선되었고, Outlier에 견고해지는 것을 확인할 수 있었음

In [None]:
# Read the stored baseline predictions (single column named "0").
predict_y_not_remove = pd.read_csv("C:/dust/model/predict_y_not_remove.csv")

# Ensemble: element-wise mean of the baseline and the current predictions.
baseline_pred = predict_y_not_remove["0"].values
predict_esb = (baseline_pred + predict_y) / 2

In [None]:
# Print RMSE and r2 score of the ensemble.
# The RMSE is taken as the square root of the MSE rather than via
# mean_squared_error(..., squared=False), which newer scikit-learn releases
# deprecate and then remove.
print(np.sqrt(mean_squared_error(test_y, predict_esb)))
print(r2_score(test_y, predict_esb))

In [None]:
# Plot the actual target and the ensemble prediction on the same axes.
for series, label in ((test_y, "실제값"), (predict_esb, "예측값")):
    plt.plot(series, label = label)
plt.legend()