## Install Library (for Colab)

In [None]:
# !pip install --pre pycaret

In [None]:
# !pip install cudf-cu11 dask-cudf-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# !pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com
# !pip install cugraph-cu11 --extra-index-url=https://pypi.ngc.nvidia.com

In [None]:
# !pip install catboost

## Data Load

In [0]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder

In [1]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

## Data Preprocessing

In [None]:
train_x1 = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class'])
train_x2 = train_df[['Y_Quality', 'Y_Class']]

test_x = test_df.drop(columns=['TIMESTAMP'])

In [None]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x1[i])
    train_x1[i] = le.transform(train_x1[i])

    for label in np.unique(test_x[i]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i])
print('Done.')

## Table divide

In [None]:
# train_df를 PRODUCT_CODE에 따라 분류

In [None]:
train_a31_103 = train_x1[((train_x1['LINE'] == 'T010306') | (train_x1['LINE'] == 'T010305')) & (train_x1['PRODUCT_CODE'] == 'A_31')]
train_a31_103 = train_a31_103.dropna(axis=1, how='all')

In [None]:
train_a31_503 = train_x1[((train_x1['LINE'] == 'T050304') | (train_x1['LINE'] == 'T050307')) & (train_x1['PRODUCT_CODE'] == 'A_31')]
train_a31_503 = train_a31_503.dropna(axis=1, how='all')

In [None]:
train_o31 = train_x1[(train_x1['PRODUCT_CODE'] == 'O_31')]
train_o31 = train_o31.dropna(axis=1, how='all')

In [None]:
train_t31 = train_x1[(train_x1['PRODUCT_CODE'] == 'T_31')]
train_t31 = train_t31.dropna(axis=1, how='all')

In [4]:
# test data에 대해서도 같은 작업 수행
test_a31_503 = test_x[(test_x['PRODUCT_CODE'] == 'A31') & ((test_x['LINE'] == 'T050304') | (test_x['LINE'] == 'T050307'))]
test_a31_503 = test_a31_503.dropna(axis=1, how='all')

test_a31_103 = test_x[(test_x['PRODUCT_CODE'] == 'A31') & ((test_x['LINE'] == 'T010305') | (test_x['LINE'] == 'T010306'))]
test_a31_103 = test_a31_103.dropna(axis=1, how='all')

test_o31 = test_x[test_x['PRODUCT_CODE'] == 'O31']
test_o31 = test_o31.dropna(axis=1, how='all')

test_t31 = test_x[test_x['PRODUCT_CODE'] == 'T31']
test_t31 = test_t31.dropna(axis=1, how='all')

In [5]:
test_a31_503.describe()

Unnamed: 0,Y_Class,Y_Quality,X_128,X_129,X_130,X_131,X_132,X_133,X_134,X_135,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
count,120.0,120.0,78.0,78.0,42.0,42.0,120.0,120.0,120.0,120.0,...,120.0,120.0,120.0,120.0,100.0,100.0,100.0,100.0,100.0,99.0
mean,0.933333,0.530676,14699.833333,14791.923077,10904.333333,10904.333333,0.195917,0.206917,0.195,0.2,...,165.656667,384.4,367.933817,351.266667,50.8073,53.6077,49.6062,51.6598,66.6497,1.0
std,0.785727,0.010773,8290.625484,8233.160897,9234.859281,9234.859281,0.005103,0.004637,0.0055,2.7871950000000004e-17,...,11.515497,3.336581,2.9162,3.613079,7.011828,8.13899,7.158917,8.913065,4.52781,0.0
min,0.0,0.513751,837.0,837.0,806.0,806.0,0.18,0.2,0.18,0.2,...,125.7,373.0,357.698113,342.0,32.12,31.7,32.56,30.49,61.67,1.0
25%,0.0,0.522361,7827.75,7830.5,2255.5,2255.5,0.19,0.2,0.19,0.2,...,161.7,383.0,367.311321,352.0,49.485,52.2,42.16,49.915,63.645,1.0
50%,1.0,0.528926,13493.5,13493.5,8513.5,8513.5,0.2,0.21,0.2,0.2,...,166.05,384.0,368.196366,352.0,53.425,55.925,51.46,56.175,65.14,1.0
75%,2.0,0.537116,23426.25,23426.25,19865.5,19865.5,0.2,0.21,0.2,0.2,...,170.575,384.0,369.023585,353.0,55.2875,58.975,55.03,57.175,67.115,1.0
max,2.0,0.578841,26596.0,26596.0,26155.0,26155.0,0.2,0.21,0.2,0.2,...,194.6,394.0,376.698113,362.0,60.24,68.66,60.41,59.93,79.75,1.0


In [6]:
# train_a31_type1 데이터셋의 결측치를 데이터프레임으로 출력

test_a31_503.isnull().sum().to_frame()

Unnamed: 0,0
Y_Class,0
Y_Quality,0
LINE,0
PRODUCT_CODE,0
X_128,42
...,...
X_2867,20
X_2868,20
X_2869,20
X_2870,20


In [7]:
test_a31_503.replace([np.inf, -np.inf], np.nan)

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_128,X_129,X_130,X_131,X_132,X_133,...,X_2862,X_2863,X_2864,X_2865,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871
0,1,0.533433,T050304,A_31,7813.0,7813.0,,,0.19,0.20,...,189.0,383.0,368.296296,353.0,39.34,40.89,32.56,34.09,77.77,
1,2,0.541819,T050307,A_31,,,19854.0,19854.0,0.20,0.21,...,185.6,383.0,367.735849,353.0,38.89,42.82,43.92,35.34,72.55,
2,1,0.531267,T050304,A_31,7815.0,7815.0,,,0.19,0.20,...,165.5,383.0,367.320755,353.0,39.19,36.65,42.47,36.53,78.35,
3,2,0.537325,T050307,A_31,,,19856.0,19856.0,0.20,0.21,...,165.8,384.0,369.188679,353.0,37.74,39.17,52.17,30.58,71.78,
4,1,0.531590,T050304,A_31,7817.0,7817.0,,,0.19,0.20,...,182.6,383.0,367.351852,352.0,38.70,41.89,46.93,33.09,76.97,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0,0.523465,T050304,A_31,11864.0,11864.0,,,0.19,0.20,...,176.0,384.0,367.333333,352.0,,,,,,1.0
116,0,0.522233,T050304,A_31,11898.0,11898.0,,,0.18,0.20,...,174.0,384.0,367.037037,352.0,50.88,53.23,52.44,56.28,66.83,1.0
117,0,0.522340,T050304,A_31,11920.0,11920.0,,,0.19,0.20,...,181.6,394.0,371.943396,353.0,51.71,59.64,54.61,57.05,63.18,1.0
118,0,0.524022,T050304,A_31,14810.0,14810.0,,,0.19,0.20,...,168.7,384.0,369.811321,353.0,49.47,53.07,50.89,55.10,66.49,1.0


In [8]:
test_a31_503.isnull().sum().to_frame()

Unnamed: 0,0
Y_Class,0
Y_Quality,0
LINE,0
PRODUCT_CODE,0
X_128,42
...,...
X_2867,20
X_2868,20
X_2869,20
X_2870,20


In [9]:
# test_a31_503 = test_a31_503.dropna(axis=1, how='any')

In [10]:
# NaN 값 0으로 채우기
train_a31_503 = train_a31_503.fillna(0)
train_a31_103 = train_a31_103.fillna(0)
train_o31 = train_o31.fillna(0)
train_t31 = train_t31.fillna(0)

test_a31_503 = test_a31_503.fillna(0)
test_a31_103 = test_a31_103.fillna(0)
test_o31 = test_o31.fillna(0)
test_t31 = test_t31.fillna(0)

In [11]:
# train_a31_type1.describe()

## Create Model (with Pycaret)

In [12]:
from pycaret import regression as reg

# model setup
setup_regression = reg.setup(data = train_a31_503, target = 'Y_Quality')

print("Complete")

ValueError: Input X contains infinity or a value too large for dtype('float64').

## Model Comparison & Selection & Tuning

In [None]:
# 모델 성능 비교하는 부분 (스킵)

reg.models()

In [None]:
# 성능이 매우 안좋거나 동작 시간이 너무 오래 걸리는 모델 제외
best_model_regression = reg.compare_models(exclude = ['br', 'lar', 'rf', 'et', 'par', 'huber', 'catboost'])

# best_model_regression = reg.compare_models(exclude = ['br', 'lar', 'rf', 'et', 'par', 'huber', 'catboost'], n_select = 5) # 여러 개 섞어서 쓸 때
# best_model_regression = reg.create_model('gbr') # create_model 사용 시

In [None]:
# 모델 튜닝
tuned_model_regression = reg.tune_model(best_model_regression, choose_better = True)

In [None]:
# 모델 앙상블 (여러 모델 사용 시)
# blended = blend_models(estimator_list=best3models, fold=10, method='soft')

In [None]:
final_model_regression_a31_503 = reg.finalize_model(tuned_model_regression)

In [None]:
# 반복
setup_regression = reg.setup(data = train_a31_103, target = 'Y_Quality')

best_model_regression = reg.compare_models(exclude = ['br', 'lar', 'rf', 'et', 'par', 'huber', 'catboost'])

tuned_model_regression = reg.tune_model(best_model_regression, choose_better = True)

final_model_regression_a31_103 = reg.finalize_model(tuned_model_regression)

print("Complete")

In [None]:
setup_regression = reg.setup(data = train_o31, target = 'Y_Quality')

best_model_regression = reg.compare_models(exclude = ['br', 'lar', 'rf', 'et', 'par', 'huber', 'catboost'])

tuned_model_regression = reg.tune_model(best_model_regression, choose_better = True)

final_model_regression_o31 = reg.finalize_model(tuned_model_regression)

print("Complete")

In [None]:
setup_regression = reg.setup(data = train_t31, target = 'Y_Quality')

best_model_regression = reg.compare_models(exclude = ['br', 'lar', 'rf', 'et', 'par', 'huber', 'catboost'])

tuned_model_regression = reg.tune_model(best_model_regression, choose_better = True)

final_model_regression_t31 = reg.finalize_model(tuned_model_regression)

print("Complete")

In [None]:
from pycaret import classification as cls

# model setup
setup_classification = cls.setup(data = train_x2, target = 'Y_Class',)

print("Complete")

In [None]:
# 모델 성능 비교하는 부분

cls.models()

In [None]:
best_model_classification = cls.compare_models()

In [None]:
# 모델 튜닝
tuned_model_classification = cls.tune_model(best_model_classification)

In [None]:
# 모델 학습
final_model_classification = cls.finalize_model(tuned_model_classification)

## Save & Load Model

In [None]:
# save_model(blended, './blended')

In [None]:
# model = load_model('./blended')

## Predict

In [None]:
# pred_y = predict_model(model, data=test_data)

In [None]:
# Quality 예측
pred_y_a31_503 = reg.predict_model(final_model_regression_a31_503, data=test_a31_503)
pred_y_a31_103 = reg.predict_model(final_model_regression_a31_103, data=test_a31_103)
pred_y_o31 = reg.predict_model(final_model_regression_o31, data=test_o31)
pred_y_t31 = reg.predict_model(final_model_regression_t31, data=test_t31)

In [None]:
# 예측 결과를 하나의 데이터프레임으로 합침
pred_y1 = pd.concat([pred_y_a31_503, pred_y_a31_103, pred_y_o31, pred_y_t31])

In [None]:
# PRODUCT_ID를 기준으로 오름차순 정렬
pred_y1 = pred_y1.sort_values(by=['PRODUCT_ID'])

In [None]:
# Y_Class만 추출
pred_regression_y = pd.DataFrame(pred_y1['prediction_label'])

In [None]:
pred_regression_y.rename(columns={'prediction_label':'Y_Quality'}, inplace = True)

In [None]:
pred_y = cls.predict_model(final_model_classification, data=pred_y1)

## Submission

In [None]:
pred_y

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['Y_Class'] = pd.DataFrame(pred_y['prediction_label'])
submit.to_csv('./data/submission_blackcows_divide.csv', index=False)