In [None]:
import os
from pprint import pprint

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    make_scorer,
    roc_curve,
    auc,
    precision_recall_curve,
    recall_score,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import random
import string

from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
import optuna

import torch
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier, plot_metric
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Reusable scorer object wrapping f1_score in binary mode, with label 1 as the positive class.
f1_scorer = make_scorer(f1_score, pos_label=1, average = 'binary')

def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. The ``None`` default is forwarded to the
            scikit-learn metric functions as-is.
            NOTE(review): callers presumably always pass predictions - confirm.

    Returns:
        None. The metrics are only printed (labels of the printed lines are in
        Korean and are part of the program output).
    """
    class_order = [True, False]

    # Every metric is computed before anything is printed.
    confusion = confusion_matrix(y_test, y_pred, labels=class_order)
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=class_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=class_order)),
    )

    print("오차행렬:\n", confusion)
    for template, value in report_lines:
        print(template.format(value))

In [None]:
# Load the raw train / test tables from the CSV files under ./data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [None]:
def _extract_process_block(frame, process, shift_start):
    """Return the cleaned sub-table of ``frame`` for one process suffix.

    The same five steps are applied for every process and for train / test:

    1. keep the columns whose name matches ``_<process>``;
    2. drop columns that are entirely NaN;
    3. drop the 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value' column;
    4. for rows whose 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result'
       cell is 'OK' or NaN, move every value from column position
       ``shift_start`` onwards one column to the left (the last column of
       those rows becomes NaN);
    5. drop the 'WorkMode Collect Result' column.

    Args:
        frame: Full train or test DataFrame.
        process: Process suffix without the underscore ('Dam', 'Fill1', 'Fill2').
        shift_start: Positional index (after steps 1-3) of the first column
            that takes part in the left shift.

    Returns:
        A new DataFrame holding only the cleaned columns of that process.
    """
    collect_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_' + process
    judge_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_' + process
    workmode_col = 'WorkMode Collect Result_' + process

    block = frame.filter(regex='_' + process)
    block = block.dropna(axis=1, how='all')
    block = block.drop(columns=judge_col)

    # Rows that need re-alignment are recognised by a non-numeric marker
    # ('OK' or a missing value) sitting in the X-coordinate column.
    misaligned = block[collect_col].isin(['OK', np.nan])
    realigned = block[misaligned].iloc[:, shift_start:].shift(-1, axis=1).values
    block.loc[misaligned, block.columns[shift_start:]] = realigned

    return block.drop(columns=workmode_col)


# NOTE(review): the all-NaN column drop is evaluated separately for train and
# test, so the positional offsets below assume both frames lose the same
# columns - confirm against the data.

# train: split by process, re-align, then stitch back together with the target
dam = _extract_process_block(train, 'Dam', 24)
fill1 = _extract_process_block(train, 'Fill1', 14)
fill2 = _extract_process_block(train, 'Fill2', 24)
autoclave = train.filter(regex='_AutoClave')
target = train['target']

train = pd.concat([dam, fill1, fill2, autoclave, target], axis=1)

# test: same treatment (the test frame has no target column)
dam_test = _extract_process_block(test, 'Dam', 24)
fill1_test = _extract_process_block(test, 'Fill1', 14)
fill2_test = _extract_process_block(test, 'Fill2', 24)
autoclave_test = test.filter(regex='_AutoClave')

test = pd.concat([dam_test, fill1_test, fill2_test, autoclave_test], axis=1)

# Swap 전 좌표 평균보정

In [None]:
# Dam head X coordinate columns at stage 1 and stage 3.
x_stage1 = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'
x_stage3 = 'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'


def _apply_x_offsets(frame):
    """Shift the > 500 and < 500 value groups of both X columns in place.

    Each row mask is evaluated right before its own update, i.e. on the
    values left behind by the previous statement.
    """
    frame.loc[frame[x_stage1] > 500, x_stage1] += X_sum_up
    frame.loc[frame[x_stage3] > 500, x_stage3] -= X_sum_up

    frame.loc[frame[x_stage1] < 500, x_stage1] -= X_sum_down
    frame.loc[frame[x_stage3] < 500, x_stage3] += X_sum_down


# train: make both columns numeric
for x_col in (x_stage1, x_stage3):
    train[x_col] = train[x_col].astype(float)

# Before the swap: mean of each value group (< 500 and > 500), train only.
X_sum_down_1 = train.loc[train[x_stage1] < 500, x_stage1].mean()
X_sum_down_2 = train.loc[train[x_stage3] < 500, x_stage3].mean()
X_sum_up_1 = train.loc[train[x_stage1] > 500, x_stage1].mean()
X_sum_up_2 = train.loc[train[x_stage3] > 500, x_stage3].mean()

# Half of the gap between the stage-1 and stage-3 group means.
X_sum_down = (X_sum_down_1 - X_sum_down_2) / 2  # < 500: subtracted from stage 1, added to stage 3
X_sum_up = (X_sum_up_2 - X_sum_up_1) / 2  # > 500: added to stage 1, subtracted from stage 3

_apply_x_offsets(train)

# test: same conversion, then the offsets learned on train
for x_col in (x_stage1, x_stage3):
    test[x_col] = test[x_col].astype(float)

_apply_x_offsets(test)

In [None]:
# Y coordinate columns whose > 500 values are mirrored.
y_mirror_columns = [
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
]


def _mirror_high_values(frame, column, group_mean_sum):
    """Replace every value v > 500 of ``column`` by ``group_mean_sum - v`` in place."""
    high_rows = frame[column].astype(float) > 500
    frame.loc[high_rows, column] = group_mean_sum - frame.loc[high_rows, column]


# train: for each column, take the mean of the < 500 group and of the > 500
# group, then mirror the > 500 values using the sum of the two means.
y_group_mean_sums = {}
for y_col in y_mirror_columns:
    y_values = train[y_col].astype(float)
    low_mean = y_values[y_values < 500].mean()
    high_mean = y_values[y_values > 500].mean()

    y_group_mean_sums[y_col] = low_mean + high_mean
    _mirror_high_values(train, y_col, y_group_mean_sums[y_col])


# test: mirrored with the sums computed on train
for y_col in y_mirror_columns:
    _mirror_high_values(test, y_col, y_group_mean_sums[y_col])

# Swap

In [None]:
def swap_columns(df, condition, col1, col2):
    """Exchange the values of two columns on the selected rows, in place.

    Args:
        df: DataFrame to modify.
        condition: Boolean row selector; only rows where it is True are swapped.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        The same ``df`` object that was passed in (mutated).
    """
    # Read the pair in reversed order and write it back in the original order.
    reversed_values = df.loc[condition, [col2, col1]].values
    df.loc[condition, [col1, col2]] = reversed_values

    return df

# Dam columns exchanged between Stage1 and Stage3; the Stage3 name is obtained
# by replacing 'Stage1' in each entry.
# 'THICKNESS 1 Collect Result_Dam' <-> 'THICKNESS 3 Collect Result_Dam' is
# deliberately left out (it was disabled in the original cell).
dam_stage1_columns = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam',
    'Dispense Volume(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
    'Stage1 Circle1 Distance Speed Collect Result_Dam',
    'Stage1 Circle2 Distance Speed Collect Result_Dam',
    'Stage1 Circle3 Distance Speed Collect Result_Dam',
    'Stage1 Circle4 Distance Speed Collect Result_Dam',
    'Stage1 Line1 Distance Speed Collect Result_Dam',
    'Stage1 Line2 Distance Speed Collect Result_Dam',
    'Stage1 Line3 Distance Speed Collect Result_Dam',
    'Stage1 Line4 Distance Speed Collect Result_Dam',
    # Must come last (original author's note): this is the column the row
    # condition is derived from. The condition itself is computed once, before
    # any swap is performed.
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
]

# Train first, then test: swap on the rows whose Stage3 X coordinate is >= 200.
for frame in (train, test):
    condition = frame['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'].astype(float) >= 200

    for stage1_col in dam_stage1_columns:
        swap_columns(frame, condition, stage1_col, stage1_col.replace('Stage1', 'Stage3'))

# Last expression of the cell: shows the swapped test frame, as the original cell did.
test

In [None]:
# Fill1 columns exchanged between Stage1 and Stage2; the Stage2 name is
# obtained by replacing 'Stage1' in each entry.
fill1_stage12_columns = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    # Must come last (original author's note); the row condition below is
    # computed once, before any swap is performed.
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
]

# Train first, then test: swap on the rows whose Stage2 X coordinate is > 500.
for frame in (train, test):
    condition = frame['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'].astype(float) > 500

    for stage1_col in fill1_stage12_columns:
        swap_columns(frame, condition, stage1_col, stage1_col.replace('Stage1', 'Stage2'))

# Last expression of the cell: shows the swapped test frame, as the original cell did.
test

In [None]:
# Fill1 columns exchanged between Stage1 and Stage3; the Stage3 name is
# obtained by replacing 'Stage1' in each entry.
fill1_stage13_columns = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    # Must come last (original author's note); the row condition below is
    # computed once, before any swap is performed.
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
]

# Train first, then test: swap on the rows whose Stage1 X coordinate is < 200.
for frame in (train, test):
    condition = frame['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].astype(float) < 200

    for stage1_col in fill1_stage13_columns:
        swap_columns(frame, condition, stage1_col, stage1_col.replace('Stage1', 'Stage3'))

# Last expression of the cell: shows the swapped test frame, as the original cell did.
test

In [None]:
# Bind the df_* names to the same DataFrame objects (plain aliases, no copy),
# so changes made through either name are visible through the other.
df_train, df_test = train, test

# Type Change

In [None]:
# X coordinate columns that are cast to float on both frames.
type_change = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
]

for coord_col in type_change:
    for frame in (df_train, df_test):
        frame[coord_col] = frame[coord_col].astype(float)

# New Column

In [None]:
train, test = df_train, df_test

# Character window (start, stop) kept from each equipment string: a single
# character at index 15 for Dam and at index 17 for Fill1 / Fill2.
# NOTE(review): presumably this is the equipment number digit - confirm.
# NOTE: re-running this cell slices the already-shortened strings again.
equipment_windows = {
    'Equipment_Dam': (15, 16),
    'Equipment_Fill1': (17, 18),
    'Equipment_Fill2': (17, 18),
}

for frame in (train, test):
    for equipment_col, (start, stop) in equipment_windows.items():
        frame[equipment_col] = frame[equipment_col].str.slice(start, stop)

df_train, df_test = train, test

In [None]:
# Rows are marked abnormal when the value recorded for Dam, Fill1 and Fill2 differs.
def inconsistant(data, columnname, iwantthiscolumnsname, is_train = True):
    """Flag rows of ``data`` whose Dam / Fill1 / Fill2 values disagree.

    The flag column is set to 1 when any of the three pairwise comparisons of
    ``<columnname>_Dam``, ``<columnname>_Fill1`` and ``<columnname>_Fill2``
    differs, or when the flag was already 1; otherwise it is set to 0. The
    column is overwritten in place.

    Args:
        data: DataFrame to inspect and update. The comparisons are made on
            this frame itself, not on the notebook-level train / test frames.
        columnname: Column name prefix shared by the three process columns
            (without the ``_Dam`` / ``_Fill1`` / ``_Fill2`` suffix).
        iwantthiscolumnsname: Name of the existing flag column to update.
        is_train: Unused; accepted only so existing calls keep working.

    Returns:
        None.
    """
    dam_values = data[columnname + '_Dam']
    fill1_values = data[columnname + '_Fill1']
    fill2_values = data[columnname + '_Fill2']

    mismatch = (
        (dam_values != fill1_values)
        | (dam_values != fill2_values)
        | (fill1_values != fill2_values)
    )
    # A row flagged by an earlier call stays flagged.
    already_flagged = data[iwantthiscolumnsname] == 1

    data[iwantthiscolumnsname] = np.where(mismatch | already_flagged, 1, 0)
    
# Inconsistency flag, initialised to 0 for every row of both frames.
for frame in (df_train, df_test):
    frame['inconsistant'] = 0

# Column stems that are compared across their _Dam / _Fill1 / _Fill2 variants.
columnname = [
    'Equipment',
    'Receip No Collect Result',
    'Production Qty Collect Result',
    'PalletID Collect Result',
]

# Apply the check stem by stem; a row flagged for one stem stays flagged for the rest.
for stem in columnname:
    inconsistant(df_train, stem, 'inconsistant', is_train=True)
    inconsistant(df_test, stem, 'inconsistant', is_train=False)

In [None]:
# Tact times that are <= 0 or > 900 are treated as outliers: such rows get
# 'inconsistant' = 1, every other row keeps its current flag value.
for j in ['Machine Tact time Collect Result_Dam', 'Machine Tact time Collect Result_Fill1', 'Machine Tact time Collect Result_Fill2']:
    for frame in (df_train, df_test):
        out_of_range = (frame[j] <= 0) | (frame[j] > 900)
        frame['inconsistant'] = np.where(out_of_range, 1, frame['inconsistant'])

In [None]:
# For each AutoClave pressure step, build pressure * unit time * chamber temperature.
# The 1st step's unit-time column carries a doubled "1st Pressure" prefix in the data.
pressure_unit_time_cols = (
    ('1st', '1st Pressure 1st Pressure Unit Time_AutoClave'),
    ('2nd', '2nd Pressure Unit Time_AutoClave'),
    ('3rd', '3rd Pressure Unit Time_AutoClave'),
)

for frame in (df_train, df_test):
    for nth, unit_time_col in pressure_unit_time_cols:
        frame[f'{nth} Pressure x Time x Temp AutoClave'] = (
            frame[f'{nth} Pressure Collect Result_AutoClave']
            * frame[unit_time_col]
            * frame['Chamber Temp. Collect Result_AutoClave']
        )

In [None]:
# Stage-to-stage differences of the head coordinates:
#   Minus1* = Stage1 - Stage2, Minus2* = Stage2 - Stage3
# X-axis features have no tag in their name, Y-axis features carry a 'Y' tag.
for frame in (df_train, df_test):
    for name_tag, axis, proc in (
        ('', 'X', 'Dam'),
        ('', 'X', 'Fill1'),
        ('Y', 'Y', 'Dam'),
        ('Y', 'Y', 'Fill1'),
    ):
        stage1_pos, stage2_pos, stage3_pos = (
            frame[f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage_no}) Collect Result_{proc}']
            for stage_no in (1, 2, 3)
        )
        frame[f'Minus1{name_tag}_{proc}'] = stage1_pos - stage2_pos
        frame[f'Minus2{name_tag}_{proc}'] = stage2_pos - stage3_pos

    # Only the Dam Y differences are binarised: 1 when the gap exceeds 2 in magnitude, else 0.
    for binary_col in ('Minus1Y_Dam', 'Minus2Y_Dam'):
        frame[binary_col] = frame[binary_col].apply(lambda gap: 1 if abs(gap) > 2 else 0)

In [None]:
# Cast the Fill2 X coordinates of Stage2 and Stage1 to float (training frame only).
for stage_no in (2, 1):
    fill2_x_col = f'HEAD NORMAL COORDINATE X AXIS(Stage{stage_no}) Collect Result_Fill2'
    df_train[fill2_x_col] = df_train[fill2_x_col].astype(float)

# Column Drop

In [None]:
# Circle speeds: drop Circle2-4 of every Dam stage and keep Circle1 under a name without the index.
df_test = df_test.drop(columns=[
    f'Stage{stage_no} Circle{circle_no} Distance Speed Collect Result_Dam'
    for stage_no in (1, 2, 3)
    for circle_no in (2, 3, 4)
])
df_test = df_test.rename(columns={
    f'Stage{stage_no} Circle1 Distance Speed Collect Result_Dam': f'Stage{stage_no} Circle Distance Speed_Dam'
    for stage_no in (1, 2, 3)
})

# Original note: for Dam and Fill2 the Z values are the same -> which would mean that
# Fill1 is the process where the height sometimes wobbled.
# Keep only the Stage1 Z column of Dam and Fill2, renamed without the stage.
df_test = df_test.drop(columns=[
    f'HEAD NORMAL COORDINATE Z AXIS(Stage{stage_no}) Collect Result_{proc}'
    for proc in ('Dam', 'Fill2')
    for stage_no in (2, 3)
])
df_test = df_test.rename(columns={
    f'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_{proc}': f'HEAD NORMAL COORDINATE Z AXIS_{proc}'
    for proc in ('Fill2', 'Dam')
})

# Original note: Model.Suffix and Workorder are the same across processes.
# Drop the Fill1 / Fill2 / AutoClave copies and keep the Dam copy under the bare name.
for shared_field in ('Model.Suffix', 'Workorder'):
    df_test = df_test.drop(columns=[f'{shared_field}_{proc}' for proc in ('Fill1', 'Fill2', 'AutoClave')])
df_test = df_test.rename(columns={'Workorder_Dam': 'Workorder', 'Model.Suffix_Dam': 'Model.Suffix'})

# Remove columns for which no meaning could be found.
meaningless_cols = [
    f'{admin_field}_{proc}'
    for proc in ('Fill1', 'AutoClave', 'Fill2', 'Dam')
    for admin_field in ('Wip Line', 'Process Desc.', 'Insp. Seq No.', 'Insp Judge Code')
]
meaningless_cols.append('Equipment_AutoClave')
meaningless_cols += [f'{nth} Pressure Judge Value_AutoClave' for nth in ('1st', '2nd', '3rd')]
meaningless_cols += [
    f'GMES_ORIGIN_INSP_JUDGE_CODE {value_kind}_AutoClave'
    for value_kind in ('Collect Result', 'Judge Value', 'Unit Time')
]
meaningless_cols += [
    f'CURE {cure_point} POSITION {axis} Collect Result_Dam'
    for cure_point in ('END', 'STANDBY')
    for axis in ('X', 'Z', 'Θ')
]
df_test = df_test.drop(columns=meaningless_cols)

# Original note: Fill2 does not dispense resin; it is a UV-only step, so all of its
# head-position and dispensing columns are removed.
fill2_cols = [
    f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage_no}) Collect Result_Fill2'
    for axis in ('X', 'Y')
    for stage_no in (1, 2, 3)
]
fill2_cols.append('HEAD NORMAL COORDINATE Z AXIS_Fill2')
fill2_cols += [
    f'{head_position} Position {axis} Collect Result_Fill2'
    for head_position in ('HEAD Standby', 'Head Clean', 'Head Purge')
    for axis in ('X', 'Y', 'Z')
]
fill2_cols.append('DISCHARGED SPEED OF RESIN Collect Result_Fill2')
fill2_cols += [
    f'{resin_measure}(Stage{stage_no}) Collect Result_Fill2'
    for resin_measure in ('DISCHARGED TIME OF RESIN', 'Dispense Volume')
    for stage_no in (1, 2, 3)
]
df_test = df_test.drop(columns=fill2_cols)

# Original note: the speed should be the same on every line when things are normal.
# Per Dam stage, build a 0/1 "lines differ" flag and the sum of the four line speeds,
# then drop the raw per-line columns.
for stage_no in (1, 2, 3):
    speed_l1, speed_l2, speed_l3, speed_l4 = (
        df_test[f'Stage{stage_no} Line{line_no} Distance Speed Collect Result_Dam']
        for line_no in (1, 2, 3, 4)
    )
    df_test[f'Stage{stage_no} Line diffent Distance Speed_Dam'] = (
        (speed_l1 != speed_l2)
        | (speed_l1 != speed_l3)
        | (speed_l1 != speed_l4)
        | (speed_l3 != speed_l4)
    ).astype(int)
    df_test[f'Stage{stage_no} Line Sum Speed_Dam'] = speed_l1 + speed_l2 + speed_l3 + speed_l4

df_test = df_test.drop(columns=[
    f'Stage{stage_no} Line{line_no} Distance Speed Collect Result_Dam'
    for stage_no in (1, 2, 3)
    for line_no in (1, 2, 3, 4)
])

# Columns with a single unique value (per the original note); remove an entry here
# if you want to keep that column and look for meaning in it.
# Maps (column prefix, process) -> axes whose column is dropped.
single_value_axes = {
    ('CURE START POSITION', 'Dam'): ('X', 'Z', 'Θ'),   # X / Θ: determined by Equipment, fixed to one value
    ('HEAD Standby Position', 'Dam'): ('X', 'Y', 'Z'),
    ('Head Clean Position', 'Dam'): ('X', 'Y'),        # original note: Z depends on wobble (Z is not dropped)
    ('Head Purge Position', 'Dam'): ('X', 'Y'),
    ('Head Zero Position', 'Dam'): ('X',),
    ('HEAD Standby Position', 'Fill1'): ('X', 'Y', 'Z'),
    ('Head Clean Position', 'Fill1'): ('X', 'Y', 'Z'),
    ('Head Purge Position', 'Fill1'): ('X', 'Y'),
    ('CURE END POSITION', 'Fill2'): ('X', 'Θ'),
    ('CURE STANDBY POSITION', 'Fill2'): ('X', 'Z', 'Θ'),
    ('CURE START POSITION', 'Fill2'): ('X', 'Θ'),
}
df_test = df_test.drop(columns=[
    f'{position_prefix} {axis} Collect Result_{proc}'
    for (position_prefix, proc), axes in single_value_axes.items()
    for axis in axes
])

# Remove the AutoClave columns that look meaningless.
autoclave_cols = [
    f'Chamber Temp. {value_kind}_AutoClave'
    for value_kind in ('Collect Result', 'Judge Value', 'Unit Time')
]
autoclave_cols += [f'{nth} Pressure Collect Result_AutoClave' for nth in ('1st', '2nd', '3rd')]
autoclave_cols += [
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Unit Time_AutoClave',
    '3rd Pressure Unit Time_AutoClave',
]
df_test = df_test.drop(columns=autoclave_cols)

In [None]:
# Collapse columns that carry the same values into one.
# Original note: within a stage the Circle values are equal, so drop Circle2-4 of every
# Dam stage and keep Circle1 under a name without the index.
df_train = df_train.drop(columns=[
    f'Stage{stage_no} Circle{circle_no} Distance Speed Collect Result_Dam'
    for stage_no in (1, 2, 3)
    for circle_no in (2, 3, 4)
])
df_train = df_train.rename(columns={
    f'Stage{stage_no} Circle1 Distance Speed Collect Result_Dam': f'Stage{stage_no} Circle Distance Speed_Dam'
    for stage_no in (1, 2, 3)
})

# Original note: for Dam and Fill2 the Z values are the same -> which would mean that
# Fill1 is the process where the height sometimes wobbled.
# Keep only the Stage1 Z column of Dam and Fill2, renamed without the stage.
df_train = df_train.drop(columns=[
    f'HEAD NORMAL COORDINATE Z AXIS(Stage{stage_no}) Collect Result_{proc}'
    for proc in ('Dam', 'Fill2')
    for stage_no in (2, 3)
])
df_train = df_train.rename(columns={
    f'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_{proc}': f'HEAD NORMAL COORDINATE Z AXIS_{proc}'
    for proc in ('Fill2', 'Dam')
})

# Original note: Model.Suffix and Workorder are the same across processes.
# Drop the Fill1 / Fill2 / AutoClave copies and keep the Dam copy under the bare name.
for shared_field in ('Model.Suffix', 'Workorder'):
    df_train = df_train.drop(columns=[f'{shared_field}_{proc}' for proc in ('Fill1', 'Fill2', 'AutoClave')])
df_train = df_train.rename(columns={'Workorder_Dam': 'Workorder', 'Model.Suffix_Dam': 'Model.Suffix'})


# Remove columns for which no meaning could be found.
meaningless_cols = [
    f'{admin_field}_{proc}'
    for proc in ('Fill1', 'AutoClave', 'Fill2', 'Dam')
    for admin_field in ('Wip Line', 'Process Desc.', 'Insp. Seq No.', 'Insp Judge Code')
]
meaningless_cols.append('Equipment_AutoClave')
meaningless_cols += [f'{nth} Pressure Judge Value_AutoClave' for nth in ('1st', '2nd', '3rd')]
meaningless_cols += [
    f'GMES_ORIGIN_INSP_JUDGE_CODE {value_kind}_AutoClave'
    for value_kind in ('Collect Result', 'Judge Value', 'Unit Time')
]
meaningless_cols += [
    f'CURE {cure_point} POSITION {axis} Collect Result_Dam'
    for cure_point in ('END', 'STANDBY')
    for axis in ('X', 'Z', 'Θ')
]
df_train = df_train.drop(columns=meaningless_cols)

# Original note: Fill2 does not dispense resin; it is a UV-only step, so all of its
# head-position and dispensing columns are removed.
fill2_cols = [
    f'HEAD NORMAL COORDINATE {axis} AXIS(Stage{stage_no}) Collect Result_Fill2'
    for axis in ('X', 'Y')
    for stage_no in (1, 2, 3)
]
fill2_cols.append('HEAD NORMAL COORDINATE Z AXIS_Fill2')
fill2_cols += [
    f'{head_position} Position {axis} Collect Result_Fill2'
    for head_position in ('HEAD Standby', 'Head Clean', 'Head Purge')
    for axis in ('X', 'Y', 'Z')
]
fill2_cols.append('DISCHARGED SPEED OF RESIN Collect Result_Fill2')
fill2_cols += [
    f'{resin_measure}(Stage{stage_no}) Collect Result_Fill2'
    for resin_measure in ('DISCHARGED TIME OF RESIN', 'Dispense Volume')
    for stage_no in (1, 2, 3)
]
df_train = df_train.drop(columns=fill2_cols)

# Original note: the speed should be the same on every line when things are normal.
# Per Dam stage, build a 0/1 "lines differ" flag and the sum of the four line speeds,
# then drop the raw per-line columns.
for stage_no in (1, 2, 3):
    speed_l1, speed_l2, speed_l3, speed_l4 = (
        df_train[f'Stage{stage_no} Line{line_no} Distance Speed Collect Result_Dam']
        for line_no in (1, 2, 3, 4)
    )
    df_train[f'Stage{stage_no} Line diffent Distance Speed_Dam'] = (
        (speed_l1 != speed_l2)
        | (speed_l1 != speed_l3)
        | (speed_l1 != speed_l4)
        | (speed_l3 != speed_l4)
    ).astype(int)
    df_train[f'Stage{stage_no} Line Sum Speed_Dam'] = speed_l1 + speed_l2 + speed_l3 + speed_l4

df_train = df_train.drop(columns=[
    f'Stage{stage_no} Line{line_no} Distance Speed Collect Result_Dam'
    for stage_no in (1, 2, 3)
    for line_no in (1, 2, 3, 4)
])

# Columns with a single unique value (per the original note); remove an entry here
# if you want to keep that column and look for meaning in it.
# Maps (column prefix, process) -> axes whose column is dropped.
single_value_axes = {
    ('CURE START POSITION', 'Dam'): ('X', 'Z', 'Θ'),   # X / Θ: determined by Equipment, fixed to one value
    ('HEAD Standby Position', 'Dam'): ('X', 'Y', 'Z'),
    ('Head Clean Position', 'Dam'): ('X', 'Y'),        # original note: Z depends on wobble (Z is not dropped)
    ('Head Purge Position', 'Dam'): ('X', 'Y'),
    ('Head Zero Position', 'Dam'): ('X',),
    ('HEAD Standby Position', 'Fill1'): ('X', 'Y', 'Z'),
    ('Head Clean Position', 'Fill1'): ('X', 'Y', 'Z'),
    ('Head Purge Position', 'Fill1'): ('X', 'Y'),
    ('CURE END POSITION', 'Fill2'): ('X', 'Θ'),
    ('CURE STANDBY POSITION', 'Fill2'): ('X', 'Z', 'Θ'),
    ('CURE START POSITION', 'Fill2'): ('X', 'Θ'),
}
df_train = df_train.drop(columns=[
    f'{position_prefix} {axis} Collect Result_{proc}'
    for (position_prefix, proc), axes in single_value_axes.items()
    for axis in axes
])

# Remove the AutoClave columns that look meaningless.
autoclave_cols = [
    f'Chamber Temp. {value_kind}_AutoClave'
    for value_kind in ('Collect Result', 'Judge Value', 'Unit Time')
]
autoclave_cols += [f'{nth} Pressure Collect Result_AutoClave' for nth in ('1st', '2nd', '3rd')]
autoclave_cols += [
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Unit Time_AutoClave',
    '3rd Pressure Unit Time_AutoClave',
]
df_train = df_train.drop(columns=autoclave_cols)

# QTY
# df_train = df_train.drop(columns=['Production Qty Collect Result_Dam',
#                                 'Production Qty Collect Result_Fill1',
#                                 'Production Qty Collect Result_Fill2',
    
# ])

In [None]:
# Collapse the three per-process "Receip No" columns into one combined key.
dtype = 'string'  # cast to string so that `+` concatenates the values
receip_parts = [
    'Receip No Collect Result_Dam',
    'Receip No Collect Result_Fill1',
    'Receip No Collect Result_Fill2',
]
for column in receip_parts:
    for frame in (df_train, df_test):
        frame[column] = frame[column].astype(dtype)

receip_dam, receip_fill1, receip_fill2 = receip_parts
df_train['Receip No'] = df_train[receip_dam] + df_train[receip_fill1] + df_train[receip_fill2]
df_test['Receip No'] = df_test[receip_dam] + df_test[receip_fill1] + df_test[receip_fill2]

# The per-process columns are no longer needed once the combined key exists.
df_train = df_train.drop(receip_parts, axis='columns')
df_test = df_test.drop(receip_parts, axis='columns')

In [None]:
# Build a single Equipment feature by concatenating the Dam / Fill1 / Fill2 values.
equipment_parts = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']
equip_dam, equip_fill1, equip_fill2 = equipment_parts

df_train['Equipment'] = df_train[equip_dam] + df_train[equip_fill1] + df_train[equip_fill2]
df_test['Equipment'] = df_test[equip_dam] + df_test[equip_fill1] + df_test[equip_fill2]

# Drop the per-process columns now that the combined feature exists.
df_train = df_train.drop(equipment_parts, axis='columns')
df_test = df_test.drop(equipment_parts, axis='columns')

In [None]:
# Unify the PalletID columns into one combined key.
pallet_parts = [
    'PalletID Collect Result_Dam',
    'PalletID Collect Result_Fill1',
    'PalletID Collect Result_Fill2',
]

# First pass casts every column to int, second pass to string so that `+`
# concatenates (presumably the int cast strips float formatting - TODO confirm).
for dtype in ('int', 'string'):
    for column in pallet_parts:
        df_train[column] = df_train[column].astype(dtype)
        df_test[column] = df_test[column].astype(dtype)

# NOTE(review): only the Dam and Fill1 values are combined here; the Fill2 column
# is converted and dropped below but never concatenated - confirm this is intended.
pallet_dam, pallet_fill1 = pallet_parts[0], pallet_parts[1]
df_train['PalletID'] = df_train[pallet_dam] + df_train[pallet_fill1]
df_test['PalletID'] = df_test[pallet_dam] + df_test[pallet_fill1]

df_train = df_train.drop(pallet_parts, axis='columns')
df_test = df_test.drop(pallet_parts, axis='columns')

In [None]:
# Production QTY 단일화
# dtype = 'string'  # 원하는 데이터 타입
# for column in ['Production Qty Collect Result_Dam', 'Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2']:
#     df_train[column] = df_train[column].astype(dtype)
#     df_test[column] = df_test[column].astype(dtype)

# df_train['Production Qty'] = df_train['Production Qty Collect Result_Dam'] + df_train['Production Qty Collect Result_Fill1'] + df_train['Production Qty Collect Result_Fill2']
# df_test['Production Qty'] = df_test['Production Qty Collect Result_Dam'] + df_test['Production Qty Collect Result_Fill1'] + df_test['Production Qty Collect Result_Fill2']

# df_train = df_train.drop(columns = ['Production Qty Collect Result_Dam', 'Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2'])
# df_test = df_test.drop(columns = ['Production Qty Collect Result_Dam', 'Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2'])

# Type 수정

In [None]:
# Label-encode the high-cardinality identifier columns.
# Each encoder is fitted on the training data; values that appear only in the
# test data are appended to the encoder's classes so they receive fresh integer
# codes after the training codes.
label_encoders = {}
categorical_features = ['Workorder', 'Model.Suffix']

for feature in categorical_features:
    le = LabelEncoder()
    df_train[feature] = le.fit_transform(df_train[feature])

    # Values seen only in the test data. Sorting (by string form, so mixed
    # types cannot break the comparison) makes the assigned codes reproducible;
    # plain set iteration order can change between interpreter runs.
    unseen_values = sorted(set(df_test[feature].unique()) - set(le.classes_), key=str)
    if unseen_values:
        # Extending classes_ is all that is needed for transform() to accept them.
        # NOTE(review): this relies on LabelEncoder looking up object/string
        # values by mapping rather than by binary search - confirm for the
        # installed scikit-learn version.
        le.classes_ = np.append(le.classes_, unseen_values)

    df_test[feature] = le.transform(df_test[feature])
    label_encoders[feature] = le

In [None]:
# Independent copies of the preprocessed frames for the CatBoost pipeline.
cat_train, cat_test = df_train.copy(), df_test.copy()

In [None]:
# Independent copies of the preprocessed frames for the LightGBM pipeline.
lgbm_train, lgbm_test = df_train.copy(), df_test.copy()

In [None]:
# Independent copies of the preprocessed frames for the XGBoost pipeline.
xgb_train, xgb_test = df_train.copy(), df_test.copy()

# 데이터 학습

In [None]:
# Display the column index of the preprocessed training frame.
df_train.columns

In [None]:
# Number of columns in the preprocessed training frame.
df_train.shape[1]

### CatBoost

In [None]:
# 'Receip No Collect Result_Dam', 'Receip No Collect Result_Fill1','Receip No Collect Result_Fill2', 'PalletID Collect Result_Dam', 'PalletID Collect Result_Fill1', 'PalletID Collect Result_Fill2', 'Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2',

In [None]:
# Columns that are handed to CatBoost as categorical features.
columns_to_convert = ['Receip No', 'Equipment', 'PalletID', 'Model.Suffix', 'Workorder']
columns_to = [
    'Head Zero Position Y Collect Result_Dam',
    'Head Zero Position Z Collect Result_Dam',
    'Head Clean Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Fill1',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE SPEED Collect Result_Fill2',
    'Stage1 Circle Distance Speed_Dam',
    'Stage2 Circle Distance Speed_Dam',
    'Stage3 Circle Distance Speed_Dam',
    'Stage1 Line diffent Distance Speed_Dam',
    'Stage1 Line Sum Speed_Dam',
    'Stage2 Line diffent Distance Speed_Dam',
    'Stage2 Line Sum Speed_Dam',
    'Stage3 Line diffent Distance Speed_Dam',
    'Stage3 Line Sum Speed_Dam',
    'Minus1Y_Dam',
    'Minus2Y_Dam',
    'inconsistant',
]

# Two passes over every listed column, on train and test alike:
# first cast to 'string', then to pandas 'category'.
for dtype in ('string', 'category'):
    for column in columns_to_convert + columns_to:
        cat_train[column] = cat_train[column].astype(dtype)
        cat_test[column] = cat_test[column].astype(dtype)

In [None]:
# Features / boolean target (True means 'AbNormal') for CatBoost.
X = cat_train.drop('target', axis='columns')
y = cat_train['target'].apply(lambda label: label == 'AbNormal')

# Names of the columns CatBoost should treat as categorical.
cat_features_indices = ['Receip No', 'Equipment', 'PalletID', 'Model.Suffix', 'Workorder'] + columns_to

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_indices)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features_indices)

def objective(trial):
    """Optuna objective for CatBoost: return the validation F1 of one trial.

    Samples hyperparameters from `trial`, fits a CatBoostClassifier on
    `train_pool` with `valid_pool` as the eval set (early stopping after 50
    rounds), and scores the predictions for `X_valid` against `y_valid`.
    """
    # Sample the search space (suggest calls are kept in their original order).
    sampled = dict(
        iterations=trial.suggest_int("iterations", 100, 1000),
        depth=trial.suggest_int("depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-2, 10.0),
        border_count=trial.suggest_int("border_count", 32, 255),
        random_strength=trial.suggest_float("random_strength", 1e-9, 10.0),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 1.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        verbose=0,
        random_seed=42,
    )

    # Fit the CatBoost model for this trial.
    clf = CatBoostClassifier(**sampled)
    clf.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=0)

    # Score the hold-out predictions.
    return f1_score(y_valid, clf.predict(X_valid))

# Create the Optuna study (seeded TPE sampler) and run the search.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=12)

# Report the best trial's score and hyperparameters.
trial = study.best_trial
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for key in trial.params:
    value = trial.params[key]
    print("    {}: {}".format(key, value))

In [None]:
# Refit CatBoost on the training split with the best hyperparameters found.
cat_best_params = {**study.best_trial.params, "random_seed": 42}
cat_best_model = CatBoostClassifier(**cat_best_params)
cat_best_model.fit(X_train, y_train, cat_features=cat_features_indices)

In [None]:
# Visualise the CatBoost feature importances, largest first.
importances = pd.Series(
    cat_best_model.feature_importances_,
    index=list(X_train.columns),
).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=importances.index)
plt.title("Feature Importances")
plt.show()

In [None]:
# Hold-out evaluation of the refitted CatBoost model at its default cut-off.
pred = cat_best_model.predict(X_valid)
get_clf_eval(y_valid, y_pred=pred)

In [None]:
# Precision-recall based threshold selection for CatBoost.
y_pred_proba = cat_best_model.predict_proba(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_proba)
# precision/recall have one more entry than thresholds, so only the first
# len(thresholds) points are candidates. F1 is defined as 0 where
# precision + recall == 0; the plain formula gives NaN there and np.argmax
# would then select the NaN entry instead of the real maximum.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
cat_best_threshold = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_pred_proba >= cat_best_threshold).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# Keep the CatBoost validation probabilities for the ensemble experiments.
proba1 = y_pred_proba

### LGBM

In [None]:
# Columns that are handed to LightGBM as categorical features.
columns_to_convert = ['Receip No', 'Equipment', 'PalletID', 'Workorder', 'Model.Suffix']
columns_to = [
    'Head Zero Position Y Collect Result_Dam',
    'Head Zero Position Z Collect Result_Dam',
    'Head Clean Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Fill1',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE SPEED Collect Result_Fill2',
    'Stage1 Circle Distance Speed_Dam',
    'Stage2 Circle Distance Speed_Dam',
    'Stage3 Circle Distance Speed_Dam',
    'Stage1 Line diffent Distance Speed_Dam',
    'Stage1 Line Sum Speed_Dam',
    'Stage2 Line diffent Distance Speed_Dam',
    'Stage2 Line Sum Speed_Dam',
    'Stage3 Line diffent Distance Speed_Dam',
    'Stage3 Line Sum Speed_Dam',
    'Minus1Y_Dam',
    'Minus2Y_Dam',
    'inconsistant',
]

# Two passes over every listed column, on train and test alike:
# first cast to 'float', then to pandas 'category'.
for dtype in ('float', 'category'):
    for column in columns_to_convert + columns_to:
        lgbm_train[column] = lgbm_train[column].astype(dtype)
        lgbm_test[column] = lgbm_test[column].astype(dtype)

In [None]:
# Features / boolean target (True means 'AbNormal') for LightGBM.
X = lgbm_train.drop('target', axis='columns')
y = lgbm_train['target'].apply(lambda label: label == 'AbNormal')

# Names of the columns LightGBM should treat as categorical.
cat_features_indices = ['Receip No', 'Equipment', 'PalletID', 'Model.Suffix', 'Workorder'] + columns_to

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

def objective(trial):
    """Optuna objective for LightGBM: return the validation F1 of one trial.

    Samples hyperparameters from `trial`, fits an LGBMClassifier on
    `X_train`/`y_train` with `cat_features_indices` as categorical features,
    and scores its predictions for `X_valid` against `y_valid`.
    """
    # Sample the search space (suggest calls are kept in their original order).
    lgbm_params = dict(
        n_estimators=trial.suggest_int('n_estimators', 400, 1500),
        max_depth=trial.suggest_int('max_depth', 3, 63),
        learning_rate=trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        reg_alpha=trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        reg_lambda=trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        min_child_weight=trial.suggest_float('min_child_weight', 0.5, 4),
        min_child_samples=trial.suggest_int('min_child_samples', 5, 100),
        subsample=trial.suggest_float('subsample', 0.4, 1),
        subsample_freq=trial.suggest_int('subsample_freq', 0, 5),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 1),
        num_leaves=trial.suggest_int('num_leaves', 2, 64),
        random_seed=42,
    )

    clf = LGBMClassifier(**lgbm_params, device='cpu', random_state=42, verbose=-1)

    # Fit with the categorical feature names applied.
    clf.fit(X_train, y_train, categorical_feature=cat_features_indices)

    # F1 of the hold-out predictions.
    return f1_score(y_valid, clf.predict(X_valid))

# Create the Optuna study (seeded TPE sampler) and run the search.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=50)

# Report the best trial's score and hyperparameters.
trial = study.best_trial
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for key in trial.params:
    value = trial.params[key]
    print("    {}: {}".format(key, value))

In [None]:
# Refit LightGBM on the training split with the best hyperparameters found.
lgbm_best_params = {**study.best_trial.params, "random_state": 42}
lgbm_best_model = LGBMClassifier(**lgbm_best_params)
lgbm_best_model.fit(X_train, y_train, categorical_feature=cat_features_indices)

In [None]:
# Visualise the LightGBM feature importances, largest first.
importances = pd.Series(
    lgbm_best_model.feature_importances_,
    index=list(X_train.columns),
).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=importances.index)
plt.title("Feature Importances")
plt.show()

In [None]:
# Hold-out evaluation of the refitted LightGBM model at its default cut-off.
pred = lgbm_best_model.predict(X_valid)
get_clf_eval(y_valid, y_pred=pred)

In [None]:
# Precision-recall based threshold selection for LightGBM.
y_pred_proba = lgbm_best_model.predict_proba(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_proba)
# precision/recall have one more entry than thresholds, so only the first
# len(thresholds) points are candidates. F1 is defined as 0 where
# precision + recall == 0; the plain formula gives NaN there and np.argmax
# would then select the NaN entry instead of the real maximum.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
lgbm_best_threshold = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_pred_proba >= lgbm_best_threshold).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# Keep the LightGBM validation probabilities for the ensemble experiments.
proba2 = y_pred_proba

### XGB

In [None]:
# Columns cast to float before being fed to XGBoost.
columns_to_convert = ['Receip No', 'Equipment', 'PalletID', 'Workorder', 'Model.Suffix']
columns_to = [
    'Head Zero Position Y Collect Result_Dam',
    'Head Zero Position Z Collect Result_Dam',
    'Head Clean Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Dam',
    'Head Purge Position Z Collect Result_Fill1',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE SPEED Collect Result_Fill2',
    'Stage1 Circle Distance Speed_Dam',
    'Stage2 Circle Distance Speed_Dam',
    'Stage3 Circle Distance Speed_Dam',
    'Stage1 Line diffent Distance Speed_Dam',
    'Stage1 Line Sum Speed_Dam',
    'Stage2 Line diffent Distance Speed_Dam',
    'Stage2 Line Sum Speed_Dam',
    'Stage3 Line diffent Distance Speed_Dam',
    'Stage3 Line Sum Speed_Dam',
    'inconsistant',
]

dtype = 'float'  # target dtype for every listed column
for column in columns_to_convert + columns_to:
    for frame in (xgb_train, xgb_test):
        frame[column] = frame[column].astype(dtype)

In [None]:
# Features / boolean target (True means 'AbNormal') for XGBoost.
X = xgb_train.drop('target', axis='columns')
y = xgb_train['target'].apply(lambda label: label == 'AbNormal')

# 80/20 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=42, test_size=0.2)

def objective(trial):
    """Optuna objective for XGBoost: return the validation F1 of one trial.

    Samples hyperparameters from `trial`, fits an XGBClassifier on
    `X_train`/`y_train` (with the hold-out split passed as `eval_set`), and
    scores its predictions for `X_valid` against `y_valid`.
    """
    # Sample the search space (suggest calls are kept in their original order).
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'random_state': 42,
    }

    clf = xgb.XGBClassifier(eval_metric='logloss', **sampled)
    clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

    # F1 of the hold-out predictions.
    return f1_score(y_valid, clf.predict(X_valid))

# Create the Optuna study (seeded TPE sampler) and run the search.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=50)

# Report the best trial's score and hyperparameters.
trial = study.best_trial
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for key in trial.params:
    value = trial.params[key]
    print("    {}: {}".format(key, value))

In [None]:
# Refit XGBoost on the training split with the best hyperparameters found.
xgb_best_params = {**study.best_trial.params, "random_state": 42}
xgb_best_model = xgb.XGBClassifier(**xgb_best_params)
xgb_best_model.fit(X_train, y_train)

In [None]:
# Visualise the XGBoost feature importances, largest first.
importances = pd.Series(
    xgb_best_model.feature_importances_,
    index=list(X_train.columns),
).sort_values(ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x=importances, y=importances.index)
plt.title("Feature Importances")
plt.show()

In [None]:
# Hold-out evaluation of the refitted XGBoost model at its default cut-off.
pred = xgb_best_model.predict(X_valid)
get_clf_eval(y_valid, y_pred=pred)

In [None]:
# Precision-recall based threshold selection for XGBoost.
y_pred_proba = xgb_best_model.predict_proba(X_valid)[:, 1]

precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_proba)
# precision/recall have one more entry than thresholds, so only the first
# len(thresholds) points are candidates. F1 is defined as 0 where
# precision + recall == 0; the plain formula gives NaN there and np.argmax
# would then select the NaN entry instead of the real maximum.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
xgb_best_threshold = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_pred_proba >= xgb_best_threshold).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# Keep the XGBoost validation probabilities for the ensemble experiments.
proba3 = y_pred_proba

### Voting 실험?

In [None]:
# Simple (unweighted) soft-voting averages of the validation probabilities.
# The sums are parenthesised so that the whole sum is divided; without the
# parentheses only the last term was divided.
y_1_2 = (proba1 + proba2) / 2
y_1_3 = (proba1 + proba3) / 2
y_2_3 = (proba2 + proba3) / 2
y_1_2_3 = (proba1 + proba2 + proba3) / 3
# y_1_2_3_4 = (proba1 + proba2 + proba3 + proba4) / 4
# y_1_2_3_4_5 = (proba1 + proba2 + proba3 + proba4 + proba5) / 5

In [None]:
# F1-maximising threshold for the proba1 / proba2 combination (y_1_2).
precision, recall, thresholds = precision_recall_curve(y_valid, y_1_2)
# Only the first len(thresholds) precision/recall points have a threshold;
# F1 is set to 0 where precision + recall == 0 so np.argmax never lands on NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
best_threshold_1 = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold_1 = (y_1_2 >= best_threshold_1).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold_1)

In [None]:
# F1-maximising threshold for the proba1 / proba3 combination (y_1_3).
precision, recall, thresholds = precision_recall_curve(y_valid, y_1_3)
# Only the first len(thresholds) precision/recall points have a threshold;
# F1 is set to 0 where precision + recall == 0 so np.argmax never lands on NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
best_threshold_2 = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_1_3 >= best_threshold_2).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# F1-maximising threshold for the proba2 / proba3 combination (y_2_3).
precision, recall, thresholds = precision_recall_curve(y_valid, y_2_3)
# Only the first len(thresholds) precision/recall points have a threshold;
# F1 is set to 0 where precision + recall == 0 so np.argmax never lands on NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
best_threshold_3 = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_2_3 >= best_threshold_3).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# F1-maximising threshold for the three-model combination (y_1_2_3).
precision, recall, thresholds = precision_recall_curve(y_valid, y_1_2_3)
# Only the first len(thresholds) precision/recall points have a threshold;
# F1 is set to 0 where precision + recall == 0 so np.argmax never lands on NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
best_threshold_4 = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_1_2_3 >= best_threshold_4).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# Grid search over integer ensemble weights for (proba1, proba2, proba3).
# Two scores are tracked for every combination:
#   * f1   - F1 of the weighted average at the fixed 0.5 cut-off
#   * f1_t - F1 of the weighted average at its own F1-maximising threshold
best_f1 = 0
best_f1_t = 0
best_weights = None
best_weights_t = None

# w1 starts at 1, so the weight sum is never zero.
for w1 in range(1, 15):
    for w2 in range(0, 15):
        for w3 in range(0, 30):
            # Weighted average of the three validation probability vectors.
            final_proba = (w1 * proba1 + w2 * proba2 + w3 * proba3) / (w1 + w2 + w3)

            # F1 at the default 0.5 cut-off.
            y_pred = (final_proba > 0.5).astype(int)
            f1 = f1_score(y_valid, y_pred)

            # F1 at the best threshold on the precision-recall curve. Only the
            # first len(thresholds) points have a threshold, and F1 is set to 0
            # where precision + recall == 0 so np.argmax never lands on NaN.
            precision, recall, thresholds = precision_recall_curve(y_valid, final_proba)
            pr_sum = precision[:-1] + recall[:-1]
            f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                                  out=np.zeros_like(pr_sum), where=pr_sum > 0)
            best_threshold = thresholds[np.argmax(f1_scores)]
            y_pred_custom_threshold = (final_proba >= best_threshold).astype(int)
            f1_t = f1_score(y_valid, y_pred_custom_threshold)

            # Remember the best combination for each criterion. The two
            # per-combination score prints were dropped; they produced
            # thousands of output lines per run.
            if f1 > best_f1:
                best_f1 = f1
                best_weights = (w1, w2, w3)

            if f1_t > best_f1_t:
                best_f1_t = f1_t
                best_weights_t = (w1, w2, w3)

print("Best F1 Score: ", best_f1)
print("Best Weights: ", best_weights)
print("Best F1_t Score: ", best_f1_t)
print("Best Weights_t: ", best_weights_t)

In [None]:
# Weighted average of the validation probabilities using the best thresholded weights.
wt_cat, wt_lgbm, wt_xgb = best_weights_t
y_best = (wt_cat * proba1 + wt_lgbm * proba2 + wt_xgb * proba3) / (wt_cat + wt_lgbm + wt_xgb)

In [None]:
# F1-maximising threshold for the best weighted ensemble (y_best).
precision, recall, thresholds = precision_recall_curve(y_valid, y_best)
# Only the first len(thresholds) precision/recall points have a threshold;
# F1 is set to 0 where precision + recall == 0 so np.argmax never lands on NaN.
pr_sum = precision[:-1] + recall[:-1]
f1_scores = np.divide(2 * precision[:-1] * recall[:-1], pr_sum,
                      out=np.zeros_like(pr_sum), where=pr_sum > 0)
weights_best_threshold = thresholds[np.argmax(f1_scores)]
y_pred_custom_threshold = (y_best >= weights_best_threshold).astype(int)
get_clf_eval(y_valid, y_pred_custom_threshold)

In [None]:
# Display the decision threshold selected for the weighted ensemble.
weights_best_threshold

# 학습 후 예측 및 제출용 데이터 생성

In [None]:
# Append the 'Set ID' column read from test_df.csv to each model's test frame.
set_id = pd.read_csv('test_df.csv')
set_id_col = set_id['Set ID']
cat_test = pd.concat([cat_test, set_id_col], axis='columns')
xgb_test = pd.concat([xgb_test, set_id_col], axis='columns')
lgbm_test = pd.concat([lgbm_test, set_id_col], axis='columns')
# extra_test = pd.concat([extra_test, set_id['Set ID']], axis = 1)

### Catboost

In [None]:
# CatBoost class predictions for the test set (identifier column excluded).
cat_pred = cat_best_model.predict(cat_test.drop('Set ID', axis='columns'))

In [None]:
# Display the sum of the CatBoost test predictions.
sum(cat_pred)

In [None]:
# Second predict_proba column from CatBoost for the test set.
cat_test_features = cat_test.drop('Set ID', axis='columns')
cat_pred_proba = cat_best_model.predict_proba(cat_test_features)[:, 1]

In [None]:
# Binarise the CatBoost probabilities with the threshold tuned on validation.
y_pred_cat = np.greater_equal(cat_pred_proba, cat_best_threshold).astype(int)

In [None]:
# Count of test rows flagged by CatBoost at the tuned threshold.
sum(y_pred_cat)

### LGBM

In [None]:
# LightGBM class predictions for the test set (identifier column excluded).
lgbm_pred = lgbm_best_model.predict(lgbm_test.drop('Set ID', axis='columns'))

In [None]:
# Display the sum of the LightGBM test predictions.
sum(lgbm_pred)

In [None]:
# Second predict_proba column from LightGBM for the test set.
lgbm_test_features = lgbm_test.drop('Set ID', axis='columns')
lgbm_pred_proba = lgbm_best_model.predict_proba(lgbm_test_features)[:, 1]

In [None]:
# Binarise the LightGBM probabilities with the threshold tuned on validation.
y_pred_lgbm = np.greater_equal(lgbm_pred_proba, lgbm_best_threshold).astype(int)

In [None]:
# Count of test rows flagged by LightGBM at the tuned threshold.
sum(y_pred_lgbm)

### XGB

In [None]:
# XGBoost class predictions for the test set (identifier column excluded).
xgb_pred = xgb_best_model.predict(xgb_test.drop('Set ID', axis='columns'))

In [None]:
# Display the sum of the XGBoost test predictions.
sum(xgb_pred)

In [None]:
# Second predict_proba column from XGBoost for the test set.
xgb_test_features = xgb_test.drop('Set ID', axis='columns')
xgb_pred_proba = xgb_best_model.predict_proba(xgb_test_features)[:, 1]

In [None]:
# Binarise the XGBoost probabilities with the XGBoost-specific threshold tuned
# on validation. `best_threshold` was used here before; that name is a leftover
# loop variable from the ensemble weight search, not the XGBoost threshold.
y_pred_xgb = (xgb_pred_proba >= xgb_best_threshold).astype(int)

In [None]:
# Count of test rows flagged by XGBoost at the tuned threshold
# (this cell previously displayed the LightGBM count again).
sum(y_pred_xgb)

### Voting

In [None]:
# Weighted soft vote on the test set. The weights come from the validation
# grid search (best_weights_t) rather than hard-coded numbers, so they always
# match the combination that weights_best_threshold was tuned for.
w_cat, w_lgbm, w_xgb = best_weights_t
y_result = (cat_pred_proba * w_cat + lgbm_pred_proba * w_lgbm + xgb_pred_proba * w_xgb) / (w_cat + w_lgbm + w_xgb)

In [None]:
# Binarise the ensemble probabilities with the threshold tuned on validation.
y_pred_custom_threshold = np.greater_equal(y_result, weights_best_threshold).astype(int)

In [None]:
# Count of test rows flagged by the weighted ensemble.
sum(y_pred_custom_threshold)

### 데이터 결정 및 결합

In [None]:
# Choose which prediction vector goes into the submission (here: the weighted ensemble).
result = y_pred_custom_threshold

In [None]:
# Map the 0/1 predictions to labels: 0 becomes "Normal", anything else "AbNormal".
y_pred = np.where(result != 0, "AbNormal", "Normal")

In [None]:
# Display the final label array.
y_pred

In [None]:
# Load the submission template and set its target column to the predicted labels.
df_sub = pd.read_csv("submission.csv")
df_sub.loc[:, "target"] = y_pred

# Write the submission file back to the same path.
df_sub.to_csv("submission.csv", index=False)