# 불량품 예측

불량품을 예측하기 위해 다음과 같은 함수화 정리를 진행한다. 혼란을 막기 위해 모든 과정을 함수화 하기로 한다.  
목차는 다음과 같다.

- 1. Load packages & Data
- 2. Function

## Equip1 유의한 변수

'DISCHARGED SPEED OF RESIN Collect Result_Dam',  
'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',  
'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',  
'Stage2 Circle1 Distance Speed Collect Result_Dam',  
'THICKNESS 1 Collect Result_Dam',  
'THICKNESS 2 Collect Result_Dam',  
'THICKNESS 3 Collect Result_Dam',  
'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',  
'Dispense Volume(Stage3) Collect Result_Fill1',  
'Stage2 Line diffent Distance Speed_Dam',  
'round_1st_time',  
'round_2nd_time'

## Equip2 유의한 변수

'Model.Suffix_Dam',  
'DISCHARGED SPEED OF RESIN Collect Result_Dam',  
'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',  
'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',  
'Head Zero Position Y Collect Result_Dam',  
'Stage2 Circle1 Distance Speed Collect Result_Dam',  
'DISCHARGED SPEED OF RESIN Collect Result_Fill1',  
'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',  
'Dispense Volume(Stage3) Collect Result_Fill1',  
'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',  
'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',  
'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',  
'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',  
'Machine Tact time Collect Result_Fill1',  
'CURE SPEED Collect Result_Fill2',  
'CURE START POSITION Z Collect Result_Fill2',  
'Stage2 Line diffent Distance Speed_Dam',  
'round_1st_time',  
'round_2nd_time',  
'round_3rd_time',  
'workorder_third'  

## 1. Load Packages & Data

In [1]:
### ide packages
import os
from pprint import pprint
import numpy as np
import pandas as pd
import random
import string
from tqdm import tqdm

# sklearn preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    make_scorer,
    roc_curve,
    auc,
    precision_recall_curve,
    recall_score,
    silhouette_score,
)

# models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier, plot_metric
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.cluster import KMeans

# plot
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px

# tuning
import optuna

# validation 
from scipy.stats import chi2_contingency
from scipy.stats import levene, ttest_ind
from scipy.stats import bartlett
from scipy.stats import shapiro

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load data
load_dir = './data/'
train = pd.read_csv(load_dir + "train.csv")
test = pd.read_csv(load_dir + "test.csv")

## 2. Functions

In [3]:
# 평가 스코어 지정하기
f1_scorer = make_scorer(f1_score, pos_label=1, average = 'binary')

# 평가 매트릭 계산 결과 보여주기
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [4]:
# 해당 공정 맞춤형 위치 옮기기
def move_data(data):
    # divide
    dam = data.filter(regex='_Dam')
    fill1 = data.filter(regex='_Fill1')
    fill2 = data.filter(regex='_Fill2')
    autoclave = data.filter(regex='_AutoClave')
    target = data['target']

    # dam
    dam = dam.dropna(axis=1, how='all')
    dam = dam.drop(columns='HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam')
    dam_mask = dam[dam['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isin(['OK', np.nan])].iloc[:, 24:].shift(-1, axis = 1).values
    dam.loc[dam['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isin(['OK', np.nan]), dam.columns[24:]] = dam_mask
    dam = dam.drop(columns='WorkMode Collect Result_Dam')

    # fill1
    fill1 = fill1.dropna(axis=1, how='all')
    fill1 = fill1.drop(columns='HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1')
    fill1_mask = fill1[fill1['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].isin(['OK', np.nan])].iloc[:, 14:].shift(-1, axis = 1).values
    fill1.loc[fill1['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].isin(['OK', np.nan]), fill1.columns[14:]] = fill1_mask
    fill1 = fill1.drop(columns='WorkMode Collect Result_Fill1')

    # fill2
    fill2 = fill2.dropna(axis=1, how='all')
    fill2 = fill2.drop(columns='HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2')
    fill2_mask = fill2[fill2['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isin(['OK', np.nan])].iloc[:, 24:].shift(-1, axis = 1).values
    fill2.loc[fill2['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isin(['OK', np.nan]), fill2.columns[24:]] = fill2_mask
    fill2 = fill2.drop(columns='WorkMode Collect Result_Fill2')

    # CONCAT
    data = pd.concat([dam, fill1, fill2, autoclave, target], axis=1)
    
    return data

In [5]:
# Dam, Fill1, Fill2에서 지정된 값이 다를 경우 Abnormal 
def inconsistant(data, columnname, iwantthiscolumnsname, is_train = True):
    # 장비 번호가 다르면 불일치
    if is_train:
        cri = [
            df_train[columnname + '_Dam'] != df_train[columnname + '_Fill1'],
            df_train[columnname + '_Dam'] != df_train[columnname + '_Fill2'],
            df_train[columnname + '_Fill1'] != df_train[columnname + '_Fill2'],
            data[iwantthiscolumnsname] == 1
        ]
        
    else:
        cri = [
            df_test[columnname + '_Dam'] != df_test[columnname + '_Fill1'],
            df_test[columnname + '_Dam'] != df_test[columnname + '_Fill2'],
            df_test[columnname + '_Fill1'] != df_test[columnname + '_Fill1'],
            data[iwantthiscolumnsname] == 1
        ]
    con = [1, 1, 1, 1]

    data[iwantthiscolumnsname] = np.select(cri, con, default = 0)

In [6]:
# 변수 세팅
def variable_setting(types, tr, te, cat_col):
    train = tr.copy()
    test = te.copy()
    
    if types == 'catboost':
        dtype = 'string'  # 원하는 데이터 타입
        for column in cat_col:
            train[column] = train[column].astype(dtype)
            test[column] = test[column].astype(dtype)

        dtype = 'category'  # 원하는 데이터 타입
        for column in cat_col:
            train[column] = train[column].astype(dtype)
            test[column] = test[column].astype(dtype)
            
    elif types == 'lightgbm':
        dtype = 'float'  # 원하는 데이터 타입
        for column in cat_col:
            train[column] = train[column].astype(dtype)
            test[column] = test[column].astype(dtype)

        dtype = 'category'  # 원하는 데이터 타입
        for column in cat_col:
            train[column] = train[column].astype(dtype)
            test[column] = test[column].astype(dtype)
            
    elif types == 'xgboost':
        dtype = 'float'  # 원하는 데이터 타입
        for column in cat_col:
            train[column] = train[column].astype(dtype)
            test[column] = test[column].astype(dtype)
            
    return train, test

In [7]:
def model_best_threshold(model, X_valid, y_valid):
    
    # Precision - Recall
    y_pred_proba = model.predict_proba(X_valid)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_valid, y_pred_proba)
    f1_scores = 2*recall*precision / (recall + precision)
    cat_best_threshold = thresholds[np.argmax(f1_scores)]
    y_pred_custom_threshold = (y_pred_proba >= cat_best_threshold).astype(int)
    
    return thresholds, y_pred_custom_threshold

In [8]:
def xgboost_optuna(train, cat_features_indices):
    X = train.drop(columns=['target'])
    y = train['target']

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5),
            'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
            'seed': 42,
        }

        model = XGBClassifier(eval_metric='logloss', **params, early_stopping_rounds = 50)

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0)

        # 검증 세트에 대한 예측 및 평가
        preds = model.predict_proba(X_valid)[:, 1]

        # thresholds
        precision, recall, thresholds = precision_recall_curve(y_valid, preds)
        f1_scores = 2*recall*precision / (recall + precision)
        cat_best_threshold = thresholds[np.argmax(f1_scores)]
        y_pred_custom_threshold_cat = (preds >= cat_best_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_custom_threshold_cat)

        return f1

    # Optuna 스터디 생성 및 최적화
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    # 최적의 하이퍼파라미터 출력
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
        
    
#     study_best_trial_params = {
#         'n_estimators': 389,
#         'max_depth': 7,
#         'learning_rate': 0.5624862523377674,
#         'subsample': 0.6901977494513396,
#         'colsample_bytree': 0.6626522287203345,
#         'gamma': 1.5788663422268037,
#         'lambda': 0.006161637899604562,
#         'alpha': 0.040928088401419954
#     }
    
    return study.best_trial.params, X, y, X_train.index, X_valid.index

In [9]:
def lightgbm_optuna(train, cat_features_indices):

    X = train.drop(columns=['target'])
    y = train['target']

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    def objective(trial):
        lgbm_params = {
            'n_estimators': trial.suggest_int('n_estimators', 400, 1500),
            "max_depth": trial.suggest_int('max_depth', 3, 63),
            "learning_rate": trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True), 
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
            "min_child_weight": trial.suggest_float('min_child_weight', 0.5, 4),
            "min_child_samples": trial.suggest_int('min_child_samples', 5, 100),
            "subsample": trial.suggest_float('subsample', 0.4, 1),
            "subsample_freq": trial.suggest_int('subsample_freq', 0, 5),
            "colsample_bytree": trial.suggest_float('colsample_bytree', 0.2, 1),
            'num_leaves': trial.suggest_int('num_leaves', 2, 64),
        }

        model = LGBMClassifier(**lgbm_params, device='cpu', random_state=42, verbose=-1)

        # 범주형 피처 적용
        model.fit(X_train, y_train, categorical_feature=cat_features_indices,
            eval_set = [(X_valid, y_valid)],
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
            ]
        )

        # 검증 세트에 대한 예측 및 평가
        preds = model.predict_proba(X_valid)[:, 1]

        # thresholds
        precision, recall, thresholds = precision_recall_curve(y_valid, preds)
        f1_scores = 2*recall*precision / (recall + precision)
        cat_best_threshold = thresholds[np.argmax(f1_scores)]
        y_pred_custom_threshold_cat = (preds >= cat_best_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_custom_threshold_cat)

        return f1

    # Optuna 스터디 생성 및 최적화
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    # 최적의 하이퍼파라미터 출력
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

#     study_best_trial_params = {
#         'n_estimators': 1181,
#         'max_depth': 42,
#         'learning_rate': 0.009444269250537143,
#         'reg_alpha': 0.0358194186029774,
#         'reg_lambda': 0.001046166296905767,
#         'min_child_weight': 3.3802455616568117,
#         'min_child_samples': 38,
#         'subsample': 0.4032508293056122,
#         'subsample_freq': 0,
#         'colsample_bytree': 0.2541295048133576,
#         'num_leaves': 62
#     }
        
    return study.best_trial.params, X, y, X_train.index, X_valid.index

In [10]:
def catboost_optuna(train, cat_features_indices):
    
    # train X, y
    X = train.drop(columns=['target'])
    y = train['target']

    # $plit 
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Pooling
    train_pool = Pool(X_train, y_train, cat_features=cat_features_indices)
    valid_pool = Pool(X_valid, y_valid, cat_features=cat_features_indices)
    
    # tuning parameters
    def objective(trial):
        # 하이퍼파라미터를 샘플링
        params = {
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "depth": trial.suggest_int("depth", 4, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0, log=True),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-2, 10.0),
            "border_count": trial.suggest_int("border_count", 32, 255),
            "random_strength": trial.suggest_float("random_strength", 1e-9, 10.0),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
            "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
            "od_wait": trial.suggest_int("od_wait", 10, 50),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#             "scale_pos_weight": trial.suggest_int('scale_pos_weight', 6, 10),
            "verbose": 0,
            "random_seed": 42,
            'one_hot_max_size': 4
        }

        # CatBoost 모델 학습
        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=0)

        # 검증 세트에 대한 예측 및 평가
        preds = model.predict_proba(X_valid)[:, 1]

        # thresholds
        precision, recall, thresholds = precision_recall_curve(y_valid, preds)
        f1_scores = 2*recall*precision / (recall + precision)
        cat_best_threshold = thresholds[np.argmax(f1_scores)]
        y_pred_custom_threshold_cat = (preds >= cat_best_threshold).astype(int)

        f1 = f1_score(y_valid, y_pred_custom_threshold_cat)
        
        return f1

    # Optuna 스터디 생성 및 최적화
    sampler = optuna.samplers.TPESampler(seed=42)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)

    # 최적의 하이퍼파라미터 출력
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))
    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    
#     study_best_trial_params = {
#         'iterations': 800,
#         'depth': 7,
#         'learning_rate': 0.810235620776663,
#         'l2_leaf_reg': 9.445546463334189,
#         'border_count': 175,
#         'random_strength': 9.076647952917511,
#         'bagging_temperature': 0.940243954743633,
#         'od_type': 'IncToDec',
#         'od_wait': 23,
#         'boosting_type': 'Plain'
#     }
    
    return study.best_trial.params, X, y, X_train.index, X_valid.index

## 3. Setting

In [11]:
# 기준
columnname = ['Equipment', 'Receip No Collect Result', 'Production Qty Collect Result', 'PalletID Collect Result', ]

In [12]:
# drop oolumns
drop_col = [
    
    # 단일 칼럼
    'Wip Line_Dam',
    'Process Desc._Dam',
    'Insp. Seq No._Dam',
    'Insp Judge Code_Dam',
    'Wip Line_Fill1',
    'Process Desc._Fill1',
    'Insp. Seq No._Fill1',
    'Insp Judge Code_Fill1',
    'Wip Line_Fill2',
    'Process Desc._Fill2',
    'Insp. Seq No._Fill2',
    'Insp Judge Code_Fill2',
    'Wip Line_AutoClave',
    'Process Desc._AutoClave',
    'Equipment_AutoClave',
    'Insp. Seq No._AutoClave',
    'Insp Judge Code_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Unit Time_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
    
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
    'HEAD Standby Position X Collect Result_Fill2',
    'HEAD Standby Position Y Collect Result_Fill2',
    'HEAD Standby Position Z Collect Result_Fill2',
    'Head Clean Position X Collect Result_Fill2',
    'Head Clean Position Y Collect Result_Fill2',
    'Head Clean Position Z Collect Result_Fill2',
    'Head Purge Position X Collect Result_Fill2',
    'Head Purge Position Y Collect Result_Fill2',
    'Head Purge Position Z Collect Result_Fill2',
    
    'DISCHARGED SPEED OF RESIN Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill2',
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill2',
    'Dispense Volume(Stage1) Collect Result_Fill2',
    'Dispense Volume(Stage2) Collect Result_Fill2',
    'Dispense Volume(Stage3) Collect Result_Fill2',
    
    'HEAD Standby Position X Collect Result_Dam',
    'HEAD Standby Position Y Collect Result_Dam',
    'HEAD Standby Position Z Collect Result_Dam',
    'Head Clean Position X Collect Result_Dam',
    'Head Clean Position Y Collect Result_Dam',
    'Head Purge Position X Collect Result_Dam',
    'Head Purge Position Y Collect Result_Dam',
    'Head Zero Position X Collect Result_Dam',
    'Head Zero Position Y Collect Result_Dam',
    'Head Zero Position Z Collect Result_Dam',
    
    '1st Pressure Judge Value_AutoClave',
    '2nd Pressure Judge Value_AutoClave',
    '3rd Pressure Judge Value_AutoClave',
    'Chamber Temp. Judge Value_AutoClave',
    
    'HEAD Standby Position X Collect Result_Fill1',
    'HEAD Standby Position Y Collect Result_Fill1',
    'HEAD Standby Position Z Collect Result_Fill1',
    'Head Clean Position X Collect Result_Fill1',
    'Head Clean Position Y Collect Result_Fill1',
    'Head Clean Position Z Collect Result_Fill1',
    'Head Purge Position X Collect Result_Fill1',
    'Head Purge Position Y Collect Result_Fill1',

    # Cure 변수는 거의 동일함 -> equipment별로 dam은 동일하기 때문에 자르기
    'CURE END POSITION X Collect Result_Fill2',
    'CURE END POSITION Z Collect Result_Fill2',
    'CURE END POSITION Θ Collect Result_Fill2',
    'CURE STANDBY POSITION X Collect Result_Fill2',
    'CURE STANDBY POSITION Z Collect Result_Fill2',
    'CURE STANDBY POSITION Θ Collect Result_Fill2',
    'CURE START POSITION X Collect Result_Fill2',
    'CURE START POSITION Z Collect Result_Fill2',
    'CURE START POSITION Θ Collect Result_Fill2',

    'CURE END POSITION X Collect Result_Dam',
    'CURE END POSITION Z Collect Result_Dam',
    'CURE END POSITION Θ Collect Result_Dam',
    'CURE STANDBY POSITION X Collect Result_Dam',
    'CURE STANDBY POSITION Z Collect Result_Dam',
    'CURE STANDBY POSITION Θ Collect Result_Dam',
    'CURE START POSITION X Collect Result_Dam',
    'CURE START POSITION Z Collect Result_Dam',
    'CURE START POSITION Θ Collect Result_Dam',
    
    # 라인 서클 축약해서 넣어둠
    'Stage1 Circle2 Distance Speed Collect Result_Dam',
    'Stage1 Circle3 Distance Speed Collect Result_Dam',
    'Stage1 Circle4 Distance Speed Collect Result_Dam',
    'Stage1 Line1 Distance Speed Collect Result_Dam',
    'Stage1 Line2 Distance Speed Collect Result_Dam',
    'Stage1 Line3 Distance Speed Collect Result_Dam',
    'Stage1 Line4 Distance Speed Collect Result_Dam',
    'Stage2 Circle2 Distance Speed Collect Result_Dam',
    'Stage2 Circle3 Distance Speed Collect Result_Dam',
    'Stage2 Circle4 Distance Speed Collect Result_Dam',
    'Stage2 Line1 Distance Speed Collect Result_Dam',
    'Stage2 Line2 Distance Speed Collect Result_Dam',
    'Stage2 Line3 Distance Speed Collect Result_Dam',
    'Stage2 Line4 Distance Speed Collect Result_Dam',
    'Stage3 Circle2 Distance Speed Collect Result_Dam',
    'Stage3 Circle3 Distance Speed Collect Result_Dam',
    'Stage3 Circle4 Distance Speed Collect Result_Dam',
    'Stage3 Line1 Distance Speed Collect Result_Dam',
    'Stage3 Line2 Distance Speed Collect Result_Dam',
    'Stage3 Line3 Distance Speed Collect Result_Dam',
    'Stage3 Line4 Distance Speed Collect Result_Dam',
    
    # 중복 변수
    'PalletID Collect Result_Fill1',
    'Production Qty Collect Result_Fill1',
    'Receip No Collect Result_Fill1',
    'PalletID Collect Result_Fill2',
    'Production Qty Collect Result_Fill2',
    'Receip No Collect Result_Fill2',
    'Equipment_Fill1',
    'Model.Suffix_Fill1',
    'Workorder_Fill1',
    'Equipment_Fill2',
    'Model.Suffix_Fill2',
    'Workorder_Fill2',
    'Model.Suffix_AutoClave',
    'Workorder_AutoClave',
    'Workorder_Dam',
    ####################################################################
    # 새로운 변수(파생변수 생성 도중 제거하고 싶은 변수 넣기)
    
    
]

## 4. Matched Data

### 뒤로 밀린 데이터 원상복구 진행하기

In [13]:
# 위치 옮기기
train_move = move_data(train)
test_move = move_data(test)

  dam.loc[dam['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isin(['OK', np.nan]), dam.columns[24:]] = dam_mask
  fill2.loc[fill2['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isin(['OK', np.nan]), fill2.columns[24:]] = fill2_mask
  dam.loc[dam['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].isin(['OK', np.nan]), dam.columns[24:]] = dam_mask
  fill2.loc[fill2['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].isin(['OK', np.nan]), fill2.columns[24:]] = fill2_mask


### Modified Equipment data type

In [14]:
# Equipment 번호만 가져오기
train_move['Equipment_Dam'] = train_move['Equipment_Dam'].str.slice(15, 16)
train_move['Equipment_Fill1'] = train_move['Equipment_Fill1'].str.slice(17, 18)
train_move['Equipment_Fill2'] = train_move['Equipment_Fill2'].str.slice(17, 18)

test_move['Equipment_Dam'] = test_move['Equipment_Dam'].str.slice(15, 16)
test_move['Equipment_Fill1'] = test_move['Equipment_Fill1'].str.slice(17, 18)
test_move['Equipment_Fill2'] = test_move['Equipment_Fill2'].str.slice(17, 18)

### Type Change

In [15]:
# 데이터 타입 변경하기
type_change = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'Equipment_Dam',
    'Equipment_Fill1',
    'Equipment_Fill2'
]

types = [
    'float64', 'float64', 'float64', 'int64', 'int64', 'int64'
]
for i, t in zip(type_change, types):
    train_move[i] = train_move[i].astype(t)
    test_move[i] = test_move[i].astype(t)

### Fill1의 X좌표 바꾸기

- 바꿔야할 칼럼  
DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1  
DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1  
DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1  
Dispense Volume(Stage1) Collect Result_Fill1  
Dispense Volume(Stage2) Collect Result_Fill1  
Dispense Volume(Stage3) Collect Result_Fill1  
HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1  
HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1  
HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1  
HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1  
HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1  
HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1  
HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1  
HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1  
HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1

### Equipment 1

In [16]:
# 100대 값을 갖는 X위치 1, 3 체인지
condition = (train_move['Equipment_Fill1'] == 1) & (train_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] < 200)
condition2 = (test_move['Equipment_Fill1'] == 1) & (test_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] < 200)

# 바꿔야 되는 칼럼
As = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
]

Bs = [
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
    'Dispense Volume(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'
]

# 교환
for a, b in zip(As, Bs):
    train_move.loc[condition, [a, b]] = train_move.loc[condition, [b, a]].values
    test_move.loc[condition2, [a, b]] = test_move.loc[condition2, [b, a]].values

In [17]:
# 400대 값을 갖는 X위치 1, 2 체인지
condition = (train_move['Equipment_Fill1'] == 1) & (train_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].between(400, 500))
condition2 = (test_move['Equipment_Fill1'] == 1) & (test_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].between(400, 500))

# 바꿔야 되는 칼럼
As = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
]

Bs = [
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
    'Dispense Volume(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'
]

# 교환
for a, b in zip(As, Bs):
    train_move.loc[condition, [a, b]] = train_move.loc[condition, [b, a]].values
    test_move.loc[condition2, [a, b]] = test_move.loc[condition2, [b, a]].values

### Equipment 2

In [18]:
# 400대 값을 갖는 X위치 1, 2 체인지
condition = (train_move['Equipment_Fill1'] == 2) & (train_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].between(400, 500))
condition2 = (test_move['Equipment_Fill1'] == 2) & (test_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].between(400, 500))

# 바꿔야 되는 칼럼
As = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
]

Bs = [
    'DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1',
    'Dispense Volume(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'
]

# 교환
for a, b in zip(As, Bs):
    train_move.loc[condition, [a, b]] = train_move.loc[condition, [b, a]].values
    test_move.loc[condition2, [a, b]] = test_move.loc[condition2, [b, a]].values

In [19]:
# 100대 값을 갖는 X위치 1, 3 체인지
condition = (train_move['Equipment_Fill1'] == 2) & (train_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] > 800)
condition2 = (test_move['Equipment_Fill1'] == 2) & (test_move['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] > 800)

# 바꿔야 되는 칼럼
As = [
    'DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1',
    'Dispense Volume(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
]

Bs = [
    'DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1',
    'Dispense Volume(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'
]

# 교환
for a, b in zip(As, Bs):
    train_move.loc[condition, [a, b]] = train_move.loc[condition, [b, a]].values
    test_move.loc[condition2, [a, b]] = test_move.loc[condition2, [b, a]].values

In [20]:
# 바뀐 데이터 이름 바꾸기
df_train = train_move.copy()
df_test = test_move.copy()

### New Column

In [21]:
# 불일치 변수
df_train['inconsistant'] = 0
df_test['inconsistant'] = 0

# 장착
for i in columnname:
    inconsistant(df_train, i, 'inconsistant', True)
    inconsistant(df_test, i, 'inconsistant', False)

In [22]:
# 시간이 0이하, 900이상인 값은 이상치로 분류
for j in ['Machine Tact time Collect Result_Dam', 'Machine Tact time Collect Result_Fill1', 'Machine Tact time Collect Result_Fill2']:
    cri = [
        df_train[j] <= 0,
        df_train[j] > 900
    ]
    cri2 = [
        df_test[j] <= 0,
        df_test[j] > 900
    ]
    con = [
        1, 1
    ]
    df_train['inconsistant'] = np.select(cri, con, default = df_train['inconsistant'])
    df_test['inconsistant'] = np.select(cri2, con, default = df_test['inconsistant'])

### Speed Line & Circle

In [23]:
# 라인별로 속도가 같아야 정상이다.
df_train['Stage1 Line Sum Speed_Dam'] = df_train['Stage1 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line4 Distance Speed Collect Result_Dam']
df_train['Stage2 Line Sum Speed_Dam'] = df_train['Stage2 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line4 Distance Speed Collect Result_Dam']
df_train['Stage3 Line Sum Speed_Dam'] = df_train['Stage3 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line4 Distance Speed Collect Result_Dam']


In [24]:
# 라인별로 속도가 같아야 정상이다.
df_test['Stage1 Line Sum Speed_Dam'] = df_test['Stage1 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line4 Distance Speed Collect Result_Dam']
df_test['Stage2 Line Sum Speed_Dam'] = df_test['Stage2 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line4 Distance Speed Collect Result_Dam']
df_test['Stage3 Line Sum Speed_Dam'] = df_test['Stage3 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line4 Distance Speed Collect Result_Dam']

### time 보정

In [25]:
# time 보정하기
df_train['round_1st_time'] = round(df_train['1st Pressure 1st Pressure Unit Time_AutoClave'], -1)
df_train['round_2nd_time'] = round(df_train['2nd Pressure Unit Time_AutoClave'], -1)
df_train['round_3rd_time'] = round(df_train['3rd Pressure Unit Time_AutoClave'], -1)
df_train['all_time'] = round(df_train['Chamber Temp. Unit Time_AutoClave'], -1)

df_test['round_1st_time'] = round(df_test['1st Pressure 1st Pressure Unit Time_AutoClave'], -1)
df_test['round_2nd_time'] = round(df_test['2nd Pressure Unit Time_AutoClave'], -1)
df_test['round_3rd_time'] = round(df_test['3rd Pressure Unit Time_AutoClave'], -1)
df_test['all_time'] = round(df_test['Chamber Temp. Unit Time_AutoClave'], -1)

time_col = [
    '1st Pressure 1st Pressure Unit Time_AutoClave',
    '2nd Pressure Unit Time_AutoClave',
    '3rd Pressure Unit Time_AutoClave',
    'Chamber Temp. Unit Time_AutoClave'
]

# 적용
df_train = df_train.drop(columns = time_col, axis = 1)
df_test = df_test.drop(columns = time_col, axis = 1)

### Fill2 경화

In [26]:
# cure 위치 차이 즉 방향을 나타내는 변수 생성
df_train['cure_x_direction_fill'] = np.where(df_train['CURE START POSITION X Collect Result_Fill2'] - df_train['CURE END POSITION X Collect Result_Fill2'] > 0, 1, -1)
df_train['cure_y_dist_fill'] = df_train['CURE START POSITION Z Collect Result_Fill2'] - df_train['CURE END POSITION Z Collect Result_Fill2']

df_test['cure_x_direction_fill'] = np.where(df_test['CURE START POSITION X Collect Result_Fill2'] - df_test['CURE END POSITION X Collect Result_Fill2'] > 0, 1, -1)
df_test['cure_y_dist_fill'] = df_test['CURE START POSITION Z Collect Result_Fill2'] - df_test['CURE END POSITION Z Collect Result_Fill2']

### 각 좌표별 차이

In [27]:
df_train['Minus1_Dam']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_train['Minus2_Dam']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']

df_test['Minus1_Dam']= df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_test['Minus2_Dam']= df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']

df_train['Minus1_Fill1']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_train['Minus2_Fill1']= df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']

df_test['Minus1_Fill1']= df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_test['Minus2_Fill1']= df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']

df_train['Minus1Y_Dam']= df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam']
df_train['Minus2Y_Dam']= df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']

df_test['Minus1Y_Dam']= df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam']
df_test['Minus2Y_Dam']= df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']

df_train['Minus1Y_Fill1']= df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
df_train['Minus2Y_Fill1']= df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']

df_test['Minus1Y_Fill1']= df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
df_test['Minus2Y_Fill1']= df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']

df_train['Minus1Y_Dam'] = df_train['Minus1Y_Dam'].apply(lambda x: 1 if x > 2 or x < -2 else 0)
df_train['Minus2Y_Dam'] = df_train['Minus2Y_Dam'].apply(lambda x: 1 if x > 2 or x < -2 else 0)

df_test['Minus1Y_Dam'] = df_test['Minus1Y_Dam'].apply(lambda x: 1 if x > 2 or x < -2 else 0)
df_test['Minus2Y_Dam'] = df_test['Minus2Y_Dam'].apply(lambda x: 1 if x > 2 or x < -2 else 0)

### 타입 변경

In [28]:
df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'].astype(float)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].astype(float)

### Workorder 쪼개기

In [29]:
df_train['workorder_first'] = df_train['Workorder_Dam'].str.slice(0, 2)
df_train['workorder_third'] = df_train['Workorder_Dam'].str.slice(2, 4)

df_test['workorder_first'] = df_test['Workorder_Dam'].str.slice(0, 2)
df_test['workorder_third'] = df_test['Workorder_Dam'].str.slice(2, 4)

### Columns Drop

In [31]:
# 수많은 칼럼 버리기
df_train = df_train.drop(columns = drop_col, axis = 1)
df_test = df_test.drop(columns = drop_col, axis = 1)

### Type 수정

In [32]:
label_encoders = {}
categorical_features = ['workorder_first', 'workorder_third', 'Model.Suffix_Dam']

# 시드 설정
np.random.seed(42)
for feature in categorical_features:
    le = LabelEncoder()
    df_train[feature] = le.fit_transform(df_train[feature])
    
    # 검증 데이터에 있는 새로운 값에 대해 처리
    unique_values = set(df_test[feature].unique()) - set(le.classes_)
    if unique_values:
        # 새로운 값들을 인코딩할 무작위 숫자 생성
        new_labels = np.random.randint(0, len(le.classes_), size=len(unique_values))
        # 새로운 값들을 인코딩
        le.classes_ = np.append(le.classes_, list(unique_values))
        le.transform(list(unique_values))  # transform을 호출해서 classes_ 업데이트
    
    df_test[feature] = le.transform(df_test[feature])
    label_encoders[feature] = le

### target 0, 1 변환

In [33]:
df_train['target'] = np.where(df_train['target'] == 'Normal', 0, 1)

### 이름 바꾸기

In [34]:
name_dic = {
    'Equipment_Dam': 'equipment',
    'Model.Suffix_Dam': 'model_suffix',
    'Workorder_Dam': 'workorder',
    'PalletID Collect Result_Dam':'pallet_id',
    'Production Qty Collect Result_Dam': 'qty',
    'Receip No Collect Result_Dam': 'receip'
}

df_train.rename(columns = name_dic, inplace = True)
df_test.rename(columns = name_dic, inplace = True)

In [35]:
# collect result 빼자
df_train.columns = df_train.columns.str.replace(' Collect Result', '')
df_test.columns = df_test.columns.str.replace(' Collect Result', '')

In [36]:
# LGBM이 공백 넣지 말래요
df_train.columns = df_train.columns.str.replace(' ', '_')
df_test.columns = df_test.columns.str.replace(' ', '_')

In [37]:
# 애초에 비정상인 값은 굳이 학습시킬 이유 없다.
df_train_adj = df_train[df_train['inconsistant'] == 0]

In [38]:
# 두 집단으로 나누기
equip1 = df_train_adj[df_train_adj['equipment'] == 1]
equip2 = df_train_adj[df_train_adj['equipment'] == 2]

equip1_test = df_test[df_test['equipment'] == 1]
equip2_test = df_test[df_test['equipment'] == 2]

In [39]:
# 집단을 나누는 기준을 제외시키기
equip1.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)
equip2.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)

equip1_test.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)
equip2_test.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  equip1.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  equip2.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  equip1_test.drop(['inconsistant', 'equipment'], axis = 1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/ind

In [40]:
equip1.columns

Index(['model_suffix', 'CURE_SPEED_Dam', 'DISCHARGED_SPEED_OF_RESIN_Dam',
       'DISCHARGED_TIME_OF_RESIN(Stage1)_Dam',
       'DISCHARGED_TIME_OF_RESIN(Stage2)_Dam',
       'DISCHARGED_TIME_OF_RESIN(Stage3)_Dam', 'Dispense_Volume(Stage1)_Dam',
       'Dispense_Volume(Stage2)_Dam', 'Dispense_Volume(Stage3)_Dam',
       'HEAD_NORMAL_COORDINATE_X_AXIS(Stage1)_Dam',
       'HEAD_NORMAL_COORDINATE_X_AXIS(Stage2)_Dam',
       'HEAD_NORMAL_COORDINATE_X_AXIS(Stage3)_Dam',
       'HEAD_NORMAL_COORDINATE_Y_AXIS(Stage1)_Dam',
       'HEAD_NORMAL_COORDINATE_Y_AXIS(Stage2)_Dam',
       'HEAD_NORMAL_COORDINATE_Y_AXIS(Stage3)_Dam',
       'HEAD_NORMAL_COORDINATE_Z_AXIS(Stage1)_Dam',
       'HEAD_NORMAL_COORDINATE_Z_AXIS(Stage2)_Dam',
       'HEAD_NORMAL_COORDINATE_Z_AXIS(Stage3)_Dam',
       'Head_Clean_Position_Z_Dam', 'Head_Purge_Position_Z_Dam',
       'Machine_Tact_time_Dam', 'pallet_id', 'qty', 'receip',
       'Stage1_Circle1_Distance_Speed_Dam',
       'Stage2_Circle1_Distance_Speed_Dam',
  

In [41]:
# univariate validation을 통한 변수 제외 목록 뽑기
except_value = []
for i in equip1.columns:
    print("===============================\n")
    print(f"{i} 변수의 검정")
    
    if len(np.unique(equip1[i])) < 14:
        chiresult = chi2_contingency(pd.crosstab(equip1[i], equip1['target']), correction=False)
        print('Chi square: {}'.format(chiresult[0]))
        print('P-value: {:.4f}'.format(chiresult[1]))
        
        if chiresult[1] > 0.05:
            except_value.append(i)
        
    else:
        
        l = levene(equip1['target'], equip1[i])
        b = bartlett(equip1['target'], equip1[i])
        print(l)
        print(b)
        
        
        if (l[1] >= 0.05) or (b[1] >= 0.05):
            t = ttest_ind(equip1['target'], equip1[i], equal_var = True)
            print(t)
            
        else:
            t = ttest_ind(equip1['target'], equip1[i], equal_var = False)
            print(t)
            
        if t[1] > 0.05:
            except_value.append(i)


model_suffix 변수의 검정
Chi square: 10.12947141892338
P-value: 0.1193

CURE_SPEED_Dam 변수의 검정
Chi square: 17.763952563218012
P-value: 0.0014

DISCHARGED_SPEED_OF_RESIN_Dam 변수의 검정
Chi square: 67.04620917313018
P-value: 0.0000

DISCHARGED_TIME_OF_RESIN(Stage1)_Dam 변수의 검정
LeveneResult(statistic=34788.391699280284, pvalue=0.0)
BartlettResult(statistic=105445.10615572892, pvalue=0.0)
TtestResult(statistic=-553.972191169205, pvalue=0.0, df=25151.205176769763)

DISCHARGED_TIME_OF_RESIN(Stage2)_Dam 변수의 검정
LeveneResult(statistic=79478.53665547512, pvalue=0.0)
BartlettResult(statistic=73795.57371022321, pvalue=0.0)
TtestResult(statistic=-497.5445477976165, pvalue=0.0, df=25633.968465143844)

DISCHARGED_TIME_OF_RESIN(Stage3)_Dam 변수의 검정
LeveneResult(statistic=34329.73826270856, pvalue=0.0)
BartlettResult(statistic=105226.64413825389, pvalue=0.0)
TtestResult(statistic=-556.2591418548592, pvalue=0.0, df=25152.836132511195)

Dispense_Volume(Stage1)_Dam 변수의 검정
LeveneResult(statistic=12883.138525402992, pv

In [42]:
except_value

['model_suffix',
 'pallet_id',
 'HEAD_NORMAL_COORDINATE_Y_AXIS(Stage2)_Fill1',
 'CURE_SPEED_Fill2',
 'cure_x_direction_fill',
 'Minus1Y_Dam',
 'Minus2Y_Dam']