In [76]:
# !pip install category_encoders
# !pip install tqdm
# !pip install lightgbm
# !pip install xgboost
# !pip install catboost
# !pip install torch
# !pip install seaborn

# 제품 이상여부 판별 프로젝트

## 0. Environment & Data Loading & Shift

### Library

In [77]:
import os
import re
import gc

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score)

import matplotlib as plt
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

import warnings
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

import random
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42) # Seed 고정

### Data Loading

In [78]:
ROOT_DIR = "data"
RANDOM_STATE = 42

df_train = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

df_test_copy = df_test.copy()

In [79]:
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].value_counts()

HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
OK       11293
549       7271
162.4     3580
550       2398
550.3     1909
549.5     1263
548.5       26
Name: count, dtype: int64

In [80]:
df_test_후처리용 = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

In [81]:
#Drop columns with percentage of missing value and drop only one unique value
drop_cols = []
for column in df_train.columns:
    if ((df_train[column].isnull().sum() / df_train.shape[0]) > 0.4) or (df_train[column].nunique() <=1):
        drop_cols.append(column)
    
df_train.drop(columns = drop_cols, inplace = True)
df_train.head()

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Collect Date_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,...,Head Clean Position Z Collect Result_Fill2,Head Purge Position X Collect Result_Fill2,Head Purge Position Y Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,WorkMode Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,2024-04-25 11:10:00,240.0,2.5,-90,100,1030,-90,...,50.0,91.8,270,50,114.612,19.9,7,127,1,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,2023-09-19 14:30:00,240.0,2.5,-90,70,1030,-90,...,91.8,270.0,50,85,19.6,7.0,185,1,0,Normal
2,Dam dispenser #2,AJX75334501,4E1X9167-1,2024-03-05 09:30:00,1000.0,12.5,90,85,280,90,...,50.0,91.8,270,50,114.612,19.8,10,73,1,Normal
3,Dam dispenser #2,AJX75334501,3K1X0057-1,2023-09-25 15:40:00,1000.0,12.5,90,70,280,90,...,91.8,270.0,50,85,19.9,12.0,268,1,0,Normal
4,Dam dispenser #1,AJX75334501,3HPM0007-1,2023-06-27 13:20:00,240.0,2.5,-90,70,1030,-90,...,91.8,270.0,50,85,19.7,8.0,121,1,0,Normal


In [82]:
time_features_tr = df_train[['Collect Date_Dam','Collect Date_Fill1','Collect Date_Fill2','Collect Date_AutoClave']]
time_features_te = df_test[['Collect Date_Dam','Collect Date_Fill1','Collect Date_Fill2','Collect Date_AutoClave']]

df_train.drop(columns = ['Collect Date_Dam','Collect Date_Fill1','Collect Date_Fill2','Collect Date_AutoClave'],inplace=True)

### DATA SHIFT

### train

In [83]:
column  = list(df_train.columns)

In [84]:
#WorkMode Collect Result_Fill1 기준(Dam)
df_train_1 = df_train[df_train['WorkMode Collect Result_Fill1']!=7]
df_train_2 = df_train[df_train['WorkMode Collect Result_Fill1']==7]
replace_data  = df_train_1.iloc[:,15:69].shift(-1, axis=1).drop(columns=['Dispense Volume(Stage3) Collect Result_Dam','WorkMode Collect Result_Dam'])
df_train.loc[df_train_1.index,replace_data.columns] = replace_data
df_train = df_train.drop(columns  = ['WorkMode Collect Result_Dam'])

#WorkMode Collect Result_Fill1 기준(Fill1)
df_train_1 = df_train[df_train['WorkMode Collect Result_Fill1']!=7]
df_train_2 = df_train[df_train['WorkMode Collect Result_Fill1']==7]
replace_data  = df_train_1.iloc[:,89:112].shift(-1, axis=1).drop(columns=['WorkMode Collect Result_Fill1'])
df_train.loc[df_train_1.index,replace_data.columns] = replace_data
df_train = df_train.drop(columns  = ['WorkMode Collect Result_Fill1'])

#WorkMode Collect Result_Fill1 기준(Fill2)
idx = replace_data.index
df_train_1 = df_train.loc[idx]
replace_data  = df_train_1.iloc[:,120:143].shift(-1, axis=1).drop(columns=['WorkMode Collect Result_Fill2'])
df_train.loc[df_train_1.index,replace_data.columns] = replace_data
df_train = df_train.drop(columns  = ['WorkMode Collect Result_Fill2'])

df_train.isna().sum().sum()

0

In [85]:
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].value_counts()

HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam
549      7271
162.4    3580
162.4    3487
162.7    2454
550      2398
164.2    2358
550.4    2350
550.3    1909
550.3    1868
550.6    1763
551.5    1440
549.5    1263
551.7     970
548.9     843
161.2     826
550.5     747
163.5     566
164.1     515
551.8     495
161.7     489
552.1     480
552.0     446
549.4     446
164.5     376
163.3     244
164.0     217
163.8     208
551.3     127
551.2     101
163.4      67
550.7      66
163.7      38
163.0      29
548.5      26
551.6      20
163.6      13
163.1       9
551.0       1
Name: count, dtype: int64

### test

In [86]:
column.remove('target')
df_test = df_test[column]

In [87]:
df_test_1 = df_test[df_test['WorkMode Collect Result_Fill1']!=7]
df_test_2 = df_test[df_test['WorkMode Collect Result_Fill1']==7]
replace_data = df_test_1.iloc[:,15:69].shift(-1, axis=1).drop(columns=['Dispense Volume(Stage3) Collect Result_Dam','WorkMode Collect Result_Dam'])
df_test.loc[df_test_1.index,replace_data.columns] = replace_data
df_test = df_test.drop(columns  = ['WorkMode Collect Result_Dam'])


df_test_1 = df_test[df_test['WorkMode Collect Result_Fill1']!=7]
df_test_2 = df_test[df_test['WorkMode Collect Result_Fill1']==7]
replace_data  = df_test_1.iloc[:,89:112].shift(-1, axis=1).drop(columns=['WorkMode Collect Result_Fill1'])
df_test.loc[df_test_1.index,replace_data.columns] = replace_data
df_test = df_test.drop(columns  = ['WorkMode Collect Result_Fill1'])


idx = replace_data.index
df_test_1 = df_test.loc[idx]
replace_data  = df_test_1.iloc[:,120:143].shift(-1, axis=1).drop(columns=['WorkMode Collect Result_Fill2'])
df_test.loc[df_test_1.index,replace_data.columns] = replace_data
df_test = df_test.drop(columns  = ['WorkMode Collect Result_Fill2'])

df_test.isna().sum().sum()

0

## Float Transfrom

In [88]:
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].astype(float)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].astype(float)
df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].astype(float)

In [89]:
df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].astype(float)
df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].astype(float)
df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].astype(float)

## 1. Data Preprocessing

In [90]:
#Drop columns only one unique value
drop_cols = []
for column in df_train.columns:
    if df_train[column].nunique() == 1:
        drop_cols.append(column)
        
df_train.drop(columns = drop_cols, inplace = True)
df_test.drop(columns = drop_cols, inplace = True)
display(df_train.head())
display(df_test.head())

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,240.0,2.5,-90,100,1030,-90,16,...,428.0,243.7,243.7,243.7,114.612,19.9,7.0,127,1,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,240.0,2.5,-90,70,1030,-90,10,...,428.0,243.7,243.7,243.7,85.0,19.6,7.0,185,1,Normal
2,Dam dispenser #2,AJX75334501,4E1X9167-1,1000.0,12.5,90,85,280,90,16,...,1324.2,243.5,243.5,243.5,114.612,19.8,10.0,73,1,Normal
3,Dam dispenser #2,AJX75334501,3K1X0057-1,1000.0,12.5,90,70,280,90,10,...,1324.2,243.5,243.5,243.5,85.0,19.9,12.0,268,1,Normal
4,Dam dispenser #1,AJX75334501,3HPM0007-1,240.0,2.5,-90,70,1030,-90,10,...,428.0,243.7,243.7,243.7,85.0,19.7,8.0,121,1,Normal


Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2
0,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,1324.2,1324.2,243.5,243.5,243.5,85.0,19.8,13.0,195,1
1,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,1324.2,1324.2,243.5,243.5,243.5,85.0,19.8,14.0,256,1
2,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,427.9,428.0,243.7,243.7,243.7,85.0,19.7,1.0,98,1
3,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,1324.2,1324.2,243.5,243.5,243.5,85.0,20.0,14.0,0,1
4,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,427.9,428.0,243.7,243.7,243.7,85.0,19.8,1.0,215,1


In [91]:
#Time rematching
df_train = pd.concat([df_train, time_features_tr], axis = 1)
df_test = pd.concat([df_test, time_features_te], axis = 1)

display(df_train.head())
display(df_test.head())

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,target,Collect Date_Dam,Collect Date_Fill1,Collect Date_Fill2,Collect Date_AutoClave
0,Dam dispenser #1,AJX75334505,4F1XA938-1,240.0,2.5,-90,100,1030,-90,16,...,114.612,19.9,7.0,127,1,Normal,2024-04-25 11:10:00,2024-04-25 11:20:00,2024-04-25 11:20:00,2024-04-25 11:50:00
1,Dam dispenser #1,AJX75334505,3KPM0016-2,240.0,2.5,-90,70,1030,-90,10,...,85.0,19.6,7.0,185,1,Normal,2023-09-19 14:30:00,2023-09-19 14:30:00,2023-09-19 14:30:00,2023-09-19 15:00:00
2,Dam dispenser #2,AJX75334501,4E1X9167-1,1000.0,12.5,90,85,280,90,16,...,114.612,19.8,10.0,73,1,Normal,2024-03-05 09:30:00,2024-03-05 09:30:00,2024-03-05 09:30:00,2024-03-05 10:10:00
3,Dam dispenser #2,AJX75334501,3K1X0057-1,1000.0,12.5,90,70,280,90,10,...,85.0,19.9,12.0,268,1,Normal,2023-09-25 15:40:00,2023-09-25 15:40:00,2023-09-25 15:40:00,2023-09-25 16:20:00
4,Dam dispenser #1,AJX75334501,3HPM0007-1,240.0,2.5,-90,70,1030,-90,10,...,85.0,19.7,8.0,121,1,Normal,2023-06-27 13:20:00,2023-06-27 13:20:00,2023-06-27 13:20:00,2023-06-27 14:00:00


Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,...,HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2,Head Purge Position Z Collect Result_Fill2,Machine Tact time Collect Result_Fill2,PalletID Collect Result_Fill2,Production Qty Collect Result_Fill2,Receip No Collect Result_Fill2,Collect Date_Dam,Collect Date_Fill1,Collect Date_Fill2,Collect Date_AutoClave
0,Dam dispenser #2,AJX75334501,3J1XF767-1,1000.0,12.5,90,70,280,90,10,...,243.5,85.0,19.8,13.0,195,1,2023-09-15 13:20:00,2023-09-15 13:30:00,2023-09-15 13:30:00,2023-09-15 14:00:00
1,Dam dispenser #2,AJX75334501,4B1XD472-2,1000.0,12.5,90,70,280,90,16,...,243.5,85.0,19.8,14.0,256,1,2024-02-06 16:50:00,2024-02-06 16:50:00,2024-02-06 16:50:00,2024-02-06 17:30:00
2,Dam dispenser #1,AJX75334501,3H1XE355-1,240.0,2.5,-90,70,1030,-90,10,...,243.7,85.0,19.7,1.0,98,1,2023-07-14 11:30:00,2023-07-14 11:40:00,2023-07-14 11:40:00,2023-07-14 12:10:00
3,Dam dispenser #2,AJX75334501,3L1XA128-1,1000.0,12.5,90,70,280,90,10,...,243.5,85.0,20.0,14.0,0,1,2023-11-03 08:00:00,2023-11-03 08:00:00,2023-11-03 08:00:00,2023-11-03 08:30:00
4,Dam dispenser #1,AJX75334501,4A1XA639-1,240.0,2.5,-90,70,1030,-90,16,...,243.7,85.0,19.8,1.0,215,1,2023-12-23 14:00:00,2023-12-23 14:00:00,2023-12-23 14:10:00,2023-12-23 14:40:00


In [92]:
#Int to Object Transformation
transform_col = [
                'Production Qty Collect Result_Dam','Production Qty Collect Result_Fill1','Production Qty Collect Result_Fill2',
                'PalletID Collect Result_Dam','PalletID Collect Result_Fill1','PalletID Collect Result_Fill2',
                'Receip No Collect Result_Dam','Receip No Collect Result_Fill1','Receip No Collect Result_Fill2']

df_train[transform_col] = df_train[transform_col].astype(int).astype(object)
df_test[transform_col] = df_test[transform_col].astype(int).astype(object)

In [93]:
#중복 columns 처리
##Equipment, Model.Suffix, Workorder 통일화
df_train.rename(columns = {'Equipment_Dam':'Equipment', 'Model.Suffix_Dam':'Model.Suffix', 'Workorder_Dam':'Workorder'}, inplace = True)
df_train.drop(columns = ['Equipment_Fill1', 'Equipment_Fill2',
                        'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2',
                        'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2'], inplace = True)
df_test.rename(columns = {'Equipment_Dam':'Equipment', 'Model.Suffix_Dam':'Model.Suffix', 'Workorder_Dam':'Workorder'}, inplace = True)
df_test.drop(columns = ['Equipment_Fill1', 'Equipment_Fill2',
                        'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2',
                        'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2'], inplace = True)

df_train['Equipment'] = df_train['Equipment'].map({'Dam dispenser #1':'Dispenser_1', 'Dam dispenser #2':'Dispneser_2'})
df_test['Equipment'] = df_test['Equipment'].map({'Dam dispenser #1':'Dispenser_1', 'Dam dispenser #2':'Dispneser_2'})

##Production Qty, PalletID, Receip No 통일화
df_train.rename(columns = {'Production Qty Collect Result_Dam':'Production Qty Collect Result', 
                           'PalletID Collect Result_Dam':'PalletID Collect Result',
                          'Receip No Collect Result_Dam':'Receip No Collect Result'}, inplace = True)
df_train.drop(columns = ['Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2',
                        'PalletID Collect Result_Fill1', 'PalletID Collect Result_Fill2',
                        'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2'], inplace = True)
df_test.rename(columns = {'Production Qty Collect Result_Dam':'Production Qty Collect Result', 
                           'PalletID Collect Result_Dam':'PalletID Collect Result',
                          'Receip No Collect Result_Dam':'Receip No Collect Result'}, inplace = True)
df_test.drop(columns = ['Production Qty Collect Result_Fill1', 'Production Qty Collect Result_Fill2',
                        'PalletID Collect Result_Fill1', 'PalletID Collect Result_Fill2',
                        'Receip No Collect Result_Fill1', 'Receip No Collect Result_Fill2'], inplace = True)

In [94]:
df_train['Work_Equip'] = df_train['Equipment'] + df_train['Workorder']
df_test['Work_Equip'] = df_test['Equipment'] + df_test['Workorder']

In [95]:
#Equipment/PalletID, Model.Suffix/Receip No 통일화
df_train.rename(columns = {'Equipment':'Equipment_PalletID', 
                           'Model.Suffix':'Model_Receip'}, inplace = True)
df_train['Equipment_PalletID'] = df_train['Equipment_PalletID'] + '_' + df_train['PalletID Collect Result'].astype(str)
df_train['Model_Receip'] = df_train['Model_Receip'] + '_' + df_train['Receip No Collect Result'].astype(str)
df_test.rename(columns = {'Equipment':'Equipment_PalletID', 
                           'Model.Suffix':'Model_Receip'}, inplace = True)
df_test['Equipment_PalletID'] = df_test['Equipment_PalletID'] + '_' + df_test['PalletID Collect Result'].astype(str)
df_test['Model_Receip'] = df_test['Model_Receip'] + '_' + df_test['Receip No Collect Result'].astype(str)

df_train.drop(columns = ['PalletID Collect Result', 'Receip No Collect Result'], inplace = True)
df_test.drop(columns = ['PalletID Collect Result', 'Receip No Collect Result'], inplace = True)

In [96]:
#Production Qty Transformation
for lst in tqdm(df_train['Workorder'].unique()):
    
    tr = df_train[df_train['Workorder'] == lst]
    te = df_test[df_test['Workorder'] == lst]

    tr['Production_Sequence_Ratio'] = tr['Production Qty Collect Result'] / tr['Production Qty Collect Result'].max()
    te['Production_Sequence_Ratio'] = te['Production Qty Collect Result'] / tr['Production Qty Collect Result'].max()
    
    tr['sin_sequence'] = np.sin(2 * np.pi * tr['Production Qty Collect Result'].astype(float) / 59.0)
    te['sin_sequence'] = np.sin(2 * np.pi * te['Production Qty Collect Result'].astype(float) / 59.0)
    
    tr['cos_sequence'] = np.cos(2 * np.pi * tr['Production Qty Collect Result'].astype(float) / 59.0)
    te['cos_sequence'] = np.cos(2 * np.pi * te['Production Qty Collect Result'].astype(float) / 59.0)
    
    df_train.loc[tr.index,'Production_Sequence_ratio'] = tr['Production_Sequence_Ratio']
    df_test.loc[te.index,'Production_Sequence_ratio'] = te['Production_Sequence_Ratio']

    df_train.loc[tr.index,'sin_sequence'] = tr['sin_sequence']
    df_test.loc[te.index,'sin_sequence'] = te['sin_sequence']
    
    df_train.loc[tr.index,'cos_sequence'] = tr['cos_sequence']
    df_test.loc[te.index,'cos_sequence'] = te['cos_sequence'] 

100%|██████████| 663/663 [00:06<00:00, 109.93it/s]


## 2. Feature Engineering

### Model detail

In [97]:
#패턴 통일
def remove_zeros(input_string):
    # 패턴 정의: - 문자 뒤에 000이 있는 경우
    pattern = r'-(000)'

    # re.sub를 사용하여 패턴을 대체
    modified_string = re.sub(pattern, '-', input_string)
    return modified_string

df_train['Workorder'] = df_train['Workorder'].apply(lambda x : remove_zeros(x))
df_test['Workorder'] = df_test['Workorder'].apply(lambda x : remove_zeros(x))

### Dam Features

#### Workorder & Production Qty

In [98]:
#문자열 정보 추출
df_train['Wo_Number'] = df_train['Workorder'].apply(lambda x : x[:1]).astype(str)
df_train['number_alpha'] = df_train['Workorder'].apply(lambda x : x[:2]).astype(str)
df_train['alpha'] = df_train['Workorder'].apply(lambda x : x[1]).astype(str)
df_train['Wo_Main'] = df_train['Workorder'].apply(lambda x : x[:3]).astype(str)
df_train['Wo_Sub'] = df_train['Workorder'].apply(lambda x : x[3]).astype(str)
df_train['Wo_Detail_1'] = df_train['Workorder'].apply(lambda x : x[4]).astype(str)
df_train['Wo_Detail_2'] = df_train['Workorder'].apply(lambda x : x[5:]).astype(str)
df_train['last'] = df_train['Workorder'].apply(lambda x : x[-1]).astype(str)

df_train['sj_1'] = df_train['Workorder'].apply(lambda x : x[:4]).astype(str)
df_train['sj_5'] = df_train['Workorder'].apply(lambda x : x[:8]).astype(str)
df_train['sj_6'] = df_train['Workorder'].apply(lambda x : x[2]).astype(str)
df_train['sj_7'] = df_train['Workorder'].apply(lambda x : x[5]).astype(str)
df_train['sj_8'] = df_train['Workorder'].apply(lambda x : x[4:8]).astype(str)
df_train['sj_9'] = df_train['Workorder'].apply(lambda x : x[-5:-2]).astype(str)

df_test['Wo_Number'] = df_test['Workorder'].apply(lambda x : x[:1]).astype(str)
df_test['number_alpha'] = df_test['Workorder'].apply(lambda x : x[:2]).astype(str)
df_test['alpha'] = df_test['Workorder'].apply(lambda x : x[1]).astype(str)
df_test['Wo_Main'] = df_test['Workorder'].apply(lambda x : x[:3]).astype(str)
df_test['Wo_Sub'] = df_test['Workorder'].apply(lambda x : x[3]).astype(str)
df_test['Wo_Detail_1'] = df_test['Workorder'].apply(lambda x : x[4]).astype(str)
df_test['Wo_Detail_2'] = df_test['Workorder'].apply(lambda x : x[5:]).astype(str)
df_test['last'] = df_test['Workorder'].apply(lambda x : x[-1]).astype(str)

df_test['sj_1'] = df_test['Workorder'].apply(lambda x : x[:4]).astype(str)
df_test['sj_5'] = df_test['Workorder'].apply(lambda x : x[:8]).astype(str)
df_test['sj_6'] = df_test['Workorder'].apply(lambda x : x[2]).astype(str)
df_test['sj_7'] = df_test['Workorder'].apply(lambda x : x[5]).astype(str)
df_test['sj_8'] = df_test['Workorder'].apply(lambda x : x[4:8]).astype(str)
df_test['sj_9'] = df_test['Workorder'].apply(lambda x : x[-5:-2]).astype(str)

#### Signal combination

In [99]:
#CURE(Dam)
##CURE 가동 방향 및 가동높이
df_train['Cure_HD_Dam'] = df_train['CURE END POSITION X Collect Result_Dam'] - df_train['CURE START POSITION X Collect Result_Dam']
df_test['Cure_HD_Dam'] = df_test['CURE END POSITION X Collect Result_Dam'] - df_test['CURE START POSITION X Collect Result_Dam']

##CURE 가동 시간
df_train['Cure_time_Dam'] = df_train['Cure_HD_Dam'] / df_train['CURE SPEED Collect Result_Dam']
df_test['Cure_time_Dam'] = df_test['Cure_HD_Dam'] / df_test['CURE SPEED Collect Result_Dam']

##CURE position category
df_train['Cure_position_category_Dam'] = df_train['CURE END POSITION X Collect Result_Dam'].map({240.0:'Down',1000.0:'Up'}).astype(str)
df_test['Cure_position_category_Dam'] = df_test['CURE END POSITION X Collect Result_Dam'].map({240.0:'Down',1000.0:'Up'}).astype(str)

##CURE Hysteresis
def f_H(x):
    return np.arccos(1 - 2 * np.tanh(5.16 * (x / (1 + 1.31 * x**0.99))**0.706))

u_c = df_train['CURE SPEED Collect Result_Dam']
theta_e_1 = df_train['CURE START POSITION Θ Collect Result_Dam']

mu = 1
gamma = 1
df_train['Theta_d_1_Dam'] = f_H((mu * u_c / gamma) + f_H(theta_e_1))

u_c = df_test['CURE SPEED Collect Result_Dam']
theta_e_1 = df_test['CURE START POSITION Θ Collect Result_Dam']  # equilibrium contact angle 데이터 배열

df_test['Theta_d_1_Dam'] = f_H((mu * u_c / gamma) + f_H(theta_e_1))

In [100]:
#DISCHARGED RESIN(Dam)
##Resin 토출기 이동거리
df_train['Discharged_Resin_distance_1_Dam'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_train['Discharged_Resin_distance_2_Dam'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_train['Discharged_Resin_distance_3_Dam'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
df_test['Discharged_Resin_distance_1_Dam'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_test['Discharged_Resin_distance_2_Dam'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_test['Discharged_Resin_distance_3_Dam'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Dam'] * df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

##Resin 토출 총 소요 시간
df_train['TotalTime_Discharged_Resin_Dam'] = df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] + df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] + df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
df_test['TotalTime_Discharged_Resin_Dam'] = df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] + df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] + df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

In [101]:
#DISPENSE(Dam)
##Dispensor 크기 변화
df_train['Dispense_volume_change1_Dam'] = df_train['Dispense Volume(Stage2) Collect Result_Dam'] - df_train['Dispense Volume(Stage1) Collect Result_Dam']
df_train['Dispense_volume_change2_Dam'] = df_train['Dispense Volume(Stage3) Collect Result_Dam'] - df_train['Dispense Volume(Stage2) Collect Result_Dam']
df_test['Dispense_volume_change1_Dam'] = df_test['Dispense Volume(Stage2) Collect Result_Dam'] - df_test['Dispense Volume(Stage1) Collect Result_Dam']
df_test['Dispense_volume_change2_Dam'] = df_test['Dispense Volume(Stage3) Collect Result_Dam'] - df_test['Dispense Volume(Stage2) Collect Result_Dam']

##Dispensor 크기 변화량
df_train['Dispense_volume_change1_abs_Dam'] = df_train['Dispense_volume_change1_Dam'].abs()
df_train['Dispense_volume_change2_abs_Dam'] = df_train['Dispense_volume_change2_Dam'].abs()
df_test['Dispense_volume_change1_abs_Dam'] = df_test['Dispense_volume_change1_Dam'].abs()
df_test['Dispense_volume_change2_abs_Dam'] = df_test['Dispense_volume_change2_Dam'].abs()

In [102]:
#DISCHARGED RESIN & DISPENSE (Dam)
##도포된 Resin 양
df_train['Total_Resin_volume1_Dam'] = df_train['Discharged_Resin_distance_1_Dam'] * df_train['Dispense Volume(Stage1) Collect Result_Dam']
df_train['Total_Resin_volume2_Dam'] = df_train['Discharged_Resin_distance_2_Dam'] * df_train['Dispense Volume(Stage2) Collect Result_Dam']
df_train['Total_Resin_volume3_Dam'] = df_train['Discharged_Resin_distance_3_Dam'] * df_train['Dispense Volume(Stage3) Collect Result_Dam']
df_test['Total_Resin_volume1_Dam'] = df_test['Discharged_Resin_distance_1_Dam'] * df_test['Dispense Volume(Stage1) Collect Result_Dam']
df_test['Total_Resin_volume2_Dam'] = df_test['Discharged_Resin_distance_2_Dam'] * df_test['Dispense Volume(Stage2) Collect Result_Dam']
df_test['Total_Resin_volume3_Dam'] = df_test['Discharged_Resin_distance_3_Dam'] * df_test['Dispense Volume(Stage3) Collect Result_Dam']

##시간당 토출량
df_train['Stage1_Dam_Volume_Speed'] = df_train['Dispense Volume(Stage1) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_train['Stage2_Dam_Volume_Speed'] = df_train['Dispense Volume(Stage2) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_train['Stage3_Dam_Volume_Speed'] = df_train['Dispense Volume(Stage3) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']
df_test['Stage1_Dam_Volume_Speed'] = df_test['Dispense Volume(Stage1) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_test['Stage2_Dam_Volume_Speed'] = df_test['Dispense Volume(Stage2) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_test['Stage3_Dam_Volume_Speed'] = df_test['Dispense Volume(Stage3) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

In [103]:
#HEAD(Dam)
##Dam X,Y,Z
df_train['Dam_X_1'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_train['Dam_X_2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']
df_train['Dam_X_3'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']
df_test['Dam_X_1'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam']
df_test['Dam_X_2'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']
df_test['Dam_X_3'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam']

df_train['Dam_Y_1'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam']
df_train['Dam_Y_2'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']
df_train['Dam_Y_3'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']
df_test['Dam_Y_1'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam']
df_test['Dam_Y_2'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']
df_test['Dam_Y_3'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam']

df_train['Dam_dist_1'] = np.sqrt(df_train['Dam_X_1']**2 +df_train['Dam_Y_1']**2)
df_train['Dam_dist_2'] = np.sqrt(df_train['Dam_X_2']**2 +df_train['Dam_Y_2']**2)
df_train['Dam_dist_3'] = np.sqrt(df_train['Dam_X_3']**2 +df_train['Dam_Y_3']**2)
df_test['Dam_dist_1'] = np.sqrt(df_test['Dam_X_1']**2 +df_test['Dam_Y_1']**2)
df_test['Dam_dist_2'] = np.sqrt(df_test['Dam_X_2']**2 +df_test['Dam_Y_2']**2)
df_test['Dam_dist_3'] = np.sqrt(df_test['Dam_X_3']**2 +df_test['Dam_Y_3']**2)

##노즐 분사 벡터 크기 & 각도
def calculate_magnitude(a,b,c):
    # 벡터 크기 계산
    return np.sqrt(a**2 + b**2 +c**2)

def calculate_angles(a,b,c):
    # 벡터 크기
    magnitude = calculate_magnitude(a,b,c)
    
    # 각 좌표축과의 각도 (cosine)
    angle_x = a / magnitude
    angle_y = b / magnitude
    angle_z = c / magnitude
    
    return angle_x, angle_y, angle_z

df_train['Vector_Dam_Stage1']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'])
df_train['Vector_Dam_Stage2']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'])
df_train['Vector_Dam_Stage3']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'])
df_test['Vector_Dam_Stage1']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'])
df_test['Vector_Dam_Stage2']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'])
df_test['Vector_Dam_Stage3']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'])

df_train['Angle_x_Dam_Stage1'],df_train['Angle_y_Dam_Stage1'],df_train['Angle_z_Dam_Stage1'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'])
df_train['Angle_x_Dam_Stage2'] ,df_train['Angle_y_Dam_Stage2'],df_train['Angle_z_Dam_Stage2'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'])
df_train['Angle_x_Dam_Stage3'] ,df_train['Angle_y_Dam_Stage3'],df_train['Angle_z_Dam_Stage3'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'])
df_test['Angle_x_Dam_Stage1'],df_test['Angle_y_Dam_Stage1'],df_test['Angle_z_Dam_Stage1'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam'])
df_test['Angle_x_Dam_Stage2'] ,df_test['Angle_y_Dam_Stage2'],df_test['Angle_z_Dam_Stage2'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam'])
df_test['Angle_x_Dam_Stage3'] ,df_test['Angle_y_Dam_Stage3'],df_test['Angle_z_Dam_Stage3'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam'])

In [104]:
#Circle & Line Distance Speed(Dam)
##Circle speed sum
df_train['Stage1_Circle_sum_Speed_Dam'] = df_train['Stage1 Circle1 Distance Speed Collect Result_Dam'] + df_train['Stage1 Circle2 Distance Speed Collect Result_Dam'] + df_train['Stage1 Circle3 Distance Speed Collect Result_Dam'] + df_train['Stage1 Circle4 Distance Speed Collect Result_Dam']
df_train['Stage2_Circle_sum_Speed_Dam'] = df_train['Stage2 Circle1 Distance Speed Collect Result_Dam'] + df_train['Stage2 Circle2 Distance Speed Collect Result_Dam'] + df_train['Stage2 Circle3 Distance Speed Collect Result_Dam'] + df_train['Stage2 Circle4 Distance Speed Collect Result_Dam']
df_train['Stage3_Circle_sum_Speed_Dam'] = df_train['Stage3 Circle1 Distance Speed Collect Result_Dam'] + df_train['Stage3 Circle2 Distance Speed Collect Result_Dam'] + df_train['Stage3 Circle3 Distance Speed Collect Result_Dam'] + df_train['Stage3 Circle4 Distance Speed Collect Result_Dam']
df_test['Stage1_Circle_sum_Speed_Dam'] = df_test['Stage1 Circle1 Distance Speed Collect Result_Dam'] + df_test['Stage1 Circle2 Distance Speed Collect Result_Dam'] + df_test['Stage1 Circle3 Distance Speed Collect Result_Dam'] + df_test['Stage1 Circle4 Distance Speed Collect Result_Dam']
df_test['Stage2_Circle_sum_Speed_Dam'] = df_test['Stage2 Circle1 Distance Speed Collect Result_Dam'] + df_test['Stage2 Circle2 Distance Speed Collect Result_Dam'] + df_test['Stage2 Circle3 Distance Speed Collect Result_Dam'] + df_test['Stage2 Circle4 Distance Speed Collect Result_Dam']
df_test['Stage3_Circle_sum_Speed_Dam'] = df_test['Stage3 Circle1 Distance Speed Collect Result_Dam'] + df_test['Stage3 Circle2 Distance Speed Collect Result_Dam'] + df_test['Stage3 Circle3 Distance Speed Collect Result_Dam'] + df_test['Stage3 Circle4 Distance Speed Collect Result_Dam']

##Line speed sum
df_train['Stage1_Line_sum_Speed_Dam'] = df_train['Stage1 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage1 Line4 Distance Speed Collect Result_Dam']
df_train['Stage2_Line_sum_Speed_Dam'] = df_train['Stage2 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage2 Line4 Distance Speed Collect Result_Dam']
df_train['Stage3_Line_sum_Speed_Dam'] = df_train['Stage3 Line1 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line2 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line3 Distance Speed Collect Result_Dam'] + df_train['Stage3 Line4 Distance Speed Collect Result_Dam']
df_test['Stage1_Line_sum_Speed_Dam'] = df_test['Stage1 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage1 Line4 Distance Speed Collect Result_Dam']
df_test['Stage2_Line_sum_Speed_Dam'] = df_test['Stage2 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage2 Line4 Distance Speed Collect Result_Dam']
df_test['Stage3_Line_sum_Speed_Dam'] = df_test['Stage3 Line1 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line2 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line3 Distance Speed Collect Result_Dam'] + df_test['Stage3 Line4 Distance Speed Collect Result_Dam']

##Diff from Circle to Line
df_train['Abs_speed_Stage1_Dam'] = df_train['Stage1_Circle_sum_Speed_Dam'] - df_train['Stage1_Line_sum_Speed_Dam']
df_train['Abs_speed_Stage2_Dam'] = df_train['Stage2_Circle_sum_Speed_Dam'] - df_train['Stage2_Line_sum_Speed_Dam']
df_train['Abs_speed_Stage3_Dam'] = df_train['Stage3_Circle_sum_Speed_Dam'] - df_train['Stage3_Line_sum_Speed_Dam']
df_test['Abs_speed_Stage1_Dam'] = df_test['Stage1_Circle_sum_Speed_Dam'] - df_test['Stage1_Line_sum_Speed_Dam']
df_test['Abs_speed_Stage2_Dam'] = df_test['Stage2_Circle_sum_Speed_Dam'] - df_test['Stage2_Line_sum_Speed_Dam']
df_test['Abs_speed_Stage3_Dam'] = df_test['Stage3_Circle_sum_Speed_Dam'] - df_test['Stage3_Line_sum_Speed_Dam']

In [105]:
#Tact Time & Discharged Resin
##Diff from Tact Time to Discharged Time
df_train['Abs_Tact_discharged_time_Dam'] = df_train['Machine Tact time Collect Result_Dam'] - df_train['TotalTime_Discharged_Resin_Dam']
df_test['Abs_Tact_discharged_time_Dam'] = df_test['Machine Tact time Collect Result_Dam'] - df_test['TotalTime_Discharged_Resin_Dam']

In [106]:
#Thickness(Dam)
##Diff Thickness
df_train['THICKNESS_abs_Dam_1'] = df_train['THICKNESS 1 Collect Result_Dam'] - df_train['THICKNESS 2 Collect Result_Dam']
df_train['THICKNESS_abs_Dam_2'] = df_train['THICKNESS 1 Collect Result_Dam'] - df_train['THICKNESS 3 Collect Result_Dam']
df_train['THICKNESS_abs_Dam_3'] = df_train['THICKNESS 2 Collect Result_Dam'] - df_train['THICKNESS 3 Collect Result_Dam']
df_test['THICKNESS_abs_Dam_1'] = df_test['THICKNESS 1 Collect Result_Dam'] - df_test['THICKNESS 2 Collect Result_Dam']
df_test['THICKNESS_abs_Dam_2'] = df_test['THICKNESS 1 Collect Result_Dam'] - df_test['THICKNESS 3 Collect Result_Dam']
df_test['THICKNESS_abs_Dam_3'] = df_test['THICKNESS 2 Collect Result_Dam'] - df_test['THICKNESS 3 Collect Result_Dam']

### AutoClave Features

In [107]:
#탈포 압력 값 정제
df_train['AutoClave_1st_Pressure_str'] = df_train['1st Pressure 1st Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_train['1st Pressure Unit Time_AutoClave_new'] = df_train['1st Pressure 1st Pressure Unit Time_AutoClave'].round(-1)
df_train['AutoClave_2nd_Pressure_str'] = df_train['2nd Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_train['2nd Pressure Unit Time_AutoClave_new'] = df_train['2nd Pressure Unit Time_AutoClave'].round(-1)
df_train['AutoClave_3rd_Pressure_str'] = df_train['3rd Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_train['3rd Pressure Unit Time_AutoClave_new'] = df_train['3rd Pressure Unit Time_AutoClave'].round(-1)
df_test['AutoClave_1st_Pressure_str'] = df_test['1st Pressure 1st Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_test['1st Pressure Unit Time_AutoClave_new'] = df_test['1st Pressure 1st Pressure Unit Time_AutoClave'].round(-1)
df_test['AutoClave_2nd_Pressure_str'] = df_test['2nd Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_test['2nd Pressure Unit Time_AutoClave_new'] = df_test['2nd Pressure Unit Time_AutoClave'].round(-1)
df_test['AutoClave_3rd_Pressure_str'] = df_test['3rd Pressure Unit Time_AutoClave'].astype(str).str[-1].astype('float')
df_test['3rd Pressure Unit Time_AutoClave_new'] = df_test['3rd Pressure Unit Time_AutoClave'].round(-1)

##Pressure str sum
df_train['AutoClave_Pressure_str_Sum']= df_train['AutoClave_1st_Pressure_str'] + df_train['AutoClave_2nd_Pressure_str'] + df_train['AutoClave_3rd_Pressure_str']
df_test['AutoClave_Pressure_str_Sum']= df_test['AutoClave_1st_Pressure_str'] + df_test['AutoClave_2nd_Pressure_str'] + df_test['AutoClave_3rd_Pressure_str']

In [108]:
#탈포 압력 & 시간
##Slope of Pressure-Time Curve
df_train['Pressure_Time_slope1_AutoClave'] = df_train['1st Pressure Collect Result_AutoClave'] / (df_train['1st Pressure 1st Pressure Unit Time_AutoClave'] + 1)
df_train['Pressure_Time_slope2_AutoClave'] = (df_train['2nd Pressure Collect Result_AutoClave'] - df_train['1st Pressure Collect Result_AutoClave']) / (df_train['2nd Pressure Unit Time_AutoClave'] + 1)
df_train['Pressure_Time_slope3_AutoClave'] = (df_train['3rd Pressure Collect Result_AutoClave'] - df_train['2nd Pressure Collect Result_AutoClave']) / (df_train['3rd Pressure Unit Time_AutoClave'] + 1)
df_test['Pressure_Time_slope1_AutoClave'] = df_test['1st Pressure Collect Result_AutoClave'] / (df_test['1st Pressure 1st Pressure Unit Time_AutoClave'] + 1)
df_test['Pressure_Time_slope2_AutoClave'] = (df_test['2nd Pressure Collect Result_AutoClave'] - df_test['1st Pressure Collect Result_AutoClave']) / (df_test['2nd Pressure Unit Time_AutoClave'] + 1)
df_test['Pressure_Time_slope3_AutoClave'] = (df_test['3rd Pressure Collect Result_AutoClave'] - df_test['2nd Pressure Collect Result_AutoClave']) / (df_test['3rd Pressure Unit Time_AutoClave'] + 1)

##Total Pressure Change
df_train['Total_Pressure_change_AutoClave'] = df_train['3rd Pressure Collect Result_AutoClave'] - df_train['1st Pressure Collect Result_AutoClave']
df_test['Total_Pressure_change_AutoClave'] = df_test['3rd Pressure Collect Result_AutoClave'] - df_test['1st Pressure Collect Result_AutoClave']

##Total Pressure
df_train['Total_Pressure_AutoClave'] = df_train['1st Pressure Collect Result_AutoClave'] + df_train['2nd Pressure Collect Result_AutoClave'] + df_train['3rd Pressure Collect Result_AutoClave'] 
df_test['Total_Pressure_AutoClave'] = df_test['1st Pressure Collect Result_AutoClave'] + df_test['2nd Pressure Collect Result_AutoClave'] + df_test['3rd Pressure Collect Result_AutoClave'] 

##Average Pressure
df_train['Average_Pressure_AutoClave'] = (df_train['1st Pressure Collect Result_AutoClave'] + df_train['2nd Pressure Collect Result_AutoClave'] + df_train['3rd Pressure Collect Result_AutoClave']) / 3
df_test['Average_Pressure_AutoClave'] = (df_test['1st Pressure Collect Result_AutoClave'] + df_test['2nd Pressure Collect Result_AutoClave'] + df_test['3rd Pressure Collect Result_AutoClave']) / 3

##Totel Pressure Time
df_train['Total_pressure_time_AutoClave'] = df_train['1st Pressure 1st Pressure Unit Time_AutoClave'] + df_train['2nd Pressure Unit Time_AutoClave'] + df_train['3rd Pressure Unit Time_AutoClave']
df_test['Total_pressure_time_AutoClave'] = df_test['1st Pressure 1st Pressure Unit Time_AutoClave'] + df_test['2nd Pressure Unit Time_AutoClave'] + df_test['3rd Pressure Unit Time_AutoClave']

##Area Under Pressure Time Curve
pressure_values = df_train[['1st Pressure Collect Result_AutoClave','2nd Pressure Collect Result_AutoClave','3rd Pressure Collect Result_AutoClave']]
df_train['Area_under_pressure_time_AutoClave'] = np.trapz(pressure_values, axis = 1)
pressure_values = df_test[['1st Pressure Collect Result_AutoClave','2nd Pressure Collect Result_AutoClave','3rd Pressure Collect Result_AutoClave']]
df_test['Area_under_pressure_time_AutoClave'] = np.trapz(pressure_values, axis = 1)

##Diff Time from Chamber Temp to Total Pressure
df_train['Time_AutoClave_else'] = df_train['Chamber Temp. Unit Time_AutoClave'] - df_train['Total_pressure_time_AutoClave']
df_test['Time_AutoClave_else'] = df_test['Chamber Temp. Unit Time_AutoClave'] - df_test['Total_pressure_time_AutoClave']

##Multiply Pressure and Time
df_train['1st Pressure_multi_time'] = df_train['1st Pressure Collect Result_AutoClave']  *  df_train['1st Pressure 1st Pressure Unit Time_AutoClave']
df_train['2nd Pressure_multi_time'] = df_train['2nd Pressure Collect Result_AutoClave'] * df_train['2nd Pressure Unit Time_AutoClave']
df_train['3rd Pressure_multi_time'] = df_train['3rd Pressure Collect Result_AutoClave'] * df_train['3rd Pressure Unit Time_AutoClave']
df_train['Total_Pressure_bytime'] = df_train['1st Pressure_multi_time'] + df_train['2nd Pressure_multi_time'] + df_train['3rd Pressure_multi_time']
df_test['1st Pressure_multi_time'] = df_test['1st Pressure Collect Result_AutoClave'] * df_test['1st Pressure 1st Pressure Unit Time_AutoClave']
df_test['2nd Pressure_multi_time'] = df_test['2nd Pressure Collect Result_AutoClave'] * df_test['2nd Pressure Unit Time_AutoClave']
df_test['3rd Pressure_multi_time'] = df_test['3rd Pressure Collect Result_AutoClave'] * df_test['3rd Pressure Unit Time_AutoClave']
df_test['Total_Pressure_bytime'] = df_test['1st Pressure_multi_time'] + df_test['2nd Pressure_multi_time'] + df_test['3rd Pressure_multi_time']

##PV = NRT
df_train['Auto_vol_1_AutoClave'] = (df_train['Chamber Temp. Collect Result_AutoClave'] + 273) / df_train['1st Pressure Collect Result_AutoClave']
df_train['Auto_vol_2_AutoClave'] = (df_train['Chamber Temp. Collect Result_AutoClave'] + 273) / df_train['2nd Pressure Collect Result_AutoClave']
df_train['Auto_vol_3_AutoClave'] = (df_train['Chamber Temp. Collect Result_AutoClave'] + 273) / df_train['3rd Pressure Collect Result_AutoClave']
df_test['Auto_vol_1_AutoClave'] = (df_test['Chamber Temp. Collect Result_AutoClave'] + 273) / df_test['1st Pressure Collect Result_AutoClave']
df_test['Auto_vol_2_AutoClave'] = (df_test['Chamber Temp. Collect Result_AutoClave'] + 273) / df_test['2nd Pressure Collect Result_AutoClave']
df_test['Auto_vol_3_AutoClave'] = (df_test['Chamber Temp. Collect Result_AutoClave'] + 273) / df_test['3rd Pressure Collect Result_AutoClave']

##Absolute Chamber Temp
df_train['K_TEMP_AutoClave'] = df_train['Chamber Temp. Collect Result_AutoClave'] + 273
df_test['K_TEMP_AutoClave'] = df_test['Chamber Temp. Collect Result_AutoClave'] + 273

##Arrhenius
df_train['Arrhenius_AutoClave'] = np.exp(1 / (df_train['Chamber Temp. Collect Result_AutoClave'] + 273) - 1 / 373.15)
df_test['Arrhenius_AutoClave'] = np.exp(1 / (df_test['Chamber Temp. Collect Result_AutoClave'] + 273) - 1 / 373.15)

### Fill1 Features

#### Signal combination

In [109]:
#DISCHARGED RESIN(Fill1)
##Resin 토출기 이동거리
df_train['Discharged_Resin_distance_1_Fill1'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_train['Discharged_Resin_distance_2_Fill1'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_train['Discharged_Resin_distance_3_Fill1'] = df_train['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']
df_test['Discharged_Resin_distance_1_Fill1'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_test['Discharged_Resin_distance_2_Fill1'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_test['Discharged_Resin_distance_3_Fill1'] = df_test['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] * df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

##Resin 토출 총 소요 시간
df_train['TotalTime_Discharged_Resin_Fill1'] = df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'] + df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] + df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']
df_test['TotalTime_Discharged_Resin_Fill1'] = df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'] + df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] + df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

In [110]:
#DISPENSE(Fill1)
##Dispensor 크기 변화
df_train['Dispense_volume_change1_Fill1'] = df_train['Dispense Volume(Stage2) Collect Result_Fill1'] - df_train['Dispense Volume(Stage1) Collect Result_Fill1']
df_train['Dispense_volume_change2_Fill1'] = df_train['Dispense Volume(Stage3) Collect Result_Fill1'] - df_train['Dispense Volume(Stage2) Collect Result_Fill1']
df_test['Dispense_volume_change1_Fill1'] = df_test['Dispense Volume(Stage2) Collect Result_Fill1'] - df_test['Dispense Volume(Stage1) Collect Result_Fill1']
df_test['Dispense_volume_change2_Fill1'] = df_test['Dispense Volume(Stage3) Collect Result_Fill1'] - df_test['Dispense Volume(Stage2) Collect Result_Fill1']

##Dispensor 크기 변화량
df_train['Dispense_volume_change1_abs_Fill1'] = df_train['Dispense_volume_change1_Fill1'].abs()
df_train['Dispense_volume_change2_abs_Fill1'] = df_train['Dispense_volume_change2_Fill1'].abs()
df_test['Dispense_volume_change1_abs_Fill1'] = df_test['Dispense_volume_change1_Fill1'].abs()
df_test['Dispense_volume_change2_abs_Fill1'] = df_test['Dispense_volume_change2_Fill1'].abs()

In [111]:
#DISCHARGED RESIN & DISPENSE (Fill1)
##도포된 Resin 양
df_train['Total_Resin_volume1_Fill1'] = df_train['Discharged_Resin_distance_1_Fill1'] * df_train['Dispense Volume(Stage1) Collect Result_Fill1']
df_train['Total_Resin_volume2_Fill1'] = df_train['Discharged_Resin_distance_2_Fill1'] * df_train['Dispense Volume(Stage2) Collect Result_Fill1']
df_train['Total_Resin_volume3_Fill1'] = df_train['Discharged_Resin_distance_3_Fill1'] * df_train['Dispense Volume(Stage3) Collect Result_Fill1']
df_test['Total_Resin_volume1_Fill1'] = df_test['Discharged_Resin_distance_1_Fill1'] * df_test['Dispense Volume(Stage1) Collect Result_Fill1']
df_test['Total_Resin_volume2_Fill1'] = df_test['Discharged_Resin_distance_2_Fill1'] * df_test['Dispense Volume(Stage2) Collect Result_Fill1']
df_test['Total_Resin_volume3_Fill1'] = df_test['Discharged_Resin_distance_3_Fill1'] * df_test['Dispense Volume(Stage3) Collect Result_Fill1']

# 시간당 토출량
df_train['Stage1_Fill1_Volume_Speed'] = df_train['Dispense Volume(Stage1) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_train['Stage2_Fill1_Volume_Speed'] = df_train['Dispense Volume(Stage2) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_train['Stage3_Fill1_Volume_Speed'] = df_train['Dispense Volume(Stage3) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']
df_test['Stage1_Fill1_Volume_Speed'] = df_test['Dispense Volume(Stage1) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_test['Stage2_Fill1_Volume_Speed'] = df_test['Dispense Volume(Stage2) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_test['Stage3_Fill1_Volume_Speed'] = df_test['Dispense Volume(Stage3) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

In [112]:
#HEAD(Fill1)
##Fill1 X,Y,Z
df_train['Fill1_X_1'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_train['Fill1_X_2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']
df_train['Fill1_X_3'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']
df_test['Fill1_X_1'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1']
df_test['Fill1_X_2'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']
df_test['Fill1_X_3'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1']

df_train['Fill1_Y_1'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
df_train['Fill1_Y_2'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']
df_train['Fill1_Y_3'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']
df_test['Fill1_Y_1'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1']
df_test['Fill1_Y_2'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']
df_test['Fill1_Y_3'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1']

df_train['Fill1_dist_1'] = np.sqrt(df_train['Fill1_X_1']**2 +df_train['Fill1_Y_1']**2)
df_train['Fill1_dist_2'] = np.sqrt(df_train['Fill1_X_2']**2 +df_train['Fill1_Y_2']**2)
df_train['Fill1_dist_3'] = np.sqrt(df_train['Fill1_X_3']**2 +df_train['Fill1_Y_3']**2)
df_test['Fill1_dist_1'] = np.sqrt(df_test['Fill1_X_1']**2 +df_test['Fill1_Y_1']**2)
df_test['Fill1_dist_2'] = np.sqrt(df_test['Fill1_X_2']**2 +df_test['Fill1_Y_2']**2)
df_test['Fill1_dist_3'] = np.sqrt(df_test['Fill1_X_3']**2 +df_test['Fill1_Y_3']**2)

df_train['Vector_Fill1_Stage1']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'])
df_train['Vector_Fill1_Stage2']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'])
df_train['Vector_Fill1_Stage3']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'])

df_train['Angle_x_Fill1_Stage1'],df_train['Angle_y_Fill1_Stage1'],df_train['Angle_z_Fill1_Stage1'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'])
df_train['Angle_x_Fill1_Stage2'] ,df_train['Angle_y_Fill1_Stage2'],df_train['Angle_z_Fill1_Stage2'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'])
df_train['Angle_x_Fill1_Stage3'] ,df_train['Angle_y_Fill1_Stage3'],df_train['Angle_z_Fill1_Stage3'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'])
df_test['Vector_Fill1_Stage1']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'])
df_test['Vector_Fill1_Stage2']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'])
df_test['Vector_Fill1_Stage3']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'])

df_test['Angle_x_Fill1_Stage1'],df_test['Angle_y_Fill1_Stage1'],df_test['Angle_z_Fill1_Stage1'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1'])
df_test['Angle_x_Fill1_Stage2'] ,df_test['Angle_y_Fill1_Stage2'],df_test['Angle_z_Fill1_Stage2'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1'])
df_test['Angle_x_Fill1_Stage3'] ,df_test['Angle_y_Fill1_Stage3'],df_test['Angle_z_Fill1_Stage3'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1'])

In [113]:
#Tact Time & Discharged Resin
##Diff from Tact Time to Discharged Time
df_train['Abs_Tact_discharged_time_Fill1'] = df_train['Machine Tact time Collect Result_Fill1'] - df_train['TotalTime_Discharged_Resin_Fill1']
df_test['Abs_Tact_discharged_time_Fill1'] = df_test['Machine Tact time Collect Result_Fill1'] - df_test['TotalTime_Discharged_Resin_Fill1']

### Fill2 Features

#### Signal combination

In [114]:
#CURE(Fill2)
##CURE 가동 방향 및 가동높이
df_train['Cure_HD_Fill2'] = df_train['CURE END POSITION X Collect Result_Fill2'] - df_train['CURE START POSITION X Collect Result_Fill2']
df_test['Cure_HD_Fill2'] = df_test['CURE END POSITION X Collect Result_Fill2'] - df_test['CURE START POSITION X Collect Result_Fill2']

##CURE 가동 시간
df_train['Cure_time_Fill2'] = df_train['Cure_HD_Fill2'] / df_train['CURE SPEED Collect Result_Fill2']
df_test['Cure_time_Fill2'] = df_test['Cure_HD_Fill2'] / df_test['CURE SPEED Collect Result_Fill2']

##CURE position category
df_train['Cure_position_category_Fill2'] = df_train['CURE END POSITION X Collect Result_Fill2'].map({240:'Down',1020:'Up'}).astype(str)
df_test['Cure_position_category_Fill2'] = df_test['CURE END POSITION X Collect Result_Fill2'].map({240:'Down',1020:'Up'}).astype(str)

In [115]:
#HEAD(Fill2)
# Fill2 X,Y,Z
df_train['Fill2_X_1'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2']
df_train['Fill2_X_2'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2']
df_train['Fill2_X_3'] = df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2']
df_test['Fill2_X_1'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2']
df_test['Fill2_X_2'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2']
df_test['Fill2_X_3'] = df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2']

df_train['Fill2_Y_1'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2']
df_train['Fill2_Y_2'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2']
df_train['Fill2_Y_3'] = df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'] - df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2']
df_test['Fill2_Y_1'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2']
df_test['Fill2_Y_2'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2']
df_test['Fill2_Y_3'] = df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'] - df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2']

df_train['Fill2_dist_1'] = np.sqrt(df_train['Fill2_X_1']**2 +df_train['Fill2_Y_1']**2)
df_train['Fill2_dist_2'] = np.sqrt(df_train['Fill2_X_2']**2 +df_train['Fill2_Y_2']**2)
df_train['Fill2_dist_3'] = np.sqrt(df_train['Fill2_X_3']**2 +df_train['Fill2_Y_3']**2)
df_test['Fill2_dist_1'] = np.sqrt(df_test['Fill2_X_1']**2 +df_test['Fill2_Y_1']**2)
df_test['Fill2_dist_2'] = np.sqrt(df_test['Fill2_X_2']**2 +df_test['Fill2_Y_2']**2)
df_test['Fill2_dist_3'] = np.sqrt(df_test['Fill2_X_3']**2 +df_test['Fill2_Y_3']**2)

df_train['Vector_Fill2_Stage1']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'])
df_train['Vector_Fill2_Stage2']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'])
df_train['Vector_Fill2_Stage3']= calculate_magnitude(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'])
df_test['Vector_Fill2_Stage1']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'])
df_test['Vector_Fill2_Stage2']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'])
df_test['Vector_Fill2_Stage3']= calculate_magnitude(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'])

df_train['Angle_x_Fill2_Stage1'],df_train['Angle_y_Fill2_Stage1'],df_train['Angle_z_Fill2_Stage1'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'])
df_train['Angle_x_Fill2_Stage2'] ,df_train['Angle_y_Fill2_Stage2'],df_train['Angle_z_Fill2_Stage2'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'])
df_train['Angle_x_Fill2_Stage3'] ,df_train['Angle_y_Fill2_Stage3'],df_train['Angle_z_Fill2_Stage3'] = calculate_angles(df_train['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'],
                                                   df_train['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'],
                                                    df_train['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'])
df_test['Angle_x_Fill2_Stage1'],df_test['Angle_y_Fill2_Stage1'],df_test['Angle_z_Fill2_Stage1'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2'])
df_test['Angle_x_Fill2_Stage2'] ,df_test['Angle_y_Fill2_Stage2'],df_test['Angle_z_Fill2_Stage2'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2'])
df_test['Angle_x_Fill2_Stage3'] ,df_test['Angle_y_Fill2_Stage3'],df_test['Angle_z_Fill2_Stage3'] = calculate_angles(df_test['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2'],
                                                   df_test['HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2'],
                                                    df_test['HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2'])

### Total Features

In [116]:
#Sum Time
##Tact Time 
df_train['Machine Tact time Collect Result_Total'] = df_train['Machine Tact time Collect Result_Dam'] + df_train['Machine Tact time Collect Result_Fill1'] + df_train['Machine Tact time Collect Result_Fill2']
df_test['Machine Tact time Collect Result_Total'] = df_test['Machine Tact time Collect Result_Dam'] + df_test['Machine Tact time Collect Result_Fill1'] + df_test['Machine Tact time Collect Result_Fill2']

##Sum Tact and Chamber 
df_train['Total_time'] = df_train['Chamber Temp. Unit Time_AutoClave'] + df_train['Machine Tact time Collect Result_Dam'] + df_train['Machine Tact time Collect Result_Fill1'] + df_train['Machine Tact time Collect Result_Fill2']
df_test['Total_time'] = df_test['Chamber Temp. Unit Time_AutoClave'] + df_test['Machine Tact time Collect Result_Dam'] + df_test['Machine Tact time Collect Result_Fill1'] + df_test['Machine Tact time Collect Result_Fill2']

In [117]:
df_train['Q_1'] = df_train['Dispense Volume(Stage1) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_train['Q_2'] = df_train['Dispense Volume(Stage2) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_train['Q_3'] = df_train['Dispense Volume(Stage3) Collect Result_Dam'] / df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

df_train['Q_4'] = df_train['Dispense Volume(Stage1) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_train['Q_5'] = df_train['Dispense Volume(Stage2) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_train['Q_6'] = df_train['Dispense Volume(Stage3) Collect Result_Fill1'] / df_train['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

df_test['Q_1'] = df_test['Dispense Volume(Stage1) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam']
df_test['Q_2'] = df_test['Dispense Volume(Stage2) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam']
df_test['Q_3'] = df_test['Dispense Volume(Stage3) Collect Result_Dam'] / df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam']

df_test['Q_4'] = df_test['Dispense Volume(Stage1) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1']
df_test['Q_5'] = df_test['Dispense Volume(Stage2) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1']
df_test['Q_6'] = df_test['Dispense Volume(Stage3) Collect Result_Fill1'] / df_test['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1']

In [118]:
df_train['Pressure/Time_Stage1'] = df_train['1st Pressure Collect Result_AutoClave'] / df_train['1st Pressure 1st Pressure Unit Time_AutoClave']
df_train['Pressure/Time_Stage2'] = df_train['2nd Pressure Collect Result_AutoClave'] / df_train['2nd Pressure Unit Time_AutoClave']
df_train['Pressure/Time_Stage3'] = df_train['3rd Pressure Collect Result_AutoClave'] / df_train['3rd Pressure Unit Time_AutoClave']

df_train['np_1'] = df_train['Q_1'] * df_train['1st Pressure 1st Pressure Unit Time_AutoClave']
df_train['np_2'] = df_train['Q_2'] * df_train['2nd Pressure Unit Time_AutoClave']
df_train['np_3'] = df_train['Q_3'] * df_train['3rd Pressure Unit Time_AutoClave']

df_train['두께_유량_1'] = df_train['THICKNESS 1 Collect Result_Dam'] / df_train['Q_1']
df_train['두께_유량_2'] = df_train['THICKNESS 2 Collect Result_Dam'] / df_train['Q_2']
df_train['두께_유량_3'] = df_train['THICKNESS 3 Collect Result_Dam'] / df_train['Q_3']

df_train['부피_유량_1'] = df_train['Dispense Volume(Stage1) Collect Result_Dam'] / df_train['THICKNESS 1 Collect Result_Dam']
df_train['부피_유량_2'] = df_train['Dispense Volume(Stage2) Collect Result_Dam'] / df_train['THICKNESS 2 Collect Result_Dam']
df_train['부피_유량_3'] = df_train['Dispense Volume(Stage3) Collect Result_Dam'] / df_train['THICKNESS 3 Collect Result_Dam']

df_test['Pressure/Time_Stage1'] = df_test['1st Pressure Collect Result_AutoClave'] / df_test['1st Pressure 1st Pressure Unit Time_AutoClave']
df_test['Pressure/Time_Stage2'] = df_test['2nd Pressure Collect Result_AutoClave'] / df_test['2nd Pressure Unit Time_AutoClave']
df_test['Pressure/Time_Stage3'] = df_test['3rd Pressure Collect Result_AutoClave'] / df_test['3rd Pressure Unit Time_AutoClave']

df_test['np_1'] = df_test['Q_1'] * df_test['1st Pressure 1st Pressure Unit Time_AutoClave']
df_test['np_2'] = df_test['Q_2'] * df_test['2nd Pressure Unit Time_AutoClave']
df_test['np_3'] = df_test['Q_3'] * df_test['3rd Pressure Unit Time_AutoClave']

df_test['두께_유량_1'] = df_test['THICKNESS 1 Collect Result_Dam'] / df_test['Q_1']
df_test['두께_유량_2'] = df_test['THICKNESS 2 Collect Result_Dam'] / df_test['Q_2']
df_test['두께_유량_3'] = df_test['THICKNESS 3 Collect Result_Dam'] / df_test['Q_3']

df_test['부피_유량_1'] = df_test['Dispense Volume(Stage1) Collect Result_Dam'] / df_test['THICKNESS 1 Collect Result_Dam']
df_test['부피_유량_2'] = df_test['Dispense Volume(Stage2) Collect Result_Dam'] / df_test['THICKNESS 2 Collect Result_Dam']
df_test['부피_유량_3'] = df_test['Dispense Volume(Stage3) Collect Result_Dam'] / df_test['THICKNESS 3 Collect Result_Dam']

In [119]:
df_train['volume_sum_stage_1']=df_train['Dispense Volume(Stage1) Collect Result_Dam'] + df_train['Dispense Volume(Stage1) Collect Result_Fill1']
df_train['volume_sum_stage_2']=df_train['Dispense Volume(Stage2) Collect Result_Dam'] + df_train['Dispense Volume(Stage2) Collect Result_Fill1']
df_train['volume_sum_stage_3']=df_train['Dispense Volume(Stage3) Collect Result_Dam'] + df_train['Dispense Volume(Stage3) Collect Result_Fill1']

df_train['pv_1'] = df_train['volume_sum_stage_1'] * df_train['1st Pressure Collect Result_AutoClave']
df_train['pv_2'] = df_train['volume_sum_stage_2'] * df_train['2nd Pressure Collect Result_AutoClave']
df_train['pv_3'] = df_train['volume_sum_stage_3'] * df_train['3rd Pressure Collect Result_AutoClave']

df_train['pvt_1'] = df_train['volume_sum_stage_1'] * df_train['1st Pressure Collect Result_AutoClave'] * df_train['1st Pressure 1st Pressure Unit Time_AutoClave']
df_train['pvt_2'] = df_train['volume_sum_stage_2'] * df_train['2nd Pressure Collect Result_AutoClave'] * df_train['2nd Pressure Unit Time_AutoClave']
df_train['pvt_3'] = df_train['volume_sum_stage_3'] * df_train['3rd Pressure Collect Result_AutoClave'] * df_train['3rd Pressure Unit Time_AutoClave']

df_train['speed_1'] = df_train['Stage1 Line1 Distance Speed Collect Result_Dam'] / df_train['Stage1 Circle1 Distance Speed Collect Result_Dam']
df_train['speed_2'] = df_train['Stage1 Line2 Distance Speed Collect Result_Dam'] / df_train['Stage1 Circle2 Distance Speed Collect Result_Dam']
df_train['speed_3'] = df_train['Stage1 Line3 Distance Speed Collect Result_Dam'] / df_train['Stage1 Circle3 Distance Speed Collect Result_Dam']
df_train['speed_4'] = df_train['Stage1 Line4 Distance Speed Collect Result_Dam'] / df_train['Stage1 Circle4 Distance Speed Collect Result_Dam']

df_train['speed_5'] = df_train['Stage2 Line1 Distance Speed Collect Result_Dam'] / df_train['Stage2 Circle1 Distance Speed Collect Result_Dam']
df_train['speed_6'] = df_train['Stage2 Line2 Distance Speed Collect Result_Dam'] / df_train['Stage2 Circle2 Distance Speed Collect Result_Dam']
df_train['speed_7'] = df_train['Stage2 Line3 Distance Speed Collect Result_Dam'] / df_train['Stage2 Circle3 Distance Speed Collect Result_Dam']
df_train['speed_8'] = df_train['Stage2 Line4 Distance Speed Collect Result_Dam'] / df_train['Stage2 Circle4 Distance Speed Collect Result_Dam']

df_train['speed_9'] = df_train['Stage3 Line1 Distance Speed Collect Result_Dam'] / df_train['Stage3 Circle1 Distance Speed Collect Result_Dam']
df_train['speed_10'] = df_train['Stage3 Line2 Distance Speed Collect Result_Dam'] / df_train['Stage3 Circle2 Distance Speed Collect Result_Dam']
df_train['speed_11'] = df_train['Stage3 Line3 Distance Speed Collect Result_Dam'] / df_train['Stage3 Circle3 Distance Speed Collect Result_Dam']
df_train['speed_12'] = df_train['Stage3 Line4 Distance Speed Collect Result_Dam'] / df_train['Stage3 Circle4 Distance Speed Collect Result_Dam']

df_test['volume_sum_stage_1']=df_test['Dispense Volume(Stage1) Collect Result_Dam'] + df_test['Dispense Volume(Stage1) Collect Result_Fill1']
df_test['volume_sum_stage_2']=df_test['Dispense Volume(Stage2) Collect Result_Dam'] + df_test['Dispense Volume(Stage2) Collect Result_Fill1']
df_test['volume_sum_stage_3']=df_test['Dispense Volume(Stage3) Collect Result_Dam'] + df_test['Dispense Volume(Stage3) Collect Result_Fill1']

df_test['pv_1'] = df_test['volume_sum_stage_1'] * df_test['1st Pressure Collect Result_AutoClave']
df_test['pv_2'] = df_test['volume_sum_stage_2'] * df_test['2nd Pressure Collect Result_AutoClave']
df_test['pv_3'] = df_test['volume_sum_stage_3'] * df_test['3rd Pressure Collect Result_AutoClave']

df_test['pvt_1'] = df_test['volume_sum_stage_1'] * df_test['1st Pressure Collect Result_AutoClave'] * df_test['1st Pressure 1st Pressure Unit Time_AutoClave']
df_test['pvt_2'] = df_test['volume_sum_stage_2'] * df_test['2nd Pressure Collect Result_AutoClave'] * df_test['2nd Pressure Unit Time_AutoClave']
df_test['pvt_3'] = df_test['volume_sum_stage_3'] * df_test['3rd Pressure Collect Result_AutoClave'] * df_test['3rd Pressure Unit Time_AutoClave']

df_test['speed_1'] = df_test['Stage1 Line1 Distance Speed Collect Result_Dam'] / df_test['Stage1 Circle1 Distance Speed Collect Result_Dam']
df_test['speed_2'] = df_test['Stage1 Line2 Distance Speed Collect Result_Dam'] / df_test['Stage1 Circle2 Distance Speed Collect Result_Dam']
df_test['speed_3'] = df_test['Stage1 Line3 Distance Speed Collect Result_Dam'] / df_test['Stage1 Circle3 Distance Speed Collect Result_Dam']
df_test['speed_4'] = df_test['Stage1 Line4 Distance Speed Collect Result_Dam'] / df_test['Stage1 Circle4 Distance Speed Collect Result_Dam']

df_test['speed_5'] = df_test['Stage2 Line1 Distance Speed Collect Result_Dam'] / df_test['Stage2 Circle1 Distance Speed Collect Result_Dam']
df_test['speed_6'] = df_test['Stage2 Line2 Distance Speed Collect Result_Dam'] / df_test['Stage2 Circle2 Distance Speed Collect Result_Dam']
df_test['speed_7'] = df_test['Stage2 Line3 Distance Speed Collect Result_Dam'] / df_test['Stage2 Circle3 Distance Speed Collect Result_Dam']
df_test['speed_8'] = df_test['Stage2 Line4 Distance Speed Collect Result_Dam'] / df_test['Stage2 Circle4 Distance Speed Collect Result_Dam']

df_test['speed_9'] = df_test['Stage3 Line1 Distance Speed Collect Result_Dam'] / df_test['Stage3 Circle1 Distance Speed Collect Result_Dam']
df_test['speed_10'] = df_test['Stage3 Line2 Distance Speed Collect Result_Dam'] / df_test['Stage3 Circle2 Distance Speed Collect Result_Dam']
df_test['speed_11'] = df_test['Stage3 Line3 Distance Speed Collect Result_Dam'] / df_test['Stage3 Circle3 Distance Speed Collect Result_Dam']
df_test['speed_12'] = df_test['Stage3 Line4 Distance Speed Collect Result_Dam'] / df_test['Stage3 Circle4 Distance Speed Collect Result_Dam']

In [120]:
df_train['CURE_START_END_DISTANCE_Fill2_X'] = df_train['CURE START POSITION X Collect Result_Fill2'] - df_train['CURE END POSITION X Collect Result_Fill2']
df_train['CURE_STANDBY_END_DISTANCE_Fill2_Z'] = df_train['CURE STANDBY POSITION Z Collect Result_Fill2'] - df_train['CURE END POSITION Z Collect Result_Fill2']
df_train['CURE_TIME_X_Fill2'] = df_train['CURE_START_END_DISTANCE_Fill2_X'] / df_train['CURE SPEED Collect Result_Fill2']
df_train['CURE_TIME_Z_Fill2'] = df_train['CURE_STANDBY_END_DISTANCE_Fill2_Z'] / df_train['CURE SPEED Collect Result_Fill2']
df_train['CURE_START_END_DISTANCE_Dam_X'] = df_train['CURE START POSITION X Collect Result_Dam'] - df_train['CURE END POSITION X Collect Result_Dam']
df_train['CURE_TIME_X_Dam'] = df_train['CURE_START_END_DISTANCE_Dam_X'] / df_train['CURE SPEED Collect Result_Dam']

df_test['CURE_START_END_DISTANCE_Fill2_X'] = df_test['CURE START POSITION X Collect Result_Fill2'] - df_test['CURE END POSITION X Collect Result_Fill2']
df_test['CURE_STANDBY_END_DISTANCE_Fill2_Z'] = df_test['CURE STANDBY POSITION Z Collect Result_Fill2'] - df_test['CURE END POSITION Z Collect Result_Fill2']
df_test['CURE_TIME_X_Fill2'] = df_test['CURE_START_END_DISTANCE_Fill2_X'] / df_test['CURE SPEED Collect Result_Fill2']
df_test['CURE_TIME_Z_Fill2'] = df_test['CURE_STANDBY_END_DISTANCE_Fill2_Z'] / df_test['CURE SPEED Collect Result_Fill2']
df_test['CURE_START_END_DISTANCE_Dam_X'] = df_test['CURE START POSITION X Collect Result_Dam'] - df_test['CURE END POSITION X Collect Result_Dam']
df_test['CURE_TIME_X_Dam'] = df_test['CURE_START_END_DISTANCE_Dam_X'] / df_test['CURE SPEED Collect Result_Dam']

In [121]:
df_train_tt = df_train.copy()
df_test_tt = df_test.copy()

In [122]:
#Time Features
df_train['Collect Date_Dam'] = pd.to_datetime(df_train['Collect Date_Dam'])
df_train['Collect Date_Fill1'] = pd.to_datetime(df_train['Collect Date_Fill1'])
df_train['Collect Date_Fill2'] = pd.to_datetime(df_train['Collect Date_Fill2'])
df_train['Collect Date_AutoClave'] = pd.to_datetime(df_train['Collect Date_AutoClave'])

##Time diff
df_train['Dam_Fill1_Time_Diff'] = (df_train['Collect Date_Fill1'] - df_train['Collect Date_Dam']).dt.total_seconds()
df_train['Fill1_Fill2_Time_Diff'] = (df_train['Collect Date_Fill2'] - df_train['Collect Date_Fill1']).dt.total_seconds()
df_train['Fill2_Auto_Time_Diff'] = (df_train['Collect Date_AutoClave'] - df_train['Collect Date_Fill2']).dt.total_seconds()
df_train['Total_Time_Diff'] = (df_train['Collect Date_AutoClave'] - df_train['Collect Date_Dam']).dt.total_seconds()

##Fill1&Fill2 Match
df_train['Fill1_Fill2_match'] = (df_train['Collect Date_Fill1'] == df_train['Collect Date_Fill2']).astype(int)

##Time Transform
df_train['year_Dam'] = df_train['Collect Date_Dam'].dt.year
df_train['month_Dam'] = df_train['Collect Date_Dam'].dt.month
df_train['day_Dam'] = df_train['Collect Date_Dam'].dt.day
df_train['weekday_Dam'] = df_train['Collect Date_Dam'].dt.weekday
df_train['hour_Dam'] = df_train['Collect Date_Dam'].dt.hour
df_train['minute_Dam'] = df_train['Collect Date_Dam'].dt.minute
df_train['second_Dam'] = df_train['Collect Date_Dam'].dt.second

df_train['year_Fill1'] = df_train['Collect Date_Fill1'].dt.year
df_train['month_Fill1'] = df_train['Collect Date_Fill1'].dt.month
df_train['day_Fill1'] = df_train['Collect Date_Fill1'].dt.day
df_train['weekday_Fill1'] = df_train['Collect Date_Fill1'].dt.weekday
df_train['hour_Fill1'] = df_train['Collect Date_Fill1'].dt.hour
df_train['minute_Fill1'] = df_train['Collect Date_Fill1'].dt.minute
df_train['second_Fill1'] = df_train['Collect Date_Fill1'].dt.second

df_train['year_Fill2'] = df_train['Collect Date_Fill2'].dt.year
df_train['month_Fill2'] = df_train['Collect Date_Fill2'].dt.month
df_train['day_Fill2'] = df_train['Collect Date_Fill2'].dt.day
df_train['weekday_Fill2'] = df_train['Collect Date_Fill2'].dt.weekday
df_train['hour_Fill2'] = df_train['Collect Date_Fill2'].dt.hour
df_train['minute_Fill2'] = df_train['Collect Date_Fill2'].dt.minute
df_train['second_Fill2'] = df_train['Collect Date_Fill2'].dt.second

df_train['sin_hour_Dam'] = np.sin(2 * np.pi * df_train['hour_Dam']/23.0)
df_train['cos_hour_Dam'] = np.cos(2 * np.pi * df_train['hour_Dam']/23.0)
df_train['sin_minute_Dam'] = np.sin(2 * np.pi * df_train['minute_Dam']/59.0)
df_train['cos_minute_Dam'] = np.cos(2 * np.pi * df_train['minute_Dam']/59.0)
df_train['sin_date_Dam'] = -np.sin(2 * np.pi * (df_train['month_Dam']+df_train['day_Dam']/31)/12)
df_train['cos_date_Dam'] = -np.sin(2 * np.pi * (df_train['month_Dam']+df_train['day_Dam']/31)/12)
df_train['sin_month_Dam'] = -np.sin(2 * np.pi * df_train['month_Dam']/12.0)
df_train['cos_month_Dam'] = -np.cos(2 * np.pi * df_train['month_Dam']/12.0)

##Cycling Transform
df_train['sin_hour_Fill1'] = np.sin(2 * np.pi * df_train['hour_Fill1']/23.0)
df_train['cos_hour_Fill1'] = np.cos(2 * np.pi * df_train['hour_Fill1']/23.0)
df_train['sin_minute_Fill1'] = np.sin(2 * np.pi * df_train['minute_Fill1']/59.0)
df_train['cos_minute_Fill1'] = np.cos(2 * np.pi * df_train['minute_Fill1']/59.0)
df_train['sin_date_Fill1'] = -np.sin(2 * np.pi * (df_train['month_Fill1']+df_train['day_Fill1']/31)/12)
df_train['cos_date_Fill1'] = -np.sin(2 * np.pi * (df_train['month_Fill1']+df_train['day_Fill1']/31)/12)
df_train['sin_month_Fill1'] = -np.sin(2 * np.pi * df_train['month_Fill1']/12.0)
df_train['cos_month_Fill1'] = -np.cos(2 * np.pi * df_train['month_Fill1']/12.0)

df_train['sin_hour_Fill2'] = np.sin(2 * np.pi * df_train['hour_Fill2']/23.0)
df_train['cos_hour_Fill2'] = np.cos(2 * np.pi * df_train['hour_Fill2']/23.0)
df_train['sin_minute_Fill2'] = np.sin(2 * np.pi * df_train['minute_Fill2']/59.0)
df_train['cos_minute_Fill2'] = np.cos(2 * np.pi * df_train['minute_Fill2']/59.0)
df_train['sin_date_Fill2'] = -np.sin(2 * np.pi * (df_train['month_Fill2']+df_train['day_Fill2']/31)/12)
df_train['cos_date_Fill2'] = -np.sin(2 * np.pi * (df_train['month_Fill2']+df_train['day_Fill2']/31)/12)
df_train['sin_month_Fill2'] = -np.sin(2 * np.pi * df_train['month_Fill2']/12.0)
df_train['cos_month_Fill2'] = -np.cos(2 * np.pi * df_train['month_Fill2']/12.0)

#Time Features
df_test['Collect Date_Dam'] = pd.to_datetime(df_test['Collect Date_Dam'])
df_test['Collect Date_Fill1'] = pd.to_datetime(df_test['Collect Date_Fill1'])
df_test['Collect Date_Fill2'] = pd.to_datetime(df_test['Collect Date_Fill2'])
df_test['Collect Date_AutoClave'] = pd.to_datetime(df_test['Collect Date_AutoClave'])

##Time diff
df_test['Dam_Fill1_Time_Diff'] = (df_test['Collect Date_Fill1'] - df_test['Collect Date_Dam']).dt.total_seconds()
df_test['Fill1_Fill2_Time_Diff'] = (df_test['Collect Date_Fill2'] - df_test['Collect Date_Fill1']).dt.total_seconds()
df_test['Fill2_Auto_Time_Diff'] = (df_test['Collect Date_AutoClave'] - df_test['Collect Date_Fill2']).dt.total_seconds()
df_test['Total_Time_Diff'] = (df_test['Collect Date_AutoClave'] - df_test['Collect Date_Dam']).dt.total_seconds()

##Fill1&Fill2 Match
df_test['Fill1_Fill2_match'] = (df_test['Collect Date_Fill1'] == df_test['Collect Date_Fill2']).astype(int)

##Time Transform
df_test['year_Dam'] = df_test['Collect Date_Dam'].dt.year
df_test['month_Dam'] = df_test['Collect Date_Dam'].dt.month
df_test['day_Dam'] = df_test['Collect Date_Dam'].dt.day
df_test['weekday_Dam'] = df_test['Collect Date_Dam'].dt.weekday
df_test['hour_Dam'] = df_test['Collect Date_Dam'].dt.hour
df_test['minute_Dam'] = df_test['Collect Date_Dam'].dt.minute
df_test['second_Dam'] = df_test['Collect Date_Dam'].dt.second

df_test['year_Fill1'] = df_test['Collect Date_Fill1'].dt.year
df_test['month_Fill1'] = df_test['Collect Date_Fill1'].dt.month
df_test['day_Fill1'] = df_test['Collect Date_Fill1'].dt.day
df_test['weekday_Fill1'] = df_test['Collect Date_Fill1'].dt.weekday
df_test['hour_Fill1'] = df_test['Collect Date_Fill1'].dt.hour
df_test['minute_Fill1'] = df_test['Collect Date_Fill1'].dt.minute
df_test['second_Fill1'] = df_test['Collect Date_Fill1'].dt.second

df_test['year_Fill2'] = df_test['Collect Date_Fill2'].dt.year
df_test['month_Fill2'] = df_test['Collect Date_Fill2'].dt.month
df_test['day_Fill2'] = df_test['Collect Date_Fill2'].dt.day
df_test['weekday_Fill2'] = df_test['Collect Date_Fill2'].dt.weekday
df_test['hour_Fill2'] = df_test['Collect Date_Fill2'].dt.hour
df_test['minute_Fill2'] = df_test['Collect Date_Fill2'].dt.minute
df_test['second_Fill2'] = df_test['Collect Date_Fill2'].dt.second

df_test['sin_hour_Dam'] = np.sin(2 * np.pi * df_test['hour_Dam']/23.0)
df_test['cos_hour_Dam'] = np.cos(2 * np.pi * df_test['hour_Dam']/23.0)
df_test['sin_minute_Dam'] = np.sin(2 * np.pi * df_test['minute_Dam']/59.0)
df_test['cos_minute_Dam'] = np.cos(2 * np.pi * df_test['minute_Dam']/59.0)
df_test['sin_date_Dam'] = -np.sin(2 * np.pi * (df_test['month_Dam']+df_test['day_Dam']/31)/12)
df_test['cos_date_Dam'] = -np.sin(2 * np.pi * (df_test['month_Dam']+df_test['day_Dam']/31)/12)
df_test['sin_month_Dam'] = -np.sin(2 * np.pi * df_test['month_Dam']/12.0)
df_test['cos_month_Dam'] = -np.cos(2 * np.pi * df_test['month_Dam']/12.0)

##Cycling Transform
df_test['sin_hour_Fill1'] = np.sin(2 * np.pi * df_test['hour_Fill1']/23.0)
df_test['cos_hour_Fill1'] = np.cos(2 * np.pi * df_test['hour_Fill1']/23.0)
df_test['sin_minute_Fill1'] = np.sin(2 * np.pi * df_test['minute_Fill1']/59.0)
df_test['cos_minute_Fill1'] = np.cos(2 * np.pi * df_test['minute_Fill1']/59.0)
df_test['sin_date_Fill1'] = -np.sin(2 * np.pi * (df_test['month_Fill1']+df_test['day_Fill1']/31)/12)
df_test['cos_date_Fill1'] = -np.sin(2 * np.pi * (df_test['month_Fill1']+df_test['day_Fill1']/31)/12)
df_test['sin_month_Fill1'] = -np.sin(2 * np.pi * df_test['month_Fill1']/12.0)
df_test['cos_month_Fill1'] = -np.cos(2 * np.pi * df_test['month_Fill1']/12.0)

df_test['sin_hour_Fill2'] = np.sin(2 * np.pi * df_test['hour_Fill2']/23.0)
df_test['cos_hour_Fill2'] = np.cos(2 * np.pi * df_test['hour_Fill2']/23.0)
df_test['sin_minute_Fill2'] = np.sin(2 * np.pi * df_test['minute_Fill2']/59.0)
df_test['cos_minute_Fill2'] = np.cos(2 * np.pi * df_test['minute_Fill2']/59.0)
df_test['sin_date_Fill2'] = -np.sin(2 * np.pi * (df_test['month_Fill2']+df_test['day_Fill2']/31)/12)
df_test['cos_date_Fill2'] = -np.sin(2 * np.pi * (df_test['month_Fill2']+df_test['day_Fill2']/31)/12)
df_test['sin_month_Fill2'] = -np.sin(2 * np.pi * df_test['month_Fill2']/12.0)
df_test['cos_month_Fill2'] = -np.cos(2 * np.pi * df_test['month_Fill2']/12.0)

df_train.drop(columns = ['Collect Date_Dam','Collect Date_Fill1','Collect Date_Fill2','Collect Date_AutoClave'], inplace=True)
df_test.drop(columns = ['Collect Date_Dam','Collect Date_Fill1','Collect Date_Fill2','Collect Date_AutoClave'], inplace=True)

In [123]:
#중복열 제거
duplicated_col = set(df_train.columns) - set(df_train.loc[:,~df_train.T.duplicated()].columns)
duplicated_col = list(duplicated_col)

print("중복 제거 전 컬럼 개수 : ", df_train.shape[1])
df_train.drop(columns = duplicated_col, inplace = True)
df_test.drop(columns = duplicated_col, inplace = True)
print("중복 제거 후 컬럼 개수 : ", df_train.shape[1])

display(df_train.head())
display(df_test.head())

중복 제거 전 컬럼 개수 :  355
중복 제거 후 컬럼 개수 :  307


Unnamed: 0,Equipment_PalletID,Model_Receip,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,...,sin_hour_Fill1,cos_hour_Fill1,sin_minute_Fill1,cos_minute_Fill1,sin_date_Fill1,sin_hour_Fill2,cos_hour_Fill2,sin_minute_Fill2,cos_minute_Fill2,sin_date_Fill2
0,Dispenser_1_7,AJX75334505_1,4F1XA938-1,240.0,2.5,-90,100,1030,16,14.9,...,0.136167,-0.990686,0.847734,-0.530421,-0.585049,0.136167,-0.990686,0.847734,-0.530421,-0.585049
1,Dispenser_1_7,AJX75334505_1,3KPM0016-2,240.0,2.5,-90,70,1030,10,21.3,...,-0.631088,-0.775711,-0.053222,-0.998583,0.948947,-0.631088,-0.775711,-0.053222,-0.998583,0.948947
2,Dispneser_2_10,AJX75334501_1,4E1X9167-1,1000.0,12.5,90,85,280,16,14.7,...,0.631088,-0.775711,-0.053222,-0.998583,-0.996436,0.631088,-0.775711,-0.053222,-0.998583,-0.996436
3,Dispneser_2_12,AJX75334501_1,3K1X0057-1,1000.0,12.5,90,70,280,10,21.3,...,-0.81697,-0.57668,-0.899312,-0.437307,0.912166,-0.81697,-0.57668,-0.899312,-0.437307,0.912166
4,Dispenser_1_8,AJX75334501_1,3HPM0007-1,240.0,2.5,-90,70,1030,10,9.7,...,-0.398401,-0.917211,0.847734,-0.530421,0.440394,-0.398401,-0.917211,0.847734,-0.530421,0.440394


Unnamed: 0,Equipment_PalletID,Model_Receip,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,...,sin_hour_Fill1,cos_hour_Fill1,sin_minute_Fill1,cos_minute_Fill1,sin_date_Fill1,sin_hour_Fill2,cos_hour_Fill2,sin_minute_Fill2,cos_minute_Fill2,sin_date_Fill2
0,Dispneser_2_13,AJX75334501_1,3J1XF767-1,1000.0,12.5,90,70,280,10,17.0,...,-0.398401,-0.917211,-0.053222,-0.998583,0.968077,-0.398401,-0.917211,-0.053222,-0.998583,0.968077
1,Dispneser_2_14,AJX75334501_1,4B1XD472-2,1000.0,12.5,90,70,280,16,14.2,...,-0.942261,-0.33488,-0.818303,0.574787,-0.912166,-0.942261,-0.33488,-0.818303,0.574787,-0.912166
2,Dispenser_1_1,AJX75334501_1,3H1XE355-1,240.0,2.5,-90,70,1030,10,9.7,...,0.136167,-0.990686,-0.899312,-0.437307,0.688967,0.136167,-0.990686,-0.899312,-0.437307,0.688967
3,Dispneser_2_14,AJX75334501_1,3L1XA128-1,1000.0,12.5,90,70,280,10,21.3,...,0.81697,-0.57668,0.0,1.0,0.455495,0.81697,-0.57668,0.0,1.0,0.455495
4,Dispenser_1_1,AJX75334501_1,4A1XA639-1,240.0,2.5,-90,70,1030,16,13.2,...,-0.631088,-0.775711,0.0,1.0,-0.378779,-0.631088,-0.775711,0.874763,0.484551,-0.378779


In [124]:
df_train_org = df_train.iloc[:,:271]
df_train_time = df_train.iloc[:,271:]

df_test_org = df_test.iloc[:,:270]
df_test_time = df_test.iloc[:,270:]

In [125]:
df_train.fillna(0.0, inplace = True)
df_test.fillna(0.0, inplace = True)

df_train = df_train.replace([np.inf, -np.inf], 0.0)
df_test = df_test.replace([np.inf, -np.inf], 0.0)

df_train['Production Qty Collect Result'] = df_train['Production Qty Collect Result'].astype(int)
df_train['Production_Sequence_ratio'] = df_train['Production_Sequence_ratio'].astype(float)

df_test['Production Qty Collect Result'] = df_test['Production Qty Collect Result'].astype(int)
df_test['Production_Sequence_ratio'] = df_test['Production_Sequence_ratio'].astype(float)

In [126]:
df_train_org.fillna(0.0, inplace = True)
df_test_org.fillna(0.0, inplace = True)

df_train_org = df_train_org.replace([np.inf, -np.inf], 0.0)
df_test_org = df_test_org.replace([np.inf, -np.inf], 0.0)

df_train_org['Production Qty Collect Result'] = df_train_org['Production Qty Collect Result'].astype(int)
df_train_org['Production_Sequence_ratio'] = df_train_org['Production_Sequence_ratio'].astype(float)

df_test_org['Production Qty Collect Result'] = df_test_org['Production Qty Collect Result'].astype(int)
df_test_org['Production_Sequence_ratio'] = df_test_org['Production_Sequence_ratio'].astype(float)

In [127]:
df_train_clof  = df_train.copy()
df_test_clof = df_test.copy()

### Scaling(Normalizing) & Labeling

In [128]:
#Target mapping
mapping = {'Normal': 0, 'AbNormal': 1}
df_train['target'] = df_train['target'].map(mapping)

#Identify numerical columns
numerical_cols = df_train.select_dtypes(include=['float','int']).columns.to_list()
numerical_cols.remove("target")

#Initialize the Normalizer
scaler = Normalizer()

#Fit and transform the numerical columns
df_train[numerical_cols] = scaler.fit_transform(df_train[numerical_cols])
df_test[numerical_cols] = scaler.transform(df_test[numerical_cols])

#Target Encoding
str_col = []
for col in df_train.columns:
    if df_train[col].dtype == "object":
        str_col.append(col)

for col in str_col:
    te = ce.TargetEncoder()
    df_train[col] = te.fit_transform(df_train[col], df_train['target'])
    df_test[col] = te.transform(df_test[col])

In [129]:
#Target mapping
mapping = {'Normal': 0, 'AbNormal': 1}
df_train_org['target'] = df_train_org['target'].map(mapping)

#Identify numerical columns
numerical_cols = df_train_org.select_dtypes(include=['float','int']).columns.to_list()
numerical_cols.remove("target")

#Initialize the Normalizer
scaler = Normalizer()

#Fit and transform the numerical columns
df_train_org[numerical_cols] = scaler.fit_transform(df_train_org[numerical_cols])
df_test_org[numerical_cols] = scaler.transform(df_test_org[numerical_cols])

#Target Encoding
str_col = []
for col in df_train_org.columns:
    if df_train_org[col].dtype == "object":
        str_col.append(col)

for col in str_col:
    te = ce.TargetEncoder()
    df_train_org[col] = te.fit_transform(df_train_org[col], df_train_org['target'])
    df_test_org[col] = te.transform(df_test_org[col])

### Validation Strategy based on Hard and Easy Sample 

In [130]:
from scipy.spatial.distance import mahalanobis
from numpy.linalg import inv

# Assuming df_train_org is your DataFrame and 'target' is your column of labels
# Label 'normal' is mapped to 0, and 'abnormal' is mapped to 1 for clarity

# Separate the data by label
normal_data = df_train_org[df_train_org['target'] == 0].drop(columns=['target'])
abnormal_data = df_train_org[df_train_org['target'] == 1].drop(columns=['target'])

# Calculate covariance matrices
cov_normal = np.cov(normal_data.T)
cov_abnormal = np.cov(abnormal_data.T)

# Add a small value to the diagonal for regularization
reg_value = 1e-6
cov_normal += np.eye(cov_normal.shape[0]) * reg_value
cov_abnormal += np.eye(cov_abnormal.shape[0]) * reg_value

# Calculate inverse covariance matrices
inv_cov_normal = inv(cov_normal)
inv_cov_abnormal = inv(cov_abnormal)

# Calculate means
mean_normal = np.mean(normal_data, axis=0)
mean_abnormal = np.mean(abnormal_data, axis=0)

hard_samples = []
easy_samples = []

# Define a threshold for classifying a sample as hard
threshold = 0.3 # You can adjust this threshold as needed

# Classify samples based on Mahalanobis distance
for index, row in df_train_org.drop(columns=['target']).iterrows():
    row_values = row.values
    dist_normal = mahalanobis(row_values, mean_normal, inv_cov_normal)
    dist_abnormal = mahalanobis(row_values, mean_abnormal, inv_cov_abnormal)

    # If the distances are similar, consider it a hard sample
    if abs(dist_normal - dist_abnormal) < threshold:
        hard_samples.append(df_train_org.iloc[index])
    else:
        easy_samples.append(df_train_org.iloc[index])

print(f"Hard samples: {len(hard_samples)}, Easy samples: {len(easy_samples)}")

# Convert hard_samples and easy_samples into DataFrames for easy viewing
hard_samples_df = pd.DataFrame(hard_samples)
easy_samples_df = pd.DataFrame(easy_samples)

hard_samples_df['easy_hard'] = "HARD"
easy_samples_df['easy_hard'] = "EASY"

df_train_org = pd.concat([hard_samples_df, easy_samples_df], axis = 0).sort_index()
df_train_org.head()

Hard samples: 16038, Easy samples: 24468


Unnamed: 0,Equipment_PalletID,Model_Receip,Workorder,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,...,speed_7,speed_9,speed_11,CURE_START_END_DISTANCE_Fill2_X,CURE_STANDBY_END_DISTANCE_Fill2_Z,CURE_TIME_X_Fill2,CURE_TIME_Z_Fill2,CURE_START_END_DISTANCE_Dam_X,CURE_TIME_X_Dam,easy_hard
0,0.054457,0.0486,0.158385,0.004127,4.3e-05,-0.001548,0.00172,0.017714,0.000275,0.000256,...,1.7e-05,1.7e-05,1.7e-05,0.013414,0.0,0.000279,0.0,0.013586,0.000136,EASY
1,0.054457,0.0486,0.015314,0.003764,3.9e-05,-0.001412,0.001098,0.016154,0.000157,0.000334,...,1.6e-05,1.6e-05,1.6e-05,0.012233,0.0,0.000245,0.0,0.01239,0.000177,EASY
2,0.052058,0.056695,0.009534,0.017099,0.000214,0.001539,0.001453,0.004788,0.000274,0.000251,...,1.7e-05,1.7e-05,1.7e-05,0.013337,0.0,0.000267,0.0,-0.012311,-0.000145,EASY
3,0.042871,0.056695,0.057143,0.021956,0.000274,0.001976,0.001537,0.006148,0.00022,0.000468,...,2.2e-05,2.2e-05,2.2e-05,0.017126,0.0,0.000343,0.0,-0.015809,-0.000226,EASY
4,0.060155,0.056695,0.109495,0.002586,2.7e-05,-0.00097,0.000754,0.011097,0.000108,0.000105,...,1.1e-05,1.1e-05,1.1e-05,0.008404,1.1e-05,0.000168,2.154775e-07,0.008511,0.000122,HARD


In [131]:
df_train['easy_hard'] = df_train_org['easy_hard']

In [132]:
#DL 시드 고정
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # GPU 사용 시 추가
    torch.backends.cudnn.deterministic = True  # Reproducibility를 위한 설정
    torch.backends.cudnn.benchmark = False

# clof

In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

seed_everything(42)

class CustomLabelEncoder(LabelEncoder):
    def fit(self, y):
        super().fit(y)
        return self

    def transform(self, y):
        known_classes = set(self.classes_)
        y_transformed = []
        new_label = len(self.classes_)
        class_mapping = {label: idx for idx, label in enumerate(self.classes_)}

        for label in y:
            if label in known_classes:
                y_transformed.append(class_mapping[label])
            else:
                y_transformed.append(new_label)
                new_label += 1

        return np.array(y_transformed)

    def fit_transform(self, y):
        self.fit(y)
        return self.transform(y)
    
train = df_train_clof.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
test = df_test_clof.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])

columns_to_keep = train.columns[train.columns != 'target']
train = train[columns_to_keep.tolist() + ['target']]  # Keep 'target' in train
test = test[columns_to_keep.tolist()]  # Exclude 'target' in test

train['target'] = train['target'].map({'AbNormal': 1, 'Normal': 0}).astype(int)

label_encoders = {}
for col in train.columns:
    if train[col].dtype == 'object':
        le = CustomLabelEncoder()
        train[col] = le.fit_transform(train[col].astype(str))
        test[col] = le.transform(test[col].astype(str))
        label_encoders[col] = le
        
# Initialize an empty array to store predictions
ensemble_predictions = np.zeros((test.shape[0],))

# Set up Stratified K-Fold
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Improved Contrastive Loss function
class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        euclidean_distance = F.pairwise_distance(output1, output2)
        loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
                          label * torch.pow(torch.clamp(self.margin - euclidean_distance, min=0.0), 2))
        return loss

# Enhanced encoder network with residual connections for CLOF
class EnhancedCLOFEncoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(EnhancedCLOFEncoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.fc_residual = nn.Linear(input_dim, output_dim)  # Residual connection

    def forward(self, x):
        residual = self.fc_residual(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x + residual  # Add the residual connection

# Create a custom dataset for contrastive learning
class ContrastiveDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x1 = self.data[idx]
        label = self.labels[idx]

        if label == 0:  # Inlier
            idx2 = idx  # Choose the same index
        else:  # Outlier
            idx2 = torch.randint(0, len(self.data), (1,)).item()  # Choose a random different index

        x2 = self.data[idx2]
        label = torch.tensor(int(label != self.labels[idx2]), dtype=torch.float32)  # 1 if different, 0 if same

        return x1, x2, label

# Hyperparameters
input_dim = train.drop(columns=['target']).shape[1]
hidden_dim = 128
output_dim = 64
batch_size = 128
test_batch_size = 256  # Larger batch size for test data to process faster
inlier_batch_size = 512  # Batch size for processing inliers
learning_rate = 0.00001
n_epochs = 100
margin = 1.0

# Stratified K-Fold Cross-Validation
for fold, (train_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    print(f"Training fold {fold + 1}/{n_splits}...")

    # Split the data
    train_fold = train.iloc[train_idx]
    val_fold = train.iloc[val_idx]

    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_fold.drop(columns=['target']))
    X_val = scaler.transform(val_fold.drop(columns=['target']))
    X_test = scaler.transform(test)

    y_train = train_fold['target'].values

    # Convert to tensors
    train_data = torch.tensor(X_train, dtype=torch.float32).to(device)
    test_data = torch.tensor(X_test, dtype=torch.float32).to(device)
    y_train = torch.tensor(y_train, dtype=torch.float32).to(device)

    # Create dataset and dataloader
    dataset = ContrastiveDataset(train_data, y_train)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Initialize the model, loss function, and optimizer
    model = EnhancedCLOFEncoder(input_dim, hidden_dim, output_dim).to(device)
    criterion = ContrastiveLoss(margin=margin).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, verbose=True)

    # Training loop
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        for x1, x2, label in train_loader:
            x1, x2, label = x1.to(device), x2.to(device), label.to(device)
            optimizer.zero_grad()

            output1 = model(x1)
            output2 = model(x2)
            loss = criterion(output1, output2, label)

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        scheduler.step(total_loss)
        print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {total_loss / len(train_loader):.4f}")

    # Generate predictions on the test set for the current fold
    model.eval()
    with torch.no_grad():
        test_loader = DataLoader(test_data, batch_size=test_batch_size, shuffle=False)
        encoded_test_data_list = []

        for x in test_loader:
            x = x.to(device)
            encoded_x = model(x)
            encoded_test_data_list.append(encoded_x)

        encoded_test_data = torch.cat(encoded_test_data_list)

        # Process inliers in batches to manage memory usage
        inlier_data = train_data[y_train == 0]  # Using only known inliers
        inlier_loader = DataLoader(inlier_data, batch_size=inlier_batch_size, shuffle=False)
        encoded_inliers_list = []

        for inlier_batch in inlier_loader:
            inlier_batch = inlier_batch.to(device)
            encoded_inlier_batch = model(inlier_batch)
            encoded_inliers_list.append(encoded_inlier_batch)

        encoded_inliers = torch.cat(encoded_inliers_list)

        # Compute distances in batches
        min_distances = []

        for test_vector in encoded_test_data:
            test_vector = test_vector.unsqueeze(0)  # Add batch dimension
            distances = F.pairwise_distance(test_vector, encoded_inliers)
            min_distance = torch.min(distances).item()
            min_distances.append(min_distance)

        min_distances = torch.tensor(min_distances, device=device)

        # Ensemble: Add predictions from the current fold to the ensemble
        ensemble_predictions += min_distances.cpu().numpy()

# Average the predictions from all folds
ensemble_predictions /= n_splits

# Improved thresholding for outlier detection using mean and standard deviation
threshold = np.mean(ensemble_predictions) + np.std(ensemble_predictions)  # Threshold for detecting outliers
outliers = ensemble_predictions > threshold

submission_clof = pd.read_csv("submission.csv")
submission_clof['target'] = ensemble_predictions
submission_clof.to_csv('submission_clof.csv', index=False)

Using device: cpu
Training fold 1/10...
Epoch 1/100, Loss: 0.1537
Epoch 2/100, Loss: 0.1288
Epoch 3/100, Loss: 0.1961
Epoch 4/100, Loss: 0.1898
Epoch 5/100, Loss: 0.2101
Epoch 6/100, Loss: 0.0689
Epoch 7/100, Loss: 0.1770
Epoch 8/100, Loss: 0.0902
Epoch 9/100, Loss: 0.0867
Epoch 10/100, Loss: 0.0894
Epoch 11/100, Loss: 0.1449
Epoch 12/100, Loss: 0.0608
Epoch 13/100, Loss: 0.0804
Epoch 14/100, Loss: 0.0757
Epoch 15/100, Loss: 0.0665
Epoch 16/100, Loss: 0.0504
Epoch 17/100, Loss: 0.0946
Epoch 18/100, Loss: 0.0660
Epoch 19/100, Loss: 0.0838
Epoch 20/100, Loss: 0.0582
Epoch 21/100, Loss: 0.0473
Epoch 22/100, Loss: 0.0680
Epoch 23/100, Loss: 0.1426
Epoch 24/100, Loss: 0.0714
Epoch 25/100, Loss: 0.0529
Epoch 26/100, Loss: 0.0356
Epoch 27/100, Loss: 0.0706
Epoch 28/100, Loss: 0.0744
Epoch 29/100, Loss: 0.0599
Epoch 30/100, Loss: 0.0257
Epoch 31/100, Loss: 0.0564
Epoch 32/100, Loss: 0.1180
Epoch 33/100, Loss: 0.0690
Epoch 34/100, Loss: 0.0439
Epoch 35/100, Loss: 0.0695
Epoch 00036: reducing le

Epoch 82/100, Loss: 0.0437
Epoch 83/100, Loss: 0.0473
Epoch 84/100, Loss: 0.0887
Epoch 85/100, Loss: 0.1344
Epoch 86/100, Loss: 0.0686
Epoch 87/100, Loss: 0.1645
Epoch 88/100, Loss: 0.0542
Epoch 89/100, Loss: 0.0578
Epoch 90/100, Loss: 0.0476
Epoch 91/100, Loss: 0.0836
Epoch 92/100, Loss: 0.0513
Epoch 93/100, Loss: 0.0777
Epoch 94/100, Loss: 0.0715
Epoch 95/100, Loss: 0.1738
Epoch 96/100, Loss: 0.1370
Epoch 97/100, Loss: 0.0350
Epoch 98/100, Loss: 0.0967
Epoch 99/100, Loss: 0.0655
Epoch 100/100, Loss: 0.0729
Training fold 4/10...
Epoch 1/100, Loss: 0.1878
Epoch 2/100, Loss: 0.1250
Epoch 3/100, Loss: 0.0784
Epoch 4/100, Loss: 0.1205
Epoch 5/100, Loss: 0.1029
Epoch 6/100, Loss: 0.1309
Epoch 7/100, Loss: 0.1164
Epoch 8/100, Loss: 0.0904
Epoch 00009: reducing learning rate of group 0 to 1.0000e-06.
Epoch 9/100, Loss: 0.0936
Epoch 10/100, Loss: 0.0948
Epoch 11/100, Loss: 0.1042
Epoch 12/100, Loss: 0.1433
Epoch 13/100, Loss: 0.1111
Epoch 14/100, Loss: 0.0938
Epoch 15/100, Loss: 0.0742
Epoch 

Epoch 64/100, Loss: 0.0677
Epoch 65/100, Loss: 0.0691
Epoch 66/100, Loss: 0.1262
Epoch 67/100, Loss: 0.1566
Epoch 68/100, Loss: 0.0633
Epoch 69/100, Loss: 0.0554
Epoch 70/100, Loss: 0.0701
Epoch 71/100, Loss: 0.0910
Epoch 72/100, Loss: 0.0828
Epoch 73/100, Loss: 0.0484
Epoch 74/100, Loss: 0.0711
Epoch 75/100, Loss: 0.0456
Epoch 76/100, Loss: 0.1472
Epoch 77/100, Loss: 0.0520
Epoch 78/100, Loss: 0.0556
Epoch 79/100, Loss: 0.1672
Epoch 80/100, Loss: 0.0561
Epoch 81/100, Loss: 0.0662
Epoch 82/100, Loss: 0.0693
Epoch 83/100, Loss: 0.0440
Epoch 84/100, Loss: 0.0480
Epoch 85/100, Loss: 0.0772
Epoch 86/100, Loss: 0.0573
Epoch 87/100, Loss: 0.0604
Epoch 88/100, Loss: 0.0639
Epoch 89/100, Loss: 0.1035
Epoch 90/100, Loss: 0.1061
Epoch 91/100, Loss: 0.1042
Epoch 92/100, Loss: 0.1010
Epoch 93/100, Loss: 0.0620
Epoch 94/100, Loss: 0.0524
Epoch 95/100, Loss: 0.2200
Epoch 96/100, Loss: 0.1435
Epoch 97/100, Loss: 0.1017
Epoch 98/100, Loss: 0.0908
Epoch 99/100, Loss: 0.0557
Epoch 100/100, Loss: 0.0542


Epoch 46/100, Loss: 0.0433
Epoch 47/100, Loss: 0.0364
Epoch 48/100, Loss: 0.0920
Epoch 49/100, Loss: 0.0825
Epoch 50/100, Loss: 0.0678
Epoch 51/100, Loss: 0.0631
Epoch 52/100, Loss: 0.1308
Epoch 53/100, Loss: 0.0721
Epoch 54/100, Loss: 0.1303
Epoch 55/100, Loss: 0.1653
Epoch 56/100, Loss: 0.1309
Epoch 57/100, Loss: 0.0501
Epoch 58/100, Loss: 0.0412
Epoch 59/100, Loss: 0.0562
Epoch 60/100, Loss: 0.1481
Epoch 61/100, Loss: 0.0493
Epoch 62/100, Loss: 0.0483
Epoch 63/100, Loss: 0.0565
Epoch 64/100, Loss: 0.0663
Epoch 65/100, Loss: 0.0807
Epoch 66/100, Loss: 0.0737
Epoch 67/100, Loss: 0.0821
Epoch 68/100, Loss: 0.0645
Epoch 69/100, Loss: 0.0472
Epoch 70/100, Loss: 0.1049
Epoch 71/100, Loss: 0.0455
Epoch 72/100, Loss: 0.0698
Epoch 73/100, Loss: 0.0489
Epoch 74/100, Loss: 0.1043
Epoch 75/100, Loss: 0.0585
Epoch 76/100, Loss: 0.0996
Epoch 77/100, Loss: 0.0719
Epoch 78/100, Loss: 0.0703
Epoch 79/100, Loss: 0.0471
Epoch 80/100, Loss: 0.1009
Epoch 81/100, Loss: 0.0652
Epoch 82/100, Loss: 0.0779
E

# dbn

In [57]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, Dataset

seed_sum = 0

for i in range(42,45):
    
    seed_everything(i)  # 원하는 시드 값으로 고정

    # Custom Label Encoder to handle unseen labels
#     class CustomLabelEncoder(LabelEncoder):
#         def fit(self, y):
#             super().fit(y)
#             return self

#         def transform(self, y):
#             known_classes = set(self.classes_)
#             y_transformed = []
#             new_label = len(self.classes_)
#             class_mapping = {label: idx for idx, label in enumerate(self.classes_)}

#             for label in y:
#                 if label in known_classes:
#                     y_transformed.append(class_mapping[label])
#                 else:
#                     y_transformed.append(new_label)
#                     new_label += 1

#             return np.array(y_transformed)

#         def fit_transform(self, y):
#             self.fit(y)
#             return self.transform(y)

    train = df_train.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
    test = df_test.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])

    columns_to_keep = train.columns[train.columns != 'target']
    train = train[columns_to_keep.tolist() + ['target']]  # Keep 'target' in train
    columns_to_keep = train.columns[(train.columns != 'target') & (train.columns != 'easy_hard')]
    test = test[columns_to_keep.tolist()]  # Exclude 'target' in test

#     train['target'] = train['target'].map({'AbNormal': 1, 'Normal': 0}).astype(int)

#     label_encoders = {}
#     for col in train.columns:
#         if train[col].dtype == 'object':
#             le = CustomLabelEncoder()
#             train[col] = le.fit_transform(train[col].astype(str))
#             test[col] = le.transform(test[col].astype(str))
#             label_encoders[col] = le

    # Convert to PyTorch Datasets
    class TabularDataset(Dataset):
        def __init__(self, data, target=None):
            self.data = torch.tensor(data, dtype=torch.float32)
            self.target = torch.tensor(target, dtype=torch.float32) if target is not None else None

        def __len__(self):
            return len(self.data)

        def __getitem__(self, index):
            if self.target is not None:
                return self.data[index], self.target[index]
            else:
                return self.data[index]

    test_dataset = TabularDataset(X_test)

    # Define the enhanced DBN with a feedforward architecture
    class EnhancedDBN(nn.Module):
        def __init__(self, input_dim, hidden_dims):
            super(EnhancedDBN, self).__init__()
            layers = []
            for i in range(len(hidden_dims)):
                if i == 0:
                    layers.append(nn.Linear(input_dim, hidden_dims[i]))
                else:
                    layers.append(nn.Linear(hidden_dims[i-1], hidden_dims[i]))
                layers.append(nn.BatchNorm1d(hidden_dims[i]))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(0.3))
            self.hidden_layers = nn.Sequential(*layers)
            self.output_layer = nn.Linear(hidden_dims[-1], 1)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            x = self.hidden_layers(x)
            x = self.output_layer(x)
            return self.sigmoid(x)

    # Initialize the model, loss function, and optimizer
    input_dim = train.shape[1] - 2
    hidden_dims = [256, 128, 64]  # Enhanced hidden layers

    # Stratified K-Fold Cross-Validation with ensembling
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train))
    test_predictions = np.zeros(len(test))
    best_val_losses = []

    for fold, (train_index, val_index) in enumerate(skf.split(train, train['easy_hard'])):
        print(f"Fold {fold+1}")
        X_train, X_val = train.drop(columns=['target','easy_hard']).values[train_index], train.drop(columns=['target','easy_hard']).values[val_index]
        y_train, y_val = train['target'].values[train_index], train['target'].values[val_index]
        
        # Normalize the data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(test)
        
        train_dataset = TabularDataset(X_train, y_train)
        val_dataset = TabularDataset(X_val, y_val)
        test_dataset = TabularDataset(X_test)
        
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
        
        model = EnhancedDBN(input_dim, hidden_dims)
        criterion = nn.BCELoss()
        optimizer = optim.AdamW(model.parameters(), lr=0.001)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)
        
        # Training loop
        epochs = 30
        best_val_loss = np.inf
        patience = 5
        patience_counter = 0
    
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            for data, target in train_loader:
                optimizer.zero_grad()
                output = model(data).squeeze()
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                train_loss += loss.item() * data.size(0)

            train_loss /= len(train_loader.dataset)

            model.eval()
            val_loss = 0
            val_preds = []
            with torch.no_grad():
                for data, target in val_loader:
                    output = model(data).squeeze()
                    loss = criterion(output, target)
                    val_loss += loss.item() * data.size(0)
                    val_preds.extend(output.cpu().numpy())

            val_loss /= len(val_loader.dataset)
            scheduler.step(val_loss)

            print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print("Early stopping triggered.")
                    break

        # Out-of-Fold Predictions and Test Set Predictions
        oof_predictions[val_index] = val_preds
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
        test_preds = []
        with torch.no_grad():
            for data in test_loader:
                output = model(data).squeeze()
                test_preds.extend(output.cpu().numpy())
        test_predictions += np.array(test_preds) / skf.n_splits

        best_val_losses.append(best_val_loss)
    
    seed_sum += 1
    
# Save the submission file
submission_dbn = pd.read_csv("submission.csv")
submission_dbn['target'] = test_predictions / seed_sum
submission_dbn.to_csv('submission_dbn_원본.csv', index=False)

Fold 1
Epoch 1/30, Training Loss: 0.2521, Validation Loss: 0.1922
Epoch 2/30, Training Loss: 0.2036, Validation Loss: 0.1869
Epoch 3/30, Training Loss: 0.2013, Validation Loss: 0.1841
Epoch 4/30, Training Loss: 0.1980, Validation Loss: 0.1843
Epoch 5/30, Training Loss: 0.1971, Validation Loss: 0.1857
Epoch 6/30, Training Loss: 0.1962, Validation Loss: 0.1839
Epoch 7/30, Training Loss: 0.1946, Validation Loss: 0.1799
Epoch 8/30, Training Loss: 0.1935, Validation Loss: 0.1804
Epoch 9/30, Training Loss: 0.1918, Validation Loss: 0.1805
Epoch 10/30, Training Loss: 0.1924, Validation Loss: 0.1790
Epoch 11/30, Training Loss: 0.1908, Validation Loss: 0.1798
Epoch 12/30, Training Loss: 0.1904, Validation Loss: 0.1804
Epoch 13/30, Training Loss: 0.1890, Validation Loss: 0.1789
Epoch 14/30, Training Loss: 0.1897, Validation Loss: 0.1802
Epoch 15/30, Training Loss: 0.1895, Validation Loss: 0.1808
Epoch 16/30, Training Loss: 0.1868, Validation Loss: 0.1784
Epoch 17/30, Training Loss: 0.1877, Valida

Epoch 1/30, Training Loss: 0.2398, Validation Loss: 0.1939
Epoch 2/30, Training Loss: 0.2050, Validation Loss: 0.1925
Epoch 3/30, Training Loss: 0.2006, Validation Loss: 0.1880
Epoch 4/30, Training Loss: 0.1978, Validation Loss: 0.1900
Epoch 5/30, Training Loss: 0.1970, Validation Loss: 0.1880
Epoch 6/30, Training Loss: 0.1940, Validation Loss: 0.1881
Epoch 7/30, Training Loss: 0.1930, Validation Loss: 0.1845
Epoch 8/30, Training Loss: 0.1926, Validation Loss: 0.1843
Epoch 9/30, Training Loss: 0.1914, Validation Loss: 0.1867
Epoch 10/30, Training Loss: 0.1913, Validation Loss: 0.1846
Epoch 11/30, Training Loss: 0.1897, Validation Loss: 0.1827
Epoch 12/30, Training Loss: 0.1902, Validation Loss: 0.1861
Epoch 13/30, Training Loss: 0.1891, Validation Loss: 0.1835
Epoch 14/30, Training Loss: 0.1888, Validation Loss: 0.1819
Epoch 15/30, Training Loss: 0.1888, Validation Loss: 0.1827
Epoch 16/30, Training Loss: 0.1872, Validation Loss: 0.1841
Epoch 17/30, Training Loss: 0.1875, Validation Lo

Epoch 10/30, Training Loss: 0.1901, Validation Loss: 0.1895
Epoch 11/30, Training Loss: 0.1898, Validation Loss: 0.1878
Epoch 12/30, Training Loss: 0.1899, Validation Loss: 0.1887
Epoch 13/30, Training Loss: 0.1891, Validation Loss: 0.1919
Epoch 14/30, Training Loss: 0.1884, Validation Loss: 0.1886
Epoch 15/30, Training Loss: 0.1875, Validation Loss: 0.1871
Epoch 16/30, Training Loss: 0.1877, Validation Loss: 0.1872
Epoch 17/30, Training Loss: 0.1865, Validation Loss: 0.1890
Epoch 18/30, Training Loss: 0.1867, Validation Loss: 0.1878
Epoch 19/30, Training Loss: 0.1873, Validation Loss: 0.1866
Epoch 20/30, Training Loss: 0.1864, Validation Loss: 0.1883
Epoch 21/30, Training Loss: 0.1854, Validation Loss: 0.1884
Epoch 22/30, Training Loss: 0.1851, Validation Loss: 0.1886
Epoch 00023: reducing learning rate of group 0 to 5.0000e-04.
Epoch 23/30, Training Loss: 0.1853, Validation Loss: 0.1906
Epoch 24/30, Training Loss: 0.1824, Validation Loss: 0.1889
Early stopping triggered.
Fold 3
Epoch

Epoch 8/30, Training Loss: 0.1916, Validation Loss: 0.2025
Epoch 9/30, Training Loss: 0.1896, Validation Loss: 0.2018
Epoch 10/30, Training Loss: 0.1896, Validation Loss: 0.2018
Epoch 11/30, Training Loss: 0.1886, Validation Loss: 0.1997
Epoch 12/30, Training Loss: 0.1893, Validation Loss: 0.1988
Epoch 13/30, Training Loss: 0.1877, Validation Loss: 0.2027
Epoch 14/30, Training Loss: 0.1865, Validation Loss: 0.2011
Epoch 15/30, Training Loss: 0.1867, Validation Loss: 0.2000
Epoch 00016: reducing learning rate of group 0 to 5.0000e-04.
Epoch 16/30, Training Loss: 0.1860, Validation Loss: 0.1994
Epoch 17/30, Training Loss: 0.1849, Validation Loss: 0.1970
Epoch 18/30, Training Loss: 0.1839, Validation Loss: 0.1959
Epoch 19/30, Training Loss: 0.1828, Validation Loss: 0.1961
Epoch 20/30, Training Loss: 0.1832, Validation Loss: 0.1981
Epoch 21/30, Training Loss: 0.1823, Validation Loss: 0.1967
Epoch 00022: reducing learning rate of group 0 to 2.5000e-04.
Epoch 22/30, Training Loss: 0.1825, Va

Epoch 3/30, Training Loss: 0.2003, Validation Loss: 0.1884
Epoch 4/30, Training Loss: 0.1977, Validation Loss: 0.1863
Epoch 5/30, Training Loss: 0.1968, Validation Loss: 0.1878
Epoch 6/30, Training Loss: 0.1933, Validation Loss: 0.1891
Epoch 7/30, Training Loss: 0.1936, Validation Loss: 0.1857
Epoch 8/30, Training Loss: 0.1913, Validation Loss: 0.1824
Epoch 9/30, Training Loss: 0.1916, Validation Loss: 0.1874
Epoch 10/30, Training Loss: 0.1910, Validation Loss: 0.1838
Epoch 11/30, Training Loss: 0.1900, Validation Loss: 0.1819
Epoch 12/30, Training Loss: 0.1893, Validation Loss: 0.1821
Epoch 13/30, Training Loss: 0.1899, Validation Loss: 0.1823
Epoch 14/30, Training Loss: 0.1888, Validation Loss: 0.1825
Epoch 00015: reducing learning rate of group 0 to 5.0000e-04.
Epoch 15/30, Training Loss: 0.1875, Validation Loss: 0.1833
Epoch 16/30, Training Loss: 0.1867, Validation Loss: 0.1816
Epoch 17/30, Training Loss: 0.1866, Validation Loss: 0.1833
Epoch 18/30, Training Loss: 0.1849, Validatio

Epoch 15/30, Training Loss: 0.1868, Validation Loss: 0.1907
Early stopping triggered.
Fold 10
Epoch 1/30, Training Loss: 0.2634, Validation Loss: 0.1952
Epoch 2/30, Training Loss: 0.2060, Validation Loss: 0.1881
Epoch 3/30, Training Loss: 0.2012, Validation Loss: 0.1826
Epoch 4/30, Training Loss: 0.1985, Validation Loss: 0.1863
Epoch 5/30, Training Loss: 0.1970, Validation Loss: 0.1827
Epoch 6/30, Training Loss: 0.1948, Validation Loss: 0.1825
Epoch 7/30, Training Loss: 0.1936, Validation Loss: 0.1817
Epoch 8/30, Training Loss: 0.1930, Validation Loss: 0.1822
Epoch 9/30, Training Loss: 0.1922, Validation Loss: 0.1854
Epoch 10/30, Training Loss: 0.1917, Validation Loss: 0.1805
Epoch 11/30, Training Loss: 0.1904, Validation Loss: 0.1817
Epoch 12/30, Training Loss: 0.1900, Validation Loss: 0.1816
Epoch 13/30, Training Loss: 0.1901, Validation Loss: 0.1795
Epoch 14/30, Training Loss: 0.1891, Validation Loss: 0.1797
Epoch 15/30, Training Loss: 0.1890, Validation Loss: 0.1797
Epoch 16/30, Tr

# transformer

In [58]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score

seed_sum = 0

for i in range(42,43):
    
    seed_everything(i)  # 원하는 시드 값으로 고정

    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Identify columns to keep
    train = df_train.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
    test = df_test.drop(columns = ['sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])

    columns_to_keep = train.columns[train.columns != 'target']
    train = train[columns_to_keep.tolist() + ['target']]  # Keep 'target' in train
    columns_to_keep = train.columns[(train.columns != 'target') & (train.columns != 'easy_hard')]
    test = test[columns_to_keep.tolist()]  # Exclude 'target' in test

#     train['target'] = train['target'].map({'AbNormal': 1, 'Normal': 0}).astype(int)

#     str_col = []
#     for col in train.columns:
#         if train[col].dtype == "object":
#             str_col.append(col)

#     for col in str_col:
#         te = ce.TargetEncoder()
#         train[col] = te.fit_transform(train[col], train['target'])
#         test[col] = te.transform(test[col])

    # Convert to PyTorch Datasets
    class TabularDataset(Dataset):
        def __init__(self, data, target=None):
            self.data = torch.tensor(data, dtype=torch.float32).to(device)
            self.target = torch.tensor(target, dtype=torch.float32).to(device) if target is not None else None

        def __len__(self):
            return len(self.data)

        def __getitem__(self, index):
            if self.target is not None:
                return self.data[index], self.target[index]
            else:
                return self.data[index]

    # Define the Transformer-based model with enhancements
    class EnhancedTransformer(nn.Module):
        def __init__(self, input_dim, n_heads=8, n_layers=4, dim_feedforward=256, dropout=0.1):
            super(EnhancedTransformer, self).__init__()
            self.embedding = nn.Linear(input_dim, dim_feedforward)
            encoder_layer = nn.TransformerEncoderLayer(d_model=dim_feedforward, nhead=n_heads, dim_feedforward=dim_feedforward, dropout=dropout)
            self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
            self.fc_out = nn.Linear(dim_feedforward, 1)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            # Embed the input
            x = self.embedding(x)
            x = self.transformer_encoder(x.unsqueeze(0))  # Adding sequence dimension
            # Apply the final fully connected layer and sigmoid activation
            x = self.fc_out(x.squeeze(0))
            return self.sigmoid(x)

    input_dim = train.shape[1] - 2

    # Stratified K-Fold Cross-Validation with ensembling
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    oof_predictions = np.zeros(len(train))
    test_predictions = np.zeros(len(test))

    # Initialize the model, loss function, and optimizer
    model = EnhancedTransformer(input_dim=input_dim).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.00001)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

    for fold, (train_index, val_index) in enumerate(skf.split(train, train['easy_hard'])):
        print(f"Fold {fold+1}")
        X_train, X_val = train.drop(columns=['target','easy_hard']).values[train_index], train.drop(columns=['target','easy_hard']).values[val_index]
        y_train, y_val = train['target'].values[train_index], train['target'].values[val_index]
        
        # Normalize data
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(test)
        
        train_dataset = TabularDataset(X_train, y_train)
        val_dataset = TabularDataset(X_val, y_val)
        test_dataset = TabularDataset(X_test)
                                                                                                   
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        # Training loop
        epochs = 30
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            for data, target in train_loader:
                optimizer.zero_grad()
                output = model(data).squeeze()
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()
                train_loss += loss.item() * data.size(0)

            train_loss /= len(train_loader.dataset)

            model.eval()
            val_loss = 0
            val_preds = []
            with torch.no_grad():
                for data, target in val_loader:
                    output = model(data).squeeze()
                    loss = criterion(output, target)
                    val_loss += loss.item() * data.size(0)
                    val_preds.extend(output.cpu().numpy())

            val_loss /= len(val_loader.dataset)
            scheduler.step()

            print(f"Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Out-of-Fold Predictions and Test Set Predictions
        oof_predictions[val_index] = val_preds
        test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
        test_preds = []
        with torch.no_grad():
            for data in test_loader:
                output = model(data).squeeze()
                test_preds.extend(output.cpu().numpy())
        test_predictions += np.array(test_preds) / skf.n_splits
    
    seed_sum += 1
    
# Save the submission file
submission_tf = pd.read_csv("submission.csv")
submission_tf['target'] = test_predictions / seed_sum
submission_tf.to_csv('submission_tf_원본.csv', index=False)

Fold 1
Epoch 1/30, Training Loss: 0.2337, Validation Loss: 0.1929
Epoch 2/30, Training Loss: 0.1982, Validation Loss: 0.1892
Epoch 3/30, Training Loss: 0.1947, Validation Loss: 0.1865
Epoch 4/30, Training Loss: 0.1927, Validation Loss: 0.1856
Epoch 5/30, Training Loss: 0.1914, Validation Loss: 0.1851
Epoch 6/30, Training Loss: 0.1911, Validation Loss: 0.1847
Epoch 7/30, Training Loss: 0.1901, Validation Loss: 0.1836
Epoch 8/30, Training Loss: 0.1894, Validation Loss: 0.1837
Epoch 9/30, Training Loss: 0.1892, Validation Loss: 0.1836
Epoch 10/30, Training Loss: 0.1890, Validation Loss: 0.1837
Epoch 11/30, Training Loss: 0.1888, Validation Loss: 0.1837
Epoch 12/30, Training Loss: 0.1888, Validation Loss: 0.1837
Epoch 13/30, Training Loss: 0.1896, Validation Loss: 0.1837
Epoch 14/30, Training Loss: 0.1897, Validation Loss: 0.1839
Epoch 15/30, Training Loss: 0.1890, Validation Loss: 0.1834
Epoch 16/30, Training Loss: 0.1888, Validation Loss: 0.1828
Epoch 17/30, Training Loss: 0.1885, Valida

KeyboardInterrupt: 

# ML

In [133]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    
seed_everything(42) # Seed 고정

# LGBM CLASSIFIER

In [134]:
#X, y and cat_features definition
X = df_train.drop(columns = ['target'])
columns_to_keep = X.columns[X.columns != 'easy_hard']
X_test = df_test[columns_to_keep]
y = df_train['target'].values
#cat_features = str_col.copy() #Target Encoding 사용 시 주석처리

In [135]:
# {'max_depth': 16, 'num_leaves': 351, 'learning_rate': 0.03893878626108622, 
#  'lambda_l1': 1.4673071932573515e-06, 'lambda_l2': 0.1264772890278767, 
#  'bagging_fraction': 0.6993706440668855, 'feature_fraction': 0.5626216918840518, 'max_bin': 74}. Best is trial 23 with value: 0.27672955974842767.


In [136]:
# {'max_depth': 27, 'num_leaves': 208, 'learning_rate': 0.019101568674409413, 
#  'lambda_l1': 0.3326367603177364, 'lambda_l2': 1.486177480283442e-05, 
#  'bagging_fraction': 0.5242887175981938, 'feature_fraction': 0.5644130879580334, 'max_bin': 63}

# {'max_depth': 21, 'num_leaves': 200, 'learning_rate': 0.010341075793340348, 
#  'lambda_l1': 0.9571241496969837, 'lambda_l2': 0.0008310433648726436, 
#  'bagging_fraction': 0.5900656895949932, 'feature_fraction': 0.513070737805499, 'max_bin': 73}. Best is trial 12 with value: 0.2762345679012346.

import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
import matplotlib.pyplot as plt
# import seaborn as sns

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0.5, 0, 1)  # scikit-learn's f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

train = X
y = y
models_1 = []
evals_result = {}
test_preds = []

# for state in [1,3,5,42,27]:
for state in [1]:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=state)
    for train_index, test_index in skf.split(train,train['easy_hard']):
        X_train, X_val = train.drop(columns=['easy_hard']).iloc[train_index], train.drop(columns=['easy_hard']).iloc[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print(X_train.shape, X_val.shape)

        y_pred_list = []
        
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        print(state)
        params = {
            "objective": "binary",
            "metric": "binary_error",
            "verbosity": -1,
            "boosting_type": "gbdt",
          'max_depth': 25, 'num_leaves': 362, 'learning_rate': 0.046370921779368225, 
 'lambda_l1': 0.024642915353579757, 'lambda_l2': 0.06870119528218616, 
 'bagging_fraction': 0.6854130371192677, 'feature_fraction': 0.5004143807034069, 'max_bin': 96, "early_stopping_round": 100,
        }

        params["seed"] = state
        model = lgbm.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            feval=lgb_f1_score,
            num_boost_round=1000
        )

        y_pred_list.append(model.predict(X_val))
        print(roc_auc_score(y_val, np.mean(y_pred_list, axis=0)))
        y_val_f1 = np.where(y_val >= 0.15, 1, 0)
        y_pred_f1 = np.where(np.mean(y_pred_list, axis=0) >= 0.15, 1, 0)
        print(f1_score(y_val_f1, y_pred_f1, average='binary'))

        models_1.append(model)

(32404, 306) (8102, 306)
1
0.7418278527397626
0.26410564225690275
(32405, 306) (8101, 306)
1
0.7212892960462874
0.23515715948777646
(32405, 306) (8101, 306)
1
0.7507326265365786
0.2780373831775701
(32405, 306) (8101, 306)
1
0.7358890018284858
0.28868360277136257
(32405, 306) (8101, 306)
1
0.7223128530100565
0.23570595099183195


In [137]:
test_preds_1 = []
test_preds_lst = []
for i in range(len(models_1)):
    pred = models_1[i].predict(X_test)
    test_preds_1.append(pred)
    
test_preds_1 = np.mean(test_preds_1,axis=0)

# LGBM REGRESSOR

In [None]:
 0.27892234548335976 
{'max_depth': 25, 'num_leaves': 362, 'learning_rate': 0.046370921779368225, 
 'lambda_l1': 0.024642915353579757, 'lambda_l2': 0.06870119528218616, 
 'bagging_fraction': 0.6854130371192677, 'feature_fraction': 0.5004143807034069, 'max_bin': 96}. Best is trial 59 with value: 0.27892234548335976.



In [None]:
#X = df_train.drop(columns = ['target','sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
X = df_train.drop(columns = ['target'])
columns_to_keep = X.columns[X.columns != 'easy_hard']
X_test = df_test[columns_to_keep]
y = df_train['target'].values

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
import matplotlib.pyplot as plt
# import seaborn as sns

def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat < 0.5, 0, 1)  # scikit-learn's f1 doesn't like probabilities
    return 'f1', f1_score(y_true, y_hat), True

train = X
y = y
models_2 = []
evals_result = {}
test_preds = []

# for state in [1,3,5,42,27]:
for state in [1,3,5,42,27]:
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=state)

    for train_index, test_index in skf.split(train,train['easy_hard']):
        X_train, X_val = train.drop(columns=['easy_hard']).iloc[train_index], train.drop(columns=['easy_hard']).iloc[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print(X_train.shape, X_val.shape)

        y_pred_list = []
        
        dtrain = lgbm.Dataset(X_train, y_train)
        dvalid = lgbm.Dataset(X_val, y_val)
        print(state)
        params = {
            "objective": "regression",
            "metric": "rmse",
            "verbosity": -1,
            "boosting_type": "gbdt",
'max_depth': 16, 'num_leaves': 333, 'learning_rate': 0.021471027173361846,
 'lambda_l1': 0.005384441312585211, 'lambda_l2': 0.000648197586104832, 
 'bagging_fraction': 0.48644143461088324, 'feature_fraction': 0.5120050389226708, 'max_bin': 66, "early_stopping_round": 100,
        }
        params["seed"] = state
        
        model = lgbm.train(
            params,
            dtrain,
            valid_sets=[dvalid],
            feval=lgb_f1_score,
            num_boost_round=500
        )

        y_pred_list.append(model.predict(X_val))
        print(roc_auc_score(y_val, np.mean(y_pred_list, axis=0)))
        y_val_f1 = np.where(y_val >= 0.15, 1, 0)
        y_pred_f1 = np.where(np.mean(y_pred_list, axis=0) >= 0.15, 1, 0)
        print(f1_score(y_val_f1, y_pred_f1, average='binary'))

        models_2.append(model)

In [None]:
test_preds_2 = []

for i in range(len(models_2)):
    pred = models_2[i].predict(X_test)
    test_preds_2.append(pred)

sorted_probs = np.sort(test_preds_2, axis=0)
test_preds_2 = np.mean(sorted_probs[:],axis=0)

# CAT

In [None]:
#X = df_train.drop(columns = ['target','sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
X = df_train.drop(columns = ['target'])
columns_to_keep = X.columns[X.columns != 'easy_hard']
X_test = df_test[columns_to_keep]
y = df_train['target'].values

In [None]:
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, matthews_corrcoef
import numpy as np
import gc

model_cat_1 = []
is_holdout = False
f1_scores = []
mcc_scores = []  # MCC 점수를 저장할 리스트 추가
n_split_list = [10]

# For reproducibility
for state in [1, 5, 42, 77, 777]:
    for split in n_split_list:
        fold_idx = 1
        cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=state)
        for train_index, valid_index in cv.split(X, X['easy_hard']):
            X_train, X_valid = X.drop(columns=['easy_hard']).iloc[train_index], X.drop(columns=['easy_hard']).iloc[valid_index]
            Y_train, Y_valid = y[train_index], y[valid_index]
            print("="*50)
            
            model = CatBoostClassifier(
                iterations=3000,
                random_state=state,
                task_type="CPU",
                depth=3,
                eval_metric="F1",
                bootstrap_type='Bayesian',  # Bayesian Bootstrap 사용
                random_strength=4,
                l2_leaf_reg=4,
                bagging_temperature=0.5,
                grow_policy='Depthwise',
                learning_rate=0.7,
            )
            
            model.fit(
                X_train, Y_train,
                eval_set=(X_valid, Y_valid),   # 검증 세트 설정
                early_stopping_rounds=300, 
                verbose=100
            )
            
            pred = model.predict_proba(X_valid)[:, 1]
            threshold = 0.5

            pred = np.where(pred >= threshold, True, False)
            
            # F1 score 계산
            f1 = f1_score(Y_valid, pred, labels=[True, False], average='binary')
            # MCC 계산
            mcc = matthews_corrcoef(Y_valid, pred)
            
            print(fold_idx, "Fold Validation F1 score:", f1)
            print(fold_idx, "Fold Validation MCC score:", mcc)
            
            f1_scores.append(f1)
            mcc_scores.append(mcc)  # MCC 점수를 리스트에 추가
            model_cat_1.append(model)
            fold_idx += 1
            
            gc.collect()
            
            if is_holdout:
                break 
    print(state, '학습 완료')
    
print("Validation : F1:", np.mean(f1_scores))
print("Validation : MCC:", np.mean(mcc_scores))

In [None]:
test_preds_3 = []

for i in range(len(model_cat_1)):
    pred = model_cat_1[i].predict(X_test)
    test_preds_3.append(pred)

sorted_probs = np.sort(test_preds_3, axis=0)
test_preds_3 = np.mean(sorted_probs[:],axis=0)

# XGB

In [None]:
#X = df_train.drop(columns = ['target','sj_1','sj_5','sj_6','sj_7','sj_8','sj_9'])
X = df_train.drop(columns = ['target'])
columns_to_keep = X.columns[X.columns != 'easy_hard']
X_test = df_test[columns_to_keep]
y = df_train['target'].values

In [None]:
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, matthews_corrcoef
import numpy as np
import gc

model_xgb_1 = []
is_holdout = False
f1_scores = []
mcc_scores = []  # MCC 점수를 저장할 리스트 추가
n_split_list = [10]

# For reproducibility
for state in [1, 5, 42, 77, 777]:
    for split in n_split_list:
        fold_idx = 1
        cv = StratifiedKFold(n_splits=split, shuffle=True, random_state=state)
        for train_index, valid_index in cv.split(X, X['easy_hard']):
            X_train, X_valid = X.drop(columns=['easy_hard']).iloc[train_index], X.drop(columns=['easy_hard']).iloc[valid_index]
            Y_train, Y_valid = y[train_index], y[valid_index]
            print("="*50)
            
            model = XGBClassifier(
                n_estimators=3000,
                random_state=state,
                max_depth=12,
                learning_rate=0.05,
                scale_pos_weight=1,  # 클래스 불균형 처리
                colsample_bytree=0.9,  # 각 트리에 사용할 피처 비율
                subsample=1.0,  # 각 트리에 사용할 샘플 비율
                tree_method='gpu_hist', 
                gpu_id=0,
                grow_policy='lossguide',  # Depth-wise 트리 성장 방식
                use_label_encoder=False,  # 경고 메시지 방지
                early_stopping_rounds=30,
                verbosity=1
            )
            
            model.fit(
                X_train, Y_train,
                eval_set=[(X_valid, Y_valid)],
                verbose=True
            )
            
            pred = model.predict_proba(X_valid)[:, 1]
            threshold = 0.15

            pred = np.where(pred >= threshold, True, False)
            
            # F1 score 계산
            f1 = f1_score(Y_valid, pred, labels=[True, False], average='binary')
            # MCC 계산
            mcc = matthews_corrcoef(Y_valid, pred)
            
            print(fold_idx, "Fold Validation F1 score:", f1)
            print(fold_idx, "Fold Validation MCC score:", mcc)
            
            f1_scores.append(f1)
            mcc_scores.append(mcc)  # MCC 점수를 리스트에 추가
            model_xgb_1.append(model)
            fold_idx += 1
            
            gc.collect()
            
            if is_holdout:
                break 
    print(state, '학습 완료')
    
print("Validation : F1:", np.mean(f1_scores))
print("Validation : MCC:", np.mean(mcc_scores))

In [None]:
test_preds_4 = []

for i in range(len(model_xgb_1)):
    pred = model_xgb_1[i].predict(X_test)
    test_preds_4.append(pred)

sorted_probs = np.sort(test_preds_4, axis=0)
test_preds_4 = np.mean(sorted_probs[:],axis=0)

# ENSEMBLE WITH LGBM, XGB

In [138]:
# sw_cat = pd.read_csv("sw_cat.csv")
# sw_xgb = pd.read_csv("sw_xgb.csv")


In [139]:
# sw_cat.to_csv('lgbm_preds.csv')

In [140]:
#test_preds_lgbm = (test_preds_lgbm_clf *0.3+ test_preds_lgbm_reg*0.1+ sw_cat['target']*0.3 + sw_xgb['target']*0.3)
# test_preds_final = (test_preds_1+ test_preds_3 + test_preds_4)/3  ## lgbm regressor 는 빠짐
# test_preds_final = (test_preds_1 *0.3+ test_preds_3*0.4 + test_preds_4*0.3)  ## lgbm regressor 는 빠짐
test_preds_final = test_preds_1


In [141]:
test_preds_final = np.where(test_preds_final >= 0.195, "AbNormal","Normal") # 19best    1:1:1 에 19 가 best

In [142]:
pd.Series(test_preds_final).value_counts()

Normal      16924
AbNormal      437
Name: count, dtype: int64

# ENSEMBLE WITH DEEP & 후처리

In [143]:
submission_clof = pd.read_csv("submission_clof_ts.csv")
submission_dbn = pd.read_csv('submission_dbn_ts.csv')
submission_transformer = pd.read_csv('submission_tf_ts.csv')

In [144]:
submission_clof_index = set(submission_clof[submission_clof['target']>np.percentile(submission_clof['target'], 94)].index)
submission_dbn_index = set(submission_dbn[submission_dbn['target']>np.percentile(submission_dbn['target'], 94)].index)
submission_transformer_index = set(submission_transformer[submission_transformer['target']>np.percentile(submission_transformer['target'], 94)].index)

intersection =  submission_clof_index & submission_transformer_index & submission_dbn_index #& submission_siam_index

intersection_list_1 = list(intersection)

In [145]:
len(intersection_list_1)

200

In [146]:
submission_clof_index = set(submission_clof[submission_clof['target']<np.percentile(submission_clof['target'], 80)].index)
submission_dbn_index = set(submission_dbn[submission_dbn['target']<np.percentile(submission_dbn['target'], 80)].index)
submission_transformer_index = set(submission_transformer[submission_transformer['target']<np.percentile(submission_transformer['target'], 80)].index)

intersection = submission_clof_index  & submission_transformer_index & submission_dbn_index #& submission_siam_index

intersection_list_2 = list(intersection)

In [147]:
k1 = df_test_후처리용[df_test_후처리용['Receip No Collect Result_Dam']!=df_test_후처리용['Receip No Collect Result_Fill1']].index
k2 = df_test_후처리용[df_test_후처리용['Receip No Collect Result_Fill2']!=df_test_후처리용['Receip No Collect Result_Fill1']].index
k3 = df_test_후처리용[df_test_후처리용['Receip No Collect Result_Dam']!=df_test_후처리용['Receip No Collect Result_Fill2']].index

union_1 = list(set(k1) | set(k2) | set(k3))

k1 = df_test_후처리용[df_test_후처리용['Production Qty Collect Result_Dam']!=df_test_후처리용['Production Qty Collect Result_Fill2']].index
k2 = df_test_후처리용[df_test_후처리용['Production Qty Collect Result_Fill2']!=df_test_후처리용['Production Qty Collect Result_Fill1']].index
k3 = df_test_후처리용[df_test_후처리용['Production Qty Collect Result_Dam']!=df_test_후처리용['Production Qty Collect Result_Fill1']].index
union_2 = list(set(k1) | set(k2) | set(k3))
union_3 = list(set(union_1) | set(union_2))

df_sub = pd.read_csv("submission.csv")
df_sub.loc[X_test.index,"target"] = test_preds_final
#df_sub.loc[submission_lgbm_index,"target"] = 'AbNormal'

df_sub.loc[intersection_list_1,"target"] ='AbNormal'  # 딥 후처리 여기서
# df_sub.loc[intersection_list_3,"target"] ='AbNormal'

df_sub.loc[intersection_list_2,"target"] ='Normal'
#df_sub.loc[idx,"target"] ='AbNormal'

# df_sub.loc[union_3,"target"] = 'AbNormal'
df_sub['target'].value_counts()

target
Normal      16889
AbNormal      472
Name: count, dtype: int64

In [148]:
len(union_3)

28

In [149]:
df_train_tt['target'] = np.where(df_train_tt['target'] == 'Normal', 0, 1)

# 그룹화하고 mean 값 계산
grouped_df = df_train_tt.groupby(['Workorder', 'Collect Date_AutoClave'])['target'].mean()

# mean 값이 1인 행만 필터링
filtered_df = grouped_df[grouped_df >= 0.6]

# index로 'Workorder'와 'Collect Date_AutoClave'를 리스트화
filtered_list = list(filtered_df.index)

In [150]:
submission = df_sub

df_test_tt['target'] = submission['target']

df_test_tt.loc[df_test_tt.set_index(['Workorder', 'Collect Date_AutoClave']).index.isin(filtered_list), 'target'] = 'AbNormal'

submission['target'] = df_test_tt['target']

In [151]:
submission.loc[union_3,"target"] = 'AbNormal'


In [152]:
submission['target'].value_counts()

target
Normal      16832
AbNormal      529
Name: count, dtype: int64

In [153]:
submission

Unnamed: 0,Set ID,target
0,0001be084fbc4aaa9d921f39e595961b,Normal
1,0005bbd180064abd99e63f9ed3e1ac80,Normal
2,000948934c4140d883d670adcb609584,Normal
3,000a6bfd02874c6296dc7b2e9c5678a7,Normal
4,0018e78ce91343678716e2ea27a51c95,Normal
...,...,...
17356,ffea508b59934d689b540f95eb3fa730,Normal
17357,ffed8923c8a448a98afc641b770be153,Normal
17358,fff1e73734da40adbe805359b3efb462,Normal
17359,fff8e38bdd09470baf95f71e92075dec,Normal


In [None]:
submission_clof = pd.read_csv("submission_clof_ts.csv")


In [None]:
submission.to_csv("submission.csv", index=False)