In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler
import lightgbm as lgb
import warnings
# warnings.filterwarnings('ignore')

In [2]:
ROOT_DIR = "data"
random_state = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data['target'] = train_data['target'].map({"Normal":0, "AbNormal":1})
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]
train_data.head()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,0
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,0
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,0
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,0
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,0


In [3]:
def data_recovery(data):
    mixed_columns = [
        "GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave",
        "GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave",
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
        'Receip No Collect Result_Fill1',
        'Receip No Collect Result_Fill2',
    ]
    
    workmode_cols = [
        'WorkMode Collect Result_Dam',
        'WorkMode Collect Result_Fill1',
        'WorkMode Collect Result_Fill2',
    ]

    dam_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
        'HEAD Standby Position X Collect Result_Dam',
        'HEAD Standby Position Y Collect Result_Dam',
        'HEAD Standby Position Z Collect Result_Dam',
        'Head Clean Position X Collect Result_Dam',
        'Head Clean Position Y Collect Result_Dam',
        'Head Clean Position Z Collect Result_Dam',
        'Head Purge Position X Collect Result_Dam',
        'Head Purge Position Y Collect Result_Dam',
        'Head Purge Position Z Collect Result_Dam',
        'Head Zero Position X Collect Result_Dam',
        'Head Zero Position Y Collect Result_Dam',
        'Head Zero Position Z Collect Result_Dam',
        'Machine Tact time Collect Result_Dam',
        'PalletID Collect Result_Dam',
        'Production Qty Collect Result_Dam',
        'Receip No Collect Result_Dam',
        'Stage1 Circle1 Distance Speed Collect Result_Dam',
        'Stage1 Circle2 Distance Speed Collect Result_Dam',
        'Stage1 Circle3 Distance Speed Collect Result_Dam',
        'Stage1 Circle4 Distance Speed Collect Result_Dam',
        'Stage1 Line1 Distance Speed Collect Result_Dam',
        'Stage1 Line2 Distance Speed Collect Result_Dam',
        'Stage1 Line3 Distance Speed Collect Result_Dam',
        'Stage1 Line4 Distance Speed Collect Result_Dam',
        'Stage2 Circle1 Distance Speed Collect Result_Dam',
        'Stage2 Circle2 Distance Speed Collect Result_Dam',
        'Stage2 Circle3 Distance Speed Collect Result_Dam',
        'Stage2 Circle4 Distance Speed Collect Result_Dam',
        'Stage2 Line1 Distance Speed Collect Result_Dam',
        'Stage2 Line2 Distance Speed Collect Result_Dam',
        'Stage2 Line3 Distance Speed Collect Result_Dam',
        'Stage2 Line4 Distance Speed Collect Result_Dam',
        'Stage3 Circle1 Distance Speed Collect Result_Dam',
        'Stage3 Circle2 Distance Speed Collect Result_Dam',
        'Stage3 Circle3 Distance Speed Collect Result_Dam',
        'Stage3 Circle4 Distance Speed Collect Result_Dam',
        'Stage3 Line1 Distance Speed Collect Result_Dam',
        'Stage3 Line2 Distance Speed Collect Result_Dam',
        'Stage3 Line3 Distance Speed Collect Result_Dam',
        'Stage3 Line4 Distance Speed Collect Result_Dam',
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
        'WorkMode Collect Result_Dam',
    ]

    fill1_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
        'HEAD Standby Position X Collect Result_Fill1',
        'HEAD Standby Position Y Collect Result_Fill1',
        'HEAD Standby Position Z Collect Result_Fill1',
        'Head Clean Position X Collect Result_Fill1',
        'Head Clean Position Y Collect Result_Fill1',
        'Head Clean Position Z Collect Result_Fill1',
        'Head Purge Position X Collect Result_Fill1',
        'Head Purge Position Y Collect Result_Fill1',
        'Head Purge Position Z Collect Result_Fill1',
        'Machine Tact time Collect Result_Fill1',
        'PalletID Collect Result_Fill1',
        'Production Qty Collect Result_Fill1',
        'Receip No Collect Result_Fill1',
        'WorkMode Collect Result_Fill1',
    ]

    fill2_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
        'HEAD Standby Position X Collect Result_Fill2',
        'HEAD Standby Position Y Collect Result_Fill2',
        'HEAD Standby Position Z Collect Result_Fill2',
        'Head Clean Position X Collect Result_Fill2',
        'Head Clean Position Y Collect Result_Fill2',
        'Head Clean Position Z Collect Result_Fill2',
        'Head Purge Position X Collect Result_Fill2',
        'Head Purge Position Y Collect Result_Fill2',
        'Head Purge Position Z Collect Result_Fill2',
        'Machine Tact time Collect Result_Fill2',
        'PalletID Collect Result_Fill2',
        'Production Qty Collect Result_Fill2',
        'Receip No Collect Result_Fill2',
        'WorkMode Collect Result_Fill2',
    ]

    model_cols = [
        'Model.Suffix_Dam',
        'Model.Suffix_AutoClave',
        'Model.Suffix_Fill1',
        'Model.Suffix_Fill2'
    ]
    
    workorder_cols = [
        'Workorder_Dam',
        'Workorder_AutoClave',
        'Workorder_Fill1',
        'Workorder_Fill2',
    ]
    
    condition = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace("OK", np.nan).isna()

    data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
    data.loc[condition, fill1_cols] = data[condition][fill1_cols].shift(-1, axis=1)[fill1_cols]
    data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]

    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].astype(np.float64)
    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].astype(np.float64)
    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].astype(np.float64)
    
    data['Model'] = data[model_cols[0]]
    data['Workorder'] = data[workorder_cols[0]]
    data['Chamber Temp. Judge Value_AutoClave'] = (data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
    
    data= data.drop(mixed_columns + workmode_cols + model_cols + workorder_cols, axis=1)
    
    
    return data

train_data = data_recovery(train_data)
test_data = data_recovery(test_data)

  data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
  data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]
  data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
  data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]


In [4]:
train_data["Workorder 별 실패율"] = train_data.groupby('Workorder')['target'].mean()
test_data["Workorder 별 실패율"] = test_data['Workorder'].map(train_data.groupby('Workorder')['target'].mean())

train_data['Equipment'] = train_data[['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']].apply(lambda x: '_'.join(x), axis=1)
train_data["Equipment 별 실패율"] = train_data.groupby('Equipment')['target'].mean()
test_data['Equipment'] = test_data[['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']].apply(lambda x: '_'.join(x), axis=1)
test_data["Equipment 별 실패율"] = test_data['Workorder'].map(train_data.groupby('Equipment')['target'].mean())

train_data["Receip 별 실패율"] = train_data.groupby('Receip No Collect Result_Dam')['target'].mean()
test_data["Receip 별 실패율"] = test_data['Receip No Collect Result_Dam'].map(train_data.groupby('Receip No Collect Result_Dam')['target'].mean())

In [5]:
def preprocess(data):
        
    # Workorder 분리
    # Function to split the text and handle the suffix conversion
    def split_and_convert(text):    
        # Find the numeric suffix (assuming it's after the last '-')
        prefix, suffix = text.split('-')

        # Convert the suffix to an integer
        suffix_int = str(int(suffix))

        # Split the prefix into individual characters
        prefix_chars = list(prefix)

        # Add the integer suffix to the list
        prefix_chars.append(suffix_int)

        return pd.Series(prefix_chars)

    tmp = data['Workorder'].apply(split_and_convert)
    tmp.columns = [f'char_{i+1}' for i in range(tmp.shape[1])]
    data = pd.concat([data, tmp], axis=1)
#     data = data.drop('Workorder', axis=1)


    # 작업 시간 누적 합계: 모든 공정의 작업 시간 합계
    data['Total_Tact_Time'] = (
        data['Machine Tact time Collect Result_Fill1'] + 
        data['Machine Tact time Collect Result_Dam'] +
        data['Machine Tact time Collect Result_Fill2'] + 
        data['Chamber Temp. Unit Time_AutoClave']
    )
    data["시간당 생산 제품수"] = data['Production Qty Collect Result_Dam'] / data['Total_Tact_Time']
    

    # 공정별 제품수 일치여부
    data['공정별 제품수 일치여부'] = (data[[
        'Production Qty Collect Result_Dam',
        'Production Qty Collect Result_Fill1',
        'Production Qty Collect Result_Fill2',
    ]].duplicated(keep=False)).astype(int)
    data['Dam to Fill Qty 차이'] = data['Production Qty Collect Result_Dam'] - data['Production Qty Collect Result_Fill1']
    data['Fill to Cure Qty 차이'] = data['Production Qty Collect Result_Fill1'] - data['Production Qty Collect Result_Fill2']
        
    
    # PalletID 일치여부
    data[~data[[
        'PalletID Collect Result_Dam',
        'PalletID Collect Result_Fill1',
        'PalletID Collect Result_Fill2',
    ]].duplicated(keep=False)]
    data = data.drop([
        'PalletID Collect Result_Fill1',
        'PalletID Collect Result_Fill2',
    ], axis=1)
    
    scaler = RobustScaler()
    data[[
        'Machine Tact time Collect Result_Fill1',
        'Machine Tact time Collect Result_Dam',
        'Machine Tact time Collect Result_Fill2'
    ]] = scaler.fit_transform(data[[
        'Machine Tact time Collect Result_Fill1',
        'Machine Tact time Collect Result_Dam',
        'Machine Tact time Collect Result_Fill2'
    ]])
    
    data['첫 번째 압력량'] = data['1st Pressure Collect Result_AutoClave'] * data['1st Pressure 1st Pressure Unit Time_AutoClave']
    data['두 번째 압력량'] = data['2nd Pressure Collect Result_AutoClave'] * data['2nd Pressure Unit Time_AutoClave']
    data['세 번째 압력량'] = data['3rd Pressure Collect Result_AutoClave'] * data['3rd Pressure Unit Time_AutoClave']
    data['쳄버 열용량'] = data['Chamber Temp. Collect Result_AutoClave'] * data['Chamber Temp. Unit Time_AutoClave']

    data['Dam Stage1 시간당 분사량'] = data['Dispense Volume(Stage1) Collect Result_Dam'] / (data['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] + 1e-6)
    data['Dam Stage1 분사효율'] = 1 - data['Dam Stage1 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Dam']
    data['Dam Stage1 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam'] * data['DISCHARGED SPEED OF RESIN Collect Result_Dam'] - data['Dispense Volume(Stage1) Collect Result_Dam']
    data['Dam Stage2 시간당 분사량'] = data['Dispense Volume(Stage2) Collect Result_Dam'] / (data['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] + 1e-6)
    data['Dam Stage2 분사효율'] = 1 - data['Dam Stage2 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Dam']
    data['Dam Stage2 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam'] * data['DISCHARGED SPEED OF RESIN Collect Result_Dam'] - data['Dispense Volume(Stage2) Collect Result_Dam']
    data['Dam Stage3 시간당 분사량'] = data['Dispense Volume(Stage3) Collect Result_Dam'] / (data['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] + 1e-6)
    data['Dam Stage3 분사효율'] = 1 - data['Dam Stage3 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Dam']
    data['Dam Stage3 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam'] * data['DISCHARGED SPEED OF RESIN Collect Result_Dam'] - data['Dispense Volume(Stage3) Collect Result_Dam']
    
    data['Fill1 Stage1 시간당 분사량'] = data['Dispense Volume(Stage1) Collect Result_Fill1'] / (data['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'] + 1e-6)
    data['Fill1 Stage1 분사효율'] = 1 - data['Fill1 Stage1 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    data['Fill1 Stage1 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1'] * data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] - data['Dispense Volume(Stage1) Collect Result_Fill1']
    data['Fill1 Stage2 시간당 분사량'] = data['Dispense Volume(Stage2) Collect Result_Fill1'] / (data['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] + 1e-6)
    data['Fill1 Stage2 분사효율'] = 1 - data['Fill1 Stage2 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    data['Fill1 Stage2 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1'] * data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] - data['Dispense Volume(Stage2) Collect Result_Fill1']
    data['Fill1 Stage3 시간당 분사량'] = data['Dispense Volume(Stage3) Collect Result_Fill1'] / (data['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] + 1e-6)
    data['Fill1 Stage3 분사효율'] = 1 - data['Fill1 Stage3 시간당 분사량'] / 10 /  data['DISCHARGED SPEED OF RESIN Collect Result_Fill1']
    data['Fill1 Stage3 손실 분사량'] = data['DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1'] * data['DISCHARGED SPEED OF RESIN Collect Result_Fill1'] - data['Dispense Volume(Stage3) Collect Result_Fill1']     
    
    data['Stage1 Line1 - Line2 속도 변화량'] = data['Stage1 Line2 Distance Speed Collect Result_Dam'] - data['Stage1 Line1 Distance Speed Collect Result_Dam']
    data['Stage1 Line2 - Line3 속도 변화량'] = data['Stage1 Line3 Distance Speed Collect Result_Dam'] - data['Stage1 Line2 Distance Speed Collect Result_Dam']
    data['Stage1 Line3 - Line4 속도 변화량'] = data['Stage1 Line4 Distance Speed Collect Result_Dam'] - data['Stage1 Line3 Distance Speed Collect Result_Dam']
    
    data['Stage2 Line1 - Line2 속도 변화량'] = data['Stage2 Line2 Distance Speed Collect Result_Dam'] - data['Stage2 Line1 Distance Speed Collect Result_Dam']
    data['Stage2 Line2 - Line3 속도 변화량'] = data['Stage2 Line3 Distance Speed Collect Result_Dam'] - data['Stage2 Line2 Distance Speed Collect Result_Dam']
    data['Stage2 Line3 - Line4 속도 변화량'] = data['Stage2 Line4 Distance Speed Collect Result_Dam'] - data['Stage2 Line3 Distance Speed Collect Result_Dam']
    
    data['Stage3 Line1 - Line2 속도 변화량'] = data['Stage3 Line2 Distance Speed Collect Result_Dam'] - data['Stage3 Line1 Distance Speed Collect Result_Dam']
    data['Stage3 Line2 - Line3 속도 변화량'] = data['Stage3 Line3 Distance Speed Collect Result_Dam'] - data['Stage3 Line2 Distance Speed Collect Result_Dam']
    data['Stage3 Line3 - Line4 속도 변화량'] = data['Stage3 Line4 Distance Speed Collect Result_Dam'] - data['Stage3 Line3 Distance Speed Collect Result_Dam']
    
    data['두께 정상 여부'] = (data[[
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
    ]]==0).any(axis=1).astype(int)
    
    data['두께 간격'] = data[[
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
    ]].max() - data[[
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
    ]].min()
    
    return data

train_data = preprocess(train_data)
test_data = preprocess(test_data)

In [6]:
X_train = train_data.drop('target', axis=1)
cols_to_drop = X_train.columns[X_train.nunique(dropna=False) <= 1]
X_train = X_train.drop(cols_to_drop, axis=1)

y_train = train_data['target']
X_test = test_data[X_train.columns]

In [7]:
X_train.describe()

Unnamed: 0,CURE END POSITION X Collect Result_Dam,CURE END POSITION Z Collect Result_Dam,CURE END POSITION Θ Collect Result_Dam,CURE SPEED Collect Result_Dam,CURE START POSITION X Collect Result_Dam,CURE START POSITION Θ Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,...,Stage1 Line1 - Line2 속도 변화량,Stage1 Line2 - Line3 속도 변화량,Stage1 Line3 - Line4 속도 변화량,Stage2 Line1 - Line2 속도 변화량,Stage2 Line2 - Line3 속도 변화량,Stage2 Line3 - Line4 속도 변화량,Stage3 Line1 - Line2 속도 변화량,Stage3 Line2 - Line3 속도 변화량,Stage3 Line3 - Line4 속도 변화량,두께 정상 여부
count,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,...,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0,40506.0
mean,530.370809,6.320669,-21.227966,73.864119,743.449859,-21.227966,12.882437,14.272249,6.861304,14.262638,...,7.56678,-28.608107,28.608107,63.38814,-65.980349,65.980349,7.72725,-8.862885,8.862885,0.874414
std,369.283055,4.858988,87.461776,9.761997,364.424068,87.461776,2.99765,3.743857,1.994393,3.733232,...,112.69617,124.149637,124.149637,256.957242,248.627434,248.627434,107.957822,110.47118,110.47118,0.331387
min,240.0,2.5,-90.0,70.0,280.0,-90.0,10.0,9.6,3.8,9.6,...,-6500.0,-2000.0,0.0,-6500.0,-1500.0,0.0,0.0,-2000.0,0.0,0.0
25%,240.0,2.5,-90.0,70.0,280.0,-90.0,10.0,13.1,4.9,13.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,240.0,2.5,-90.0,70.0,1030.0,-90.0,10.0,13.2,6.7,13.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1000.0,12.5,90.0,70.0,1030.0,90.0,16.0,17.0,8.4,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1000.0,12.5,90.0,105.0,1030.0,90.0,16.0,21.3,10.6,21.4,...,2000.0,0.0,2000.0,1500.0,0.0,1500.0,2000.0,0.0,2000.0,1.0


In [8]:
# `object` 타입의 컬럼을 `category` 타입으로 변환
object_columns = X_train.select_dtypes(include=['object']).columns.tolist()
X_train[object_columns] = X_train[object_columns].astype(str).astype('category')
X_test[object_columns] = X_test[object_columns].astype(str).astype('category')
print(object_columns)

['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2', 'Model', 'Workorder', 'Equipment', 'char_1', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[object_columns] = X_test[object_columns].astype(str).astype('category')


In [9]:
# 기본 설정
n_splits = 10

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

In [10]:
def f1_metric(y_pred, data, threshold=0.5):
    y_true = data.get_label()
    y_pred = (y_pred >= threshold).astype(int)  # 임계값 적용
    return 'f1', f1_score(y_true, y_pred), True

scores = []
models = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature=object_columns, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, categorical_feature=object_columns, reference=train_data, free_raw_data=False)

    params = {
        "objective": "binary",  # 이진 분류의 경우
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "random_state": random_state,
        "verbose": -1,
        "metric": "None",  # 기본 메트릭을 사용하지 않도록 설정
        'importance_type': "gain",
        "n_jobs":-1,
        "class_weight":"balanced"
    }

    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),
        num_boost_round=4000,  # 부스팅 라운드 수
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),  # F1 스코어를 기준으로 조기 종료 설정
            lgb.log_evaluation(500)  # 학습 로그 출력 주기
        ]
    )

    models.append(model)
    scores.append(model.best_score['valid_0']['f1'])  # F1 스코어 저장

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.227106
[1000]	valid_0's f1: 0.215827
Early stopping, best iteration is:
[305]	valid_0's f1: 0.228782
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.1673
[1000]	valid_0's f1: 0.175182
[1500]	valid_0's f1: 0.172662
Early stopping, best iteration is:
[726]	valid_0's f1: 0.179104
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.15444
[1000]	valid_0's f1: 0.169742
[1500]	valid_0's f1: 0.172043
[2000]	valid_0's f1: 0.185053
[2500]	valid_0's f1: 0.175439
Early stopping, best iteration is:
[1928]	valid_0's f1: 0.185714
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.166038
[1000]	valid_0's f1: 0.157895
Early stopping, best iteration is:
[493]	valid_0's f1: 0.172932
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.152672
[1000]	valid_0's f1: 0.141791
[1500]	valid_0's

In [11]:
# 각 모델별로 최적의 threshold와 관련된 정보를 저장할 리스트 초기화
thresholds = np.arange(-0.01, 1.0, 0.01)
best_thresholds = []
fold_best_scores = []

# 전체 데이터를 위한 결과 저장용 DataFrame
results_df = pd.DataFrame()

# 각 threshold에 대해 f1_score 계산
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_val = X_train.iloc[valid_index]
    y_val = y_train.iloc[valid_index]
    
    best_threshold = 0
    best_score = 0
    
    pred_proba = models[fold].predict(X_val)
    
    for threshold in thresholds:
        # Threshold 적용하여 클래스 결정
        pred = np.where(pred_proba >= threshold, 1, 0)
        
        # f1_score 계산
        score = f1_score(y_val, pred)
        
        # 최적의 threshold 업데이트
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    # 해당 Fold에 대한 최적의 threshold와 F1 스코어 저장
    best_thresholds.append(best_threshold)
    fold_best_scores.append(best_score)
    
    # 예측 결과 저장
    fold_result = pd.DataFrame({
        'fold': fold + 1,
        'true_label': y_val.values,
        'pred_proba': pred_proba,
        'pred_label': np.where(pred_proba >= best_threshold, 1, 0),
        'threshold': best_threshold
    })
    results_df = pd.concat([results_df, fold_result], axis=0)

# 각 Fold별 최적의 threshold와 F1 스코어 출력
print(f"\nBest Thresholds per fold: {best_thresholds}")
print(f"Best F1 Scores per fold: {fold_best_scores}")
print(f"Mean F1 Score : {np.mean(fold_best_scores)}")

# 예측 실패 사례 분석
misclassified_df = results_df[results_df['true_label'] != results_df['pred_label']]
pd.concat([misclassified_df, X_train.iloc[misclassified_df.index]], axis=1).to_csv("missed.csv")

print(f"\nMisclassified samples at each fold's best threshold:")
display(misclassified_df)


Best Thresholds per fold: [0.15, 0.15, 0.25, 0.22999999999999998, 0.18, 0.1, 0.24, 0.13999999999999999, 0.21, 0.41]
Best F1 Scores per fold: [0.2671905697445972, 0.21138211382113822, 0.19767441860465115, 0.257372654155496, 0.2297650130548303, 0.21556886227544908, 0.27956989247311825, 0.22641509433962265, 0.2349869451697128, 0.24666666666666667]
Mean F1 Score : 0.23665922303052822

Misclassified samples at each fold's best threshold:


Unnamed: 0,fold,true_label,pred_proba,pred_label,threshold
22,1,0,0.157002,1,0.15
35,1,0,0.184646,1,0.15
56,1,0,0.280242,1,0.15
63,1,0,0.215760,1,0.15
97,1,0,0.183196,1,0.15
...,...,...,...,...,...
3987,10,1,0.060541,0,0.41
4003,10,1,0.244448,0,0.41
4022,10,1,0.021489,0,0.41
4029,10,1,0.228825,0,0.41


In [12]:
# Test 데이터에 대해 최적의 threshold를 적용한 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    pred_proba = models[fold].predict(X_test)
    
    # 각 fold에 해당하는 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    pred = np.where(pred_proba >= best_thresholds[fold], 1, 0)
    
    # 예측 결과 저장
    final_predictions.append(pred)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출 (평균)
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= 0.5, 1, 0)  # 평균이 0.5 이상인 경우 1로 결정

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[0 0 0 ... 0 0 0]


In [13]:
feature_importance = models[0].feature_importance()
feature_names = X_train.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

importance_df

Unnamed: 0,Feature,Importance
0,Equipment_Dam,3
1,CURE END POSITION X Collect Result_Dam,0
2,CURE END POSITION Z Collect Result_Dam,0
3,CURE END POSITION Θ Collect Result_Dam,0
4,CURE SPEED Collect Result_Dam,5
...,...,...
151,Stage2 Line3 - Line4 속도 변화량,0
152,Stage3 Line1 - Line2 속도 변화량,0
153,Stage3 Line2 - Line3 속도 변화량,0
154,Stage3 Line3 - Line4 속도 변화량,0


In [14]:
importance_df.sort_values(by='Importance', ascending=False).head(50)

Unnamed: 0,Feature,Importance
107,Workorder,1557
120,시간당 생산 제품수,636
29,Production Qty Collect Result_Dam,624
104,Machine Tact time Collect Result_Fill2,557
28,PalletID Collect Result_Dam,537
119,Total_Tact_Time,408
58,1st Pressure Collect Result_AutoClave,369
27,Machine Tact time Collect Result_Dam,354
85,Machine Tact time Collect Result_Fill1,349
124,첫 번째 압력량,303


In [15]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)