# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
import lightgbm as lgb
import shap
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
random_seed = 110

# Load data
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]
train_data.head()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,Normal
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,Normal
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,Normal
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,Normal
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,Normal


In [3]:
mixed_columns = [
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam",
    "Stage1 Circle1 Distance Speed Collect Result_Dam",
    "THICKNESS 1 Collect Result_Dam",
    "WorkMode Collect Result_Dam",
    "GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave",
    "GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2"
]

train_data = train_data.drop(mixed_columns, axis=1)
test_data = test_data.drop(mixed_columns, axis=1)

In [4]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

def perform_minibatch_kmeans_clustering(data, n_clusters=3, batch_size=1000):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=42)
    return kmeans.fit_predict(data)

def apply_minibatch_kmeans_clustering_to_data(train_data, test_data, feature_columns, n_clusters=3, batch_size=1000):
    scaler = StandardScaler()

    # 훈련 데이터에 대해 클러스터링
    train_data_scaled = scaler.fit_transform(train_data[feature_columns].dropna())
    train_labels = perform_minibatch_kmeans_clustering(train_data_scaled, n_clusters, batch_size)
    train_labels = train_labels.astype(str)
    
    # 테스트 데이터에 대해 동일한 스케일링 적용 후 클러스터링
    test_data_scaled = scaler.transform(test_data[feature_columns].dropna())
    test_labels = perform_minibatch_kmeans_clustering(test_data_scaled, n_clusters, batch_size)
    test_labels = test_labels.astype(str)
    
    return train_labels, test_labels

# 각 그룹별 클러스터 수 정의
clusters_dict = {
    'CURE END POSITION Collect Result_Dam': 2,
    'CURE STANDBY POSITION Collect Result_Dam': 1,
    'CURE START POSITION Collect Result_Dam': 2,
    'HEAD Standby Position Collect Result_Dam': 3,
    'Head Clean Position Collect Result_Dam': 4,
    'Head Purge Position Collect Result_Dam': 6,
    'Head Zero Position Collect Result_Dam': 4,
    'HEAD Standby Position Collect Result_Fill1': 3,
    'Head Clean Position Collect Result_Fill1': 2,
    'Head Purge Position Collect Result_Fill1': 3,
    'CURE END POSITION Collect Result_Fill2': 4,
    'CURE STANDBY POSITION Collect Result_Fill2': 4,
    'CURE START POSITION Collect Result_Fill2': 5,
    'HEAD Standby Position Collect Result_Fill2': 2,
    'Head Clean Position Collect Result_Fill2': 2,
    'Head Purge Position Collect Result_Fill2': 2,
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Dam': 6,
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Dam': 6,
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Fill1': 7,
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Fill1': 5,
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Fill2': 4,
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Fill2': 4,
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Dam' :4,
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill1' :7,
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill2' :4,
}



# 클러스터링 수행을 위한 그룹 정의
group_features = {
    'CURE END POSITION Collect Result_Dam': ['CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam', 'CURE END POSITION Θ Collect Result_Dam'],
    'CURE STANDBY POSITION Collect Result_Dam': ['CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam', 'CURE STANDBY POSITION Θ Collect Result_Dam'],
    'CURE START POSITION Collect Result_Dam': ['CURE START POSITION X Collect Result_Dam', 'CURE START POSITION Z Collect Result_Dam', 'CURE START POSITION Θ Collect Result_Dam'],
    'HEAD Standby Position Collect Result_Dam': ['HEAD Standby Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam'],
    'Head Clean Position Collect Result_Dam': ['Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Clean Position Z Collect Result_Dam'],
    'Head Purge Position Collect Result_Dam': ['Head Purge Position X Collect Result_Dam', 'Head Purge Position Y Collect Result_Dam', 'Head Purge Position Z Collect Result_Dam'],
    'Head Zero Position Collect Result_Dam': ['Head Zero Position X Collect Result_Dam', 'Head Zero Position Y Collect Result_Dam', 'Head Zero Position Z Collect Result_Dam'],
    'HEAD Standby Position Collect Result_Fill1': ['HEAD Standby Position X Collect Result_Fill1', 'HEAD Standby Position Y Collect Result_Fill1', 'HEAD Standby Position Z Collect Result_Fill1'],
    'Head Clean Position Collect Result_Fill1': ['Head Clean Position X Collect Result_Fill1', 'Head Clean Position Y Collect Result_Fill1', 'Head Clean Position Z Collect Result_Fill1'],
    'Head Purge Position Collect Result_Fill1': ['Head Purge Position X Collect Result_Fill1', 'Head Purge Position Y Collect Result_Fill1', 'Head Purge Position Z Collect Result_Fill1'],
    'CURE END POSITION Collect Result_Fill2': ['CURE END POSITION X Collect Result_Fill2', 'CURE END POSITION Z Collect Result_Fill2', 'CURE END POSITION Θ Collect Result_Fill2'],
    'CURE STANDBY POSITION Collect Result_Fill2': ['CURE STANDBY POSITION X Collect Result_Fill2', 'CURE STANDBY POSITION Z Collect Result_Fill2', 'CURE STANDBY POSITION Θ Collect Result_Fill2'],
    'CURE START POSITION Collect Result_Fill2': ['CURE START POSITION X Collect Result_Fill2', 'CURE START POSITION Z Collect Result_Fill2', 'CURE START POSITION Θ Collect Result_Fill2'],
    'HEAD Standby Position Collect Result_Fill2': ['HEAD Standby Position X Collect Result_Fill2', 'HEAD Standby Position Y Collect Result_Fill2', 'HEAD Standby Position Z Collect Result_Fill2'],
    'Head Clean Position Collect Result_Fill2': ['Head Clean Position X Collect Result_Fill2', 'Head Clean Position Y Collect Result_Fill2', 'Head Clean Position Z Collect Result_Fill2'],
    'Head Purge Position Collect Result_Fill2': ['Head Purge Position X Collect Result_Fill2', 'Head Purge Position Y Collect Result_Fill2', 'Head Purge Position Z Collect Result_Fill2'],
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Dam' :['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam','HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam','HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',],
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Dam' :['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam','HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam','HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',],
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Fill1' :['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1','HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1','HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',],
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Fill1' :['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1','HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1','HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',],
    'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Fill2' :['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2','HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2','HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',],
    'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Fill2' :['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2','HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2','HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',],

    # X aixs is removed
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Dam' :['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam','HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',],
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill1' :['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1','HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',],
    'HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill2' :['HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2','HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',],
}

# 클러스터링 수행 및 결과 추가
for group_name, feature_columns in group_features.items():
    n_clusters = clusters_dict[group_name]
    
    # 클러스터링 적용
    train_data[f'{group_name}_CLUSTER'], test_data[f'{group_name}_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(
        train_data, 
        test_data, 
        feature_columns, 
        n_clusters=n_clusters
    )

In [5]:
[x for x in train_data.columns if 'CLUSTER' in x]

['CURE END POSITION Collect Result_Dam_CLUSTER',
 'CURE STANDBY POSITION Collect Result_Dam_CLUSTER',
 'CURE START POSITION Collect Result_Dam_CLUSTER',
 'HEAD Standby Position Collect Result_Dam_CLUSTER',
 'Head Clean Position Collect Result_Dam_CLUSTER',
 'Head Purge Position Collect Result_Dam_CLUSTER',
 'Head Zero Position Collect Result_Dam_CLUSTER',
 'HEAD Standby Position Collect Result_Fill1_CLUSTER',
 'Head Clean Position Collect Result_Fill1_CLUSTER',
 'Head Purge Position Collect Result_Fill1_CLUSTER',
 'CURE END POSITION Collect Result_Fill2_CLUSTER',
 'CURE STANDBY POSITION Collect Result_Fill2_CLUSTER',
 'CURE START POSITION Collect Result_Fill2_CLUSTER',
 'HEAD Standby Position Collect Result_Fill2_CLUSTER',
 'Head Clean Position Collect Result_Fill2_CLUSTER',
 'Head Purge Position Collect Result_Fill2_CLUSTER',
 'HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Dam_CLUSTER',
 'HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Dam_CLUSTER',
 'HEAD NORMAL COORDINATE AX

In [6]:
# CURE_PROCESS_DAM 생성
cure_dam_cols = [
    'CURE START POSITION Collect Result_Dam_CLUSTER',
    'CURE STANDBY POSITION Collect Result_Dam_CLUSTER',
    'CURE END POSITION Collect Result_Dam_CLUSTER'
]

train_data['CURE_PROCESS_DAM'] = train_data[cure_dam_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_data['CURE_PROCESS_DAM'] = test_data[cure_dam_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# HEAD_PROCESS_DAM 생성
head_dam_cols = [
    'HEAD Standby Position Collect Result_Dam_CLUSTER',
    'Head Clean Position Collect Result_Dam_CLUSTER',
    'Head Purge Position Collect Result_Dam_CLUSTER',
    'Head Zero Position Collect Result_Dam_CLUSTER'
]

train_data['HEAD_PROCESS_DAM'] = train_data[head_dam_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_data['HEAD_PROCESS_DAM'] = test_data[head_dam_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# HEAD_PROCESS_FILL1 생성
head_fill1_cols = [
    'HEAD Standby Position Collect Result_Fill1_CLUSTER',
    'Head Clean Position Collect Result_Fill1_CLUSTER',
    'Head Purge Position Collect Result_Fill1_CLUSTER'
]

train_data['HEAD_PROCESS_FILL1'] = train_data[head_fill1_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_data['HEAD_PROCESS_FILL1'] = test_data[head_fill1_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# CURE_PROCESS_FILL2 생성
cure_fill2_cols = [
    'CURE START POSITION Collect Result_Fill2_CLUSTER',
    'CURE STANDBY POSITION Collect Result_Fill2_CLUSTER',
    'CURE END POSITION Collect Result_Fill2_CLUSTER'
]

train_data['CURE_PROCESS_FILL2'] = train_data[cure_fill2_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_data['CURE_PROCESS_FILL2'] = test_data[cure_fill2_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# HEAD_PROCESS_FILL2 생성
head_fill2_cols = [
    'HEAD Standby Position Collect Result_Fill2_CLUSTER',
    'Head Clean Position Collect Result_Fill2_CLUSTER',
    'Head Purge Position Collect Result_Fill2_CLUSTER'
]

train_data['HEAD_PROCESS_FILL2'] = train_data[head_fill2_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)
test_data['HEAD_PROCESS_FILL2'] = test_data[head_fill2_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

# 사용된 클러스터링 컬럼 삭제
target_cols = cure_dam_cols + head_dam_cols + head_fill1_cols + cure_fill2_cols + head_fill2_cols
train_data = train_data.drop(target_cols, axis=1)
test_data = test_data.drop(target_cols, axis=1)

In [7]:
# # 두 점 사이의 거리를 계산하는 함수
# def calculate_distance(x1, y1, z1, x2, y2, z2):
#     return np.sqrt((x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2)

# # 1. Head Standby -> Head Clean Distance
# train_data['HEAD_Standby_to_Clean_Distance'] = calculate_distance(
#     train_data['HEAD Standby Position X Collect Result_Dam'],
#     train_data['HEAD Standby Position Y Collect Result_Dam'],
#     train_data['HEAD Standby Position Z Collect Result_Dam'],
#     train_data['Head Clean Position X Collect Result_Dam'],
#     train_data['Head Clean Position Y Collect Result_Dam'],
#     train_data['Head Clean Position Z Collect Result_Dam']
# )

# test_data['HEAD_Standby_to_Clean_Distance'] = calculate_distance(
#     test_data['HEAD Standby Position X Collect Result_Dam'],
#     test_data['HEAD Standby Position Y Collect Result_Dam'],
#     test_data['HEAD Standby Position Z Collect Result_Dam'],
#     test_data['Head Clean Position X Collect Result_Dam'],
#     test_data['Head Clean Position Y Collect Result_Dam'],
#     test_data['Head Clean Position Z Collect Result_Dam']
# )

# # 2. Head Clean -> Head Purge Distance
# train_data['HEAD_Clean_to_Purge_Distance'] = calculate_distance(
#     train_data['Head Clean Position X Collect Result_Dam'],
#     train_data['Head Clean Position Y Collect Result_Dam'],
#     train_data['Head Clean Position Z Collect Result_Dam'],
#     train_data['Head Purge Position X Collect Result_Dam'],
#     train_data['Head Purge Position Y Collect Result_Dam'],
#     train_data['Head Purge Position Z Collect Result_Dam']
# )

# test_data['HEAD_Clean_to_Purge_Distance'] = calculate_distance(
#     test_data['Head Clean Position X Collect Result_Dam'],
#     test_data['Head Clean Position Y Collect Result_Dam'],
#     test_data['Head Clean Position Z Collect Result_Dam'],
#     test_data['Head Purge Position X Collect Result_Dam'],
#     test_data['Head Purge Position Y Collect Result_Dam'],
#     test_data['Head Purge Position Z Collect Result_Dam']
# )

# # 3. Head Purge -> Head Zero Distance
# train_data['HEAD_Purge_to_Zero_Distance'] = calculate_distance(
#     train_data['Head Purge Position X Collect Result_Dam'],
#     train_data['Head Purge Position Y Collect Result_Dam'],
#     train_data['Head Purge Position Z Collect Result_Dam'],
#     train_data['Head Zero Position X Collect Result_Dam'],
#     train_data['Head Zero Position Y Collect Result_Dam'],
#     train_data['Head Zero Position Z Collect Result_Dam']
# )

# test_data['HEAD_Purge_to_Zero_Distance'] = calculate_distance(
#     test_data['Head Purge Position X Collect Result_Dam'],
#     test_data['Head Purge Position Y Collect Result_Dam'],
#     test_data['Head Purge Position Z Collect Result_Dam'],
#     test_data['Head Zero Position X Collect Result_Dam'],
#     test_data['Head Zero Position Y Collect Result_Dam'],
#     test_data['Head Zero Position Z Collect Result_Dam']
# )

# # 4. Head Standby -> Head Zero Distance (Direct)
# train_data['HEAD_Standby_to_Zero_Distance'] = calculate_distance(
#     train_data['HEAD Standby Position X Collect Result_Dam'],
#     train_data['HEAD Standby Position Y Collect Result_Dam'],
#     train_data['HEAD Standby Position Z Collect Result_Dam'],
#     train_data['Head Zero Position X Collect Result_Dam'],
#     train_data['Head Zero Position Y Collect Result_Dam'],
#     train_data['Head Zero Position Z Collect Result_Dam']
# )

# test_data['HEAD_Standby_to_Zero_Distance'] = calculate_distance(
#     test_data['HEAD Standby Position X Collect Result_Dam'],
#     test_data['HEAD Standby Position Y Collect Result_Dam'],
#     test_data['HEAD Standby Position Z Collect Result_Dam'],
#     test_data['Head Zero Position X Collect Result_Dam'],
#     test_data['Head Zero Position Y Collect Result_Dam'],
#     test_data['Head Zero Position Z Collect Result_Dam']
# )

# # 두 점 사이의 거리를 계산하는 함수 (Theta 고려)
# def calculate_distance_with_theta(x1, z1, theta1, x2, z2, theta2):
#     # 각도 차이 계산 (360도를 기준으로 최소 차이 계산)
#     delta_theta = np.abs(theta2 - theta1)
#     delta_theta = np.minimum(delta_theta, 360 - delta_theta)
    
#     # 최종 거리 계산
#     return np.sqrt((x2 - x1)**2 + (z2 - z1)**2 + (delta_theta)**2)

# # 1. Cure Start -> Cure Standby Distance
# train_data['CURE_Start_to_Standby_Distance'] = calculate_distance_with_theta(
#     train_data['CURE START POSITION X Collect Result_Dam'],
#     train_data['CURE START POSITION Z Collect Result_Dam'],
#     train_data['CURE START POSITION Θ Collect Result_Dam'],
#     train_data['CURE STANDBY POSITION X Collect Result_Dam'],
#     train_data['CURE STANDBY POSITION Z Collect Result_Dam'],
#     train_data['CURE STANDBY POSITION Θ Collect Result_Dam']
# )

# test_data['CURE_Start_to_Standby_Distance'] = calculate_distance_with_theta(
#     test_data['CURE START POSITION X Collect Result_Dam'],
#     test_data['CURE START POSITION Z Collect Result_Dam'],
#     test_data['CURE START POSITION Θ Collect Result_Dam'],
#     test_data['CURE STANDBY POSITION X Collect Result_Dam'],
#     test_data['CURE STANDBY POSITION Z Collect Result_Dam'],
#     test_data['CURE STANDBY POSITION Θ Collect Result_Dam']
# )

# # 2. Cure Standby -> Cure End Distance
# train_data['CURE_Standby_to_End_Distance'] = calculate_distance_with_theta(
#     train_data['CURE STANDBY POSITION X Collect Result_Dam'],
#     train_data['CURE STANDBY POSITION Z Collect Result_Dam'],
#     train_data['CURE STANDBY POSITION Θ Collect Result_Dam'],
#     train_data['CURE END POSITION X Collect Result_Dam'],
#     train_data['CURE END POSITION Z Collect Result_Dam'],
#     train_data['CURE END POSITION Θ Collect Result_Dam']
# )

# test_data['CURE_Standby_to_End_Distance'] = calculate_distance_with_theta(
#     test_data['CURE STANDBY POSITION X Collect Result_Dam'],
#     test_data['CURE STANDBY POSITION Z Collect Result_Dam'],
#     test_data['CURE STANDBY POSITION Θ Collect Result_Dam'],
#     test_data['CURE END POSITION X Collect Result_Dam'],
#     test_data['CURE END POSITION Z Collect Result_Dam'],
#     test_data['CURE END POSITION Θ Collect Result_Dam']
# )

# # 3. Cure Start -> Cure End Distance (Direct)
# train_data['CURE_Start_to_End_Distance'] = calculate_distance_with_theta(
#     train_data['CURE START POSITION X Collect Result_Dam'],
#     train_data['CURE START POSITION Z Collect Result_Dam'],
#     train_data['CURE START POSITION Θ Collect Result_Dam'],
#     train_data['CURE END POSITION X Collect Result_Dam'],
#     train_data['CURE END POSITION Z Collect Result_Dam'],
#     train_data['CURE END POSITION Θ Collect Result_Dam']
# )

# test_data['CURE_Start_to_End_Distance'] = calculate_distance_with_theta(
#     test_data['CURE START POSITION X Collect Result_Dam'],
#     test_data['CURE START POSITION Z Collect Result_Dam'],
#     test_data['CURE START POSITION Θ Collect Result_Dam'],
#     test_data['CURE END POSITION X Collect Result_Dam'],
#     test_data['CURE END POSITION Z Collect Result_Dam'],
#     test_data['CURE END POSITION Θ Collect Result_Dam']
# )

In [8]:
for group, features in group_features.items():
    train_data = train_data.drop(features, axis=1)
    test_data = test_data.drop(features, axis=1)

In [9]:
cols_to_drop = train_data.columns[train_data.nunique(dropna=False) == 1]
train_data = train_data.drop(cols_to_drop, axis=1)

In [10]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Data columns (total 90 columns):
 #   Column                                                            Non-Null Count  Dtype  
---  ------                                                            --------------  -----  
 0   Equipment_Dam                                                     40506 non-null  object 
 1   Model.Suffix_Dam                                                  40506 non-null  object 
 2   Workorder_Dam                                                     40506 non-null  object 
 3   CURE SPEED Collect Result_Dam                                     40506 non-null  int64  
 4   DISCHARGED SPEED OF RESIN Collect Result_Dam                      40506 non-null  int64  
 5   DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam               40506 non-null  float64
 6   DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam               40506 non-null  float64
 7   DISCHARGED TIME OF RESIN(Stage3

In [11]:
train_data.head()

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,...,HEAD NORMAL COORDINATE AXIS(Stage2) Collect Result_Fill2_CLUSTER,HEAD NORMAL COORDINATE AXIS(Stage3) Collect Result_Fill2_CLUSTER,HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Dam_CLUSTER,HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill1_CLUSTER,HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill2_CLUSTER,CURE_PROCESS_DAM,HEAD_PROCESS_DAM,HEAD_PROCESS_FILL1,CURE_PROCESS_FILL2,HEAD_PROCESS_FILL2
0,Dam dispenser #1,AJX75334505,4F1XA938-1,100,16,14.9,8.4,14.7,1.04,0.58,...,2,2,1,3,1,1_0_1,2_0_2_0,0_0_0,4_0_0,0_0_0
1,Dam dispenser #1,AJX75334505,3KPM0016-2,70,10,21.3,4.9,21.3,1.49,0.34,...,1,1,3,4,3,1_0_1,0_2_3_2,1_1_1,4_0_0,1_1_1
2,Dam dispenser #2,AJX75334501,4E1X9167-1,85,16,14.7,8.5,14.7,1.61,0.93,...,0,0,0,6,0,0_0_0,2_0_2_0,2_0_0,4_0_0,0_0_0
3,Dam dispenser #2,AJX75334501,3K1X0057-1,70,10,21.3,8.4,21.3,1.49,0.58,...,3,3,2,0,2,0_0_0,0_2_3_2,1_1_1,4_0_0,1_1_1
4,Dam dispenser #1,AJX75334501,3HPM0007-1,70,10,9.7,4.9,9.6,0.67,0.34,...,1,1,3,4,3,1_0_1,0_1_0_2,1_1_2,4_0_3,1_1_1


In [12]:
model_cols = [
    'Model.Suffix_Dam',
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2'
]

train_data['Model'] = train_data[model_cols[0]]
test_data['Model'] = test_data[model_cols[0]]

train_data = train_data.drop(model_cols, axis=1)
test_data = test_data.drop(model_cols, axis=1)

In [13]:
# judge_cols = [
#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
#     'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
#     'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
#     'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
# ]

# train_data['Judge'] = (train_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})
# test_data['Judge'] = (test_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})

# train_data = train_data.drop(judge_cols, axis=1)
# test_data = test_data.drop(judge_cols, axis=1)

In [14]:
workorder_cols = [
    'Workorder_Dam',
    'Workorder_AutoClave',
    'Workorder_Fill1',
    'Workorder_Fill2',
]

train_data['Workorder'] = train_data[workorder_cols[0]]
test_data['Workorder'] = test_data[workorder_cols[0]]

train_data = train_data.drop(workorder_cols, axis=1)
test_data = test_data.drop(workorder_cols, axis=1)

In [15]:
train_data['Chamber Temp. Judge Value_AutoClave'] = (train_data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
test_data['Chamber Temp. Judge Value_AutoClave'] = (test_data['Chamber Temp. Judge Value_AutoClave'] == "OK").replace({True: 1, False: 0})

In [16]:
target_cols = [
    'Equipment_Dam',
    'Equipment_Fill1',
    'Equipment_Fill2',
]

train_data['Equipment'] = train_data['Equipment_Dam'] + '_' + train_data['Equipment_Fill1'] + '_' + train_data['Equipment_Fill2']
test_data['Equipment'] = test_data['Equipment_Dam'] + '_' + test_data['Equipment_Fill1'] + '_' + test_data['Equipment_Fill2']
train_data = train_data.drop(target_cols, axis=1)
test_data = test_data.drop(target_cols, axis=1)

In [17]:
train_data['target'] = train_data['target'].map({'AbNormal':1, 'Normal':0})

In [18]:
train_data.head()

Unnamed: 0,CURE SPEED Collect Result_Dam,DISCHARGED SPEED OF RESIN Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam,DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam,Dispense Volume(Stage1) Collect Result_Dam,Dispense Volume(Stage2) Collect Result_Dam,Dispense Volume(Stage3) Collect Result_Dam,Machine Tact time Collect Result_Dam,PalletID Collect Result_Dam,...,HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill1_CLUSTER,HEAD NORMAL COORDINATE AXIS(Stage1) Collect Result_Fill2_CLUSTER,CURE_PROCESS_DAM,HEAD_PROCESS_DAM,HEAD_PROCESS_FILL1,CURE_PROCESS_FILL2,HEAD_PROCESS_FILL2,Model,Workorder,Equipment
0,100,16,14.9,8.4,14.7,1.04,0.58,1.02,265.0,58.5,...,3,1,1_0_1,2_0_2_0,0_0_0,4_0_0,0_0_0,AJX75334505,4F1XA938-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
1,70,10,21.3,4.9,21.3,1.49,0.34,1.49,65.1,7.0,...,4,3,1_0_1,0_2_3_2,1_1_1,4_0_0,1_1_1,AJX75334505,3KPM0016-2,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
2,85,16,14.7,8.5,14.7,1.61,0.93,1.61,265.02,76.9,...,6,0,0_0_0,2_0_2_0,2_0_0,4_0_0,0_0_0,AJX75334501,4E1X9167-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
3,70,10,21.3,8.4,21.3,1.49,0.58,1.49,70.5,12.0,...,0,2,0_0_0,0_2_3_2,1_1_1,4_0_0,1_1_1,AJX75334501,3K1X0057-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
4,70,10,9.7,4.9,9.6,0.67,0.34,0.67,88.3,8.0,...,4,3,1_0_1,0_1_0_2,1_1_2,4_0_3,1_1_1,AJX75334501,3HPM0007-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...


In [19]:
# 기본 설정
n_splits = 5

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

In [20]:
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data[X_train.columns]

In [21]:
object_columns = X_train.select_dtypes(include=['object']).columns.tolist()
object_columns.extend([x for x in X_train if "WORKMODE" in x.upper()])

In [22]:
# `object` 타입의 컬럼을 `category` 타입으로 변환
object_columns = X_train.select_dtypes(include=['object']).columns.tolist()
for col in object_columns:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [23]:
def f1_metric(y_pred, data, threshold=0.5):
    y_true = data.get_label()
    y_pred = (y_pred >= threshold).astype(int)  # 임계값 적용
    return 'f1', f1_score(y_true, y_pred), True

In [24]:
scores = []
models = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature=object_columns, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, categorical_feature=object_columns, reference=train_data, free_raw_data=False)

    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "num_boost_round": 4000,
        "random_state": 101,
        "verbose": -1,
        "metric": "None"  # 기본 메트릭을 사용하지 않도록 설정
    }

    # 1차 학습 - 조기 종료 적용
    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),
        num_boost_round=4000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),  # F1 스코어를 기준으로 조기 종료 설정
            lgb.log_evaluation(500)
        ]
    )

    # 최적의 iteration 수로 재학습
    best_model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),
        num_boost_round=model.best_iteration
    )
    
    models.append(best_model)
    scores.append(model.best_score['valid_0']['f1'])  # F1 스코어 저장

print(f"Best F1 Scores per fold: {scores}")
print(f"Mean F1 Score: {np.mean(scores)}")
print(scores)
print(np.mean(scores))

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.183824
[1000]	valid_0's f1: 0.201058
[1500]	valid_0's f1: 0.196891
[2000]	valid_0's f1: 0.191304
Early stopping, best iteration is:
[1389]	valid_0's f1: 0.207972
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.141002
[1000]	valid_0's f1: 0.148673
[1500]	valid_0's f1: 0.153043
[2000]	valid_0's f1: 0.166102
[2500]	valid_0's f1: 0.157534
[3000]	valid_0's f1: 0.159322
Early stopping, best iteration is:
[2224]	valid_0's f1: 0.167235
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.144444
[1000]	valid_0's f1: 0.145907
[1500]	valid_0's f1: 0.147217
Early stopping, best iteration is:
[901]	valid_0's f1: 0.153571
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.14882
[1000]	valid_0's f1: 0.149733
[1500]	valid_0's f1: 0.166957
[2000]	valid_0's f1: 0.161121
Early stopping, best iteration is:
[1435]	valid_0

In [25]:
# 각 모델별로 최적의 threshold와 관련된 정보를 저장할 리스트 초기화
thresholds = np.arange(0.0, 0.5, 0.02)
best_thresholds = []
fold_best_scores = []

# 전체 데이터를 위한 결과 저장용 DataFrame
results_df = pd.DataFrame()

# 각 threshold에 대해 f1_score 계산
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_val = X_train.iloc[valid_index]
    y_val = y_train.iloc[valid_index]
    
    best_threshold = 0
    best_score = 0
    
    for threshold in thresholds:
        # LightGBM 모델 예측 (확률값 반환)
        pred_proba = models[fold].predict(X_val)
        
        # Threshold 적용하여 클래스 결정
        pred = np.where(pred_proba >= threshold, 1, 0)
        
        # f1_score 계산
        score = f1_score(y_val, pred)
        
        # 최적의 threshold 업데이트
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    # 해당 Fold에 대한 최적의 threshold와 F1 스코어 저장
    best_thresholds.append(best_threshold)
    fold_best_scores.append(best_score)
    
    # 예측 결과 저장
    fold_result = pd.DataFrame({
        'fold': fold + 1,
        'true_label': y_val.values,
        'pred_proba': pred_proba,
        'pred_label': np.where(pred_proba >= best_threshold, 1, 0),
        'threshold': best_threshold
    })
    results_df = pd.concat([results_df, fold_result], axis=0)

# 각 Fold별 최적의 threshold와 F1 스코어 출력
print(f"\nBest Thresholds per fold: {best_thresholds}")
print(f"Best F1 Scores per fold: {fold_best_scores}")

# 예측 실패 사례 분석
misclassified_df = results_df[results_df['true_label'] != results_df['pred_label']]

print(f"\nMisclassified samples at each fold's best threshold:")
display(misclassified_df)


Best Thresholds per fold: [0.12, 0.02, 0.02, 0.02, 0.02]
Best F1 Scores per fold: [0.22543352601156072, 0.22419533851276358, 0.18367346938775508, 0.19866071428571427, 0.19889502762430936]

Misclassified samples at each fold's best threshold:


Unnamed: 0,fold,true_label,pred_proba,pred_label,threshold
50,1,0,0.165755,1,0.12
67,1,1,0.000036,0,0.12
86,1,1,0.000012,0,0.12
121,1,1,0.001205,0,0.12
132,1,0,0.562400,1,0.12
...,...,...,...,...,...
8057,5,0,0.020338,1,0.02
8058,5,0,0.292391,1,0.02
8062,5,1,0.000011,0,0.02
8075,5,1,0.000248,0,0.02


In [26]:
# Test 데이터에 대해 최적의 threshold를 적용한 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    pred_proba = models[fold].predict(X_test)
    
    # 각 fold에 해당하는 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    pred = np.where(pred_proba >= best_thresholds[fold], 1, 0)
    
    # 예측 결과 저장
    final_predictions.append(pred)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출 (평균)
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= 0.5, 1, 0)  # 평균이 0.5 이상인 경우 1로 결정

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[0 0 0 ... 0 0 0]


In [28]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)