# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [56]:
# Importing necessary libraries

# OS and pretty print for system operations and better print formatting
import os
from pprint import pprint

# Data manipulation and numerical operations
import numpy as np
import pandas as pd

# Machine Learning models from scikit-learn
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Evaluation metrics from scikit-learn
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Model selection utilities from scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Additional machine learning libraries
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Progress bar utility
from tqdm import tqdm

# Suppress warnings
import warnings
warnings.filterwarnings(action='ignore')

# Visualization tools
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Clustering and preprocessing
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler


### 데이터 읽어오기


In [57]:
ROOT_DIR = "data"
random_seed = 110

# Load data
train_data = pd.read_csv("data/train.csv")
train_data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]

In [58]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler

def perform_minibatch_kmeans_clustering(data, n_clusters=3, batch_size=100):
    kmeans = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=42)
    return kmeans.fit_predict(data)

def apply_minibatch_kmeans_clustering_to_data(train_data, test_data, feature_columns, n_clusters=3, batch_size=100):
    scaler = StandardScaler()

    # 훈련 데이터에 대해 클러스터링
    train_data_scaled = scaler.fit_transform(train_data[feature_columns].dropna())
    train_labels = perform_minibatch_kmeans_clustering(train_data_scaled, n_clusters, batch_size)
    
    # 테스트 데이터에 대해 동일한 스케일링 적용 후 클러스터링
    test_data_scaled = scaler.transform(test_data[feature_columns].dropna())
    test_labels = perform_minibatch_kmeans_clustering(test_data_scaled, n_clusters, batch_size)
    
    return train_labels, test_labels

# 클러스터링 적용: Cure Start Position
cure_start_columns = ['CURE START POSITION X Collect Result_Dam', 'CURE START POSITION Z Collect Result_Dam']
train_data['CURE_START_CLUSTER'], test_data['CURE_START_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, cure_start_columns, n_clusters=2)

# 클러스터링 적용: Cure Standby Position
cure_standby_columns = ['CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam']
train_data['CURE_STANDBY_CLUSTER'], test_data['CURE_STANDBY_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, cure_standby_columns, n_clusters=1)

# 클러스터링 적용: Cure End Position
cure_end_columns = ['CURE END POSITION X Collect Result_Dam', 'CURE END POSITION Z Collect Result_Dam']
train_data['CURE_END_CLUSTER'], test_data['CURE_END_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, cure_end_columns, n_clusters=2)

# 클러스터링 적용: Head Standby Position
head_standby_columns = ['HEAD Standby Position X Collect Result_Dam', 'HEAD Standby Position Y Collect Result_Dam', 'HEAD Standby Position Z Collect Result_Dam']
train_data['HEAD_STANDBY_CLUSTER'], test_data['HEAD_STANDBY_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, head_standby_columns, n_clusters=3)

# 클러스터링 적용: Head Clean Position
head_clean_columns = ['Head Clean Position X Collect Result_Dam', 'Head Clean Position Y Collect Result_Dam', 'Head Clean Position Z Collect Result_Dam']
train_data['HEAD_CLEAN_CLUSTER'], test_data['HEAD_CLEAN_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, head_clean_columns, n_clusters=2)

# 클러스터링 적용: Head Purge Position
head_purge_columns = ['Head Purge Position X Collect Result_Dam', 'Head Purge Position Y Collect Result_Dam', 'Head Purge Position Z Collect Result_Dam']
train_data['HEAD_PURGE_CLUSTER'], test_data['HEAD_PURGE_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, head_purge_columns, n_clusters=2)

# 클러스터링 적용: Head Zero Position
head_zero_columns = ['Head Zero Position X Collect Result_Dam', 'Head Zero Position Y Collect Result_Dam', 'Head Zero Position Z Collect Result_Dam']
train_data['HEAD_ZERO_CLUSTER'], test_data['HEAD_ZERO_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, head_zero_columns, n_clusters=2)

# 클러스터링 적용: Stage 2 Position
stage2_columns = ['HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam']
train_data['STAGE2_CLUSTER'], test_data['STAGE2_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, stage2_columns, n_clusters=6)

# 클러스터링 적용: Stage 3 Position
stage3_columns = ['HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam', 'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam']
train_data['STAGE3_CLUSTER'], test_data['STAGE3_CLUSTER'] = apply_minibatch_kmeans_clustering_to_data(train_data, test_data, stage3_columns, n_clusters=5)


In [59]:
# 두 점 사이의 거리를 계산하는 함수
def calculate_distance(x1, y1, z1, x2, y2, z2):
    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2 + (z2 - z1)**2)

# 1. Head Standby -> Head Clean Distance
train_data['HEAD_Standby_to_Clean_Distance'] = calculate_distance(
    train_data['HEAD Standby Position X Collect Result_Dam'],
    train_data['HEAD Standby Position Y Collect Result_Dam'],
    train_data['HEAD Standby Position Z Collect Result_Dam'],
    train_data['Head Clean Position X Collect Result_Dam'],
    train_data['Head Clean Position Y Collect Result_Dam'],
    train_data['Head Clean Position Z Collect Result_Dam']
)

test_data['HEAD_Standby_to_Clean_Distance'] = calculate_distance(
    test_data['HEAD Standby Position X Collect Result_Dam'],
    test_data['HEAD Standby Position Y Collect Result_Dam'],
    test_data['HEAD Standby Position Z Collect Result_Dam'],
    test_data['Head Clean Position X Collect Result_Dam'],
    test_data['Head Clean Position Y Collect Result_Dam'],
    test_data['Head Clean Position Z Collect Result_Dam']
)

# 2. Head Clean -> Head Purge Distance
train_data['HEAD_Clean_to_Purge_Distance'] = calculate_distance(
    train_data['Head Clean Position X Collect Result_Dam'],
    train_data['Head Clean Position Y Collect Result_Dam'],
    train_data['Head Clean Position Z Collect Result_Dam'],
    train_data['Head Purge Position X Collect Result_Dam'],
    train_data['Head Purge Position Y Collect Result_Dam'],
    train_data['Head Purge Position Z Collect Result_Dam']
)

test_data['HEAD_Clean_to_Purge_Distance'] = calculate_distance(
    test_data['Head Clean Position X Collect Result_Dam'],
    test_data['Head Clean Position Y Collect Result_Dam'],
    test_data['Head Clean Position Z Collect Result_Dam'],
    test_data['Head Purge Position X Collect Result_Dam'],
    test_data['Head Purge Position Y Collect Result_Dam'],
    test_data['Head Purge Position Z Collect Result_Dam']
)

# 3. Head Purge -> Head Zero Distance
train_data['HEAD_Purge_to_Zero_Distance'] = calculate_distance(
    train_data['Head Purge Position X Collect Result_Dam'],
    train_data['Head Purge Position Y Collect Result_Dam'],
    train_data['Head Purge Position Z Collect Result_Dam'],
    train_data['Head Zero Position X Collect Result_Dam'],
    train_data['Head Zero Position Y Collect Result_Dam'],
    train_data['Head Zero Position Z Collect Result_Dam']
)

test_data['HEAD_Purge_to_Zero_Distance'] = calculate_distance(
    test_data['Head Purge Position X Collect Result_Dam'],
    test_data['Head Purge Position Y Collect Result_Dam'],
    test_data['Head Purge Position Z Collect Result_Dam'],
    test_data['Head Zero Position X Collect Result_Dam'],
    test_data['Head Zero Position Y Collect Result_Dam'],
    test_data['Head Zero Position Z Collect Result_Dam']
)

# 4. Head Standby -> Head Zero Distance (Direct)
train_data['HEAD_Standby_to_Zero_Distance'] = calculate_distance(
    train_data['HEAD Standby Position X Collect Result_Dam'],
    train_data['HEAD Standby Position Y Collect Result_Dam'],
    train_data['HEAD Standby Position Z Collect Result_Dam'],
    train_data['Head Zero Position X Collect Result_Dam'],
    train_data['Head Zero Position Y Collect Result_Dam'],
    train_data['Head Zero Position Z Collect Result_Dam']
)

test_data['HEAD_Standby_to_Zero_Distance'] = calculate_distance(
    test_data['HEAD Standby Position X Collect Result_Dam'],
    test_data['HEAD Standby Position Y Collect Result_Dam'],
    test_data['HEAD Standby Position Z Collect Result_Dam'],
    test_data['Head Zero Position X Collect Result_Dam'],
    test_data['Head Zero Position Y Collect Result_Dam'],
    test_data['Head Zero Position Z Collect Result_Dam']
)

# Cure 데이터의 거리 계산 (X와 Z 좌표 사용)

# 1. Cure Start -> Cure Standby Distance
train_data['CURE_Start_to_Standby_Distance'] = calculate_distance(
    train_data['CURE START POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE START POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE START POSITION Z Collect Result_Dam'],
    train_data['CURE STANDBY POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE STANDBY POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE STANDBY POSITION Z Collect Result_Dam']
)

test_data['CURE_Start_to_Standby_Distance'] = calculate_distance(
    test_data['CURE START POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE START POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE START POSITION Z Collect Result_Dam'],
    test_data['CURE STANDBY POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE STANDBY POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE STANDBY POSITION Z Collect Result_Dam']
)

# 2. Cure Standby -> Cure End Distance
train_data['CURE_Standby_to_End_Distance'] = calculate_distance(
    train_data['CURE STANDBY POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE STANDBY POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE STANDBY POSITION Z Collect Result_Dam'],
    train_data['CURE END POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE END POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE END POSITION Z Collect Result_Dam']
)

test_data['CURE_Standby_to_End_Distance'] = calculate_distance(
    test_data['CURE STANDBY POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE STANDBY POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE STANDBY POSITION Z Collect Result_Dam'],
    test_data['CURE END POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE END POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE END POSITION Z Collect Result_Dam']
)

# 3. Cure Start -> Cure End Distance (Direct)
train_data['CURE_Start_to_End_Distance'] = calculate_distance(
    train_data['CURE START POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE START POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE START POSITION Z Collect Result_Dam'],
    train_data['CURE END POSITION X Collect Result_Dam'],
    np.zeros_like(train_data['CURE END POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    train_data['CURE END POSITION Z Collect Result_Dam']
)

test_data['CURE_Start_to_End_Distance'] = calculate_distance(
    test_data['CURE START POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE START POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE START POSITION Z Collect Result_Dam'],
    test_data['CURE END POSITION X Collect Result_Dam'],
    np.zeros_like(test_data['CURE END POSITION X Collect Result_Dam']),  # Y 좌표는 0으로 설정
    test_data['CURE END POSITION Z Collect Result_Dam']
)

In [60]:
cluster_columns = [
    'CURE_START_CLUSTER',
    'CURE_STANDBY_CLUSTER',
    'CURE_END_CLUSTER',
    'HEAD_STANDBY_CLUSTER',
    'HEAD_CLEAN_CLUSTER',
    'HEAD_PURGE_CLUSTER',
    'HEAD_ZERO_CLUSTER',
    'STAGE2_CLUSTER',
    'STAGE3_CLUSTER',
]

for col in cluster_columns:
    values = train_data['CURE_START_CLUSTER'].unique()
    for value in values:
        train_data[f'{col}_{value}'] = (train_data[col]==value).replace({True: 1, False: 0})
        test_data[f'{col}_{value}'] = (test_data[col]==value).replace({True: 1, False: 0})

train_data = train_data.drop(cluster_columns, axis=1)
test_data = test_data.drop(cluster_columns, axis=1)

In [61]:
pd.DataFrame(
    train_data[[
        'target',
        'Model.Suffix_Dam',
        'Model.Suffix_AutoClave',
        'Model.Suffix_Fill1',
        'Model.Suffix_Fill2',
    ]].value_counts(dropna=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
target,Model.Suffix_Dam,Model.Suffix_AutoClave,Model.Suffix_Fill1,Model.Suffix_Fill2,Unnamed: 5_level_1
Normal,AJX75334501,AJX75334501,AJX75334501,AJX75334501,31902
Normal,AJX75334502,AJX75334502,AJX75334502,AJX75334502,3160
Normal,AJX75334505,AJX75334505,AJX75334505,AJX75334505,2505
AbNormal,AJX75334501,AJX75334501,AJX75334501,AJX75334501,1918
Normal,AJX75334507,AJX75334507,AJX75334507,AJX75334507,292
AbNormal,AJX75334502,AJX75334502,AJX75334502,AJX75334502,230
AbNormal,AJX75334505,AJX75334505,AJX75334505,AJX75334505,130
Normal,AJX75334506,AJX75334506,AJX75334506,AJX75334506,121
Normal,AJX75334503,AJX75334503,AJX75334503,AJX75334503,118
Normal,AJX75334508,AJX75334508,AJX75334508,AJX75334508,58


In [62]:
pd.DataFrame(
    train_data[[
        'target',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
        'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
        'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
]].value_counts(dropna=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,count
target,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam,GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave,GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2,Unnamed: 6_level_1
Normal,,,,,,27359
Normal,OK,OK,OK,OK,OK,10797
AbNormal,,,,,,1854
AbNormal,OK,OK,OK,OK,OK,496


In [63]:
pd.DataFrame(
    train_data[[
        'target',
        'Workorder_Dam',
        'Workorder_AutoClave',
        'Workorder_Fill1',
        'Workorder_Fill2',
]].value_counts(dropna=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
target,Workorder_Dam,Workorder_AutoClave,Workorder_Fill1,Workorder_Fill2,Unnamed: 5_level_1
Normal,3L1X8574-2,3L1X8574-2,3L1X8574-2,3L1X8574-2,266
Normal,3K1XC268-1,3K1XC268-1,3K1XC268-1,3K1XC268-1,217
Normal,3L1X8506-1,3L1X8506-1,3L1X8506-1,3L1X8506-1,208
Normal,3L1X9934-1,3L1X9934-1,3L1X9934-1,3L1X9934-1,204
Normal,3K1XA586-2,3K1XA586-2,3K1XA586-2,3K1XA586-2,188
...,...,...,...,...,...
AbNormal,3H1XE854-1,3H1XE854-1,3H1XE854-1,3H1XE854-1,1
AbNormal,3K1X9805-1,3K1X9805-1,3K1X9805-1,3K1X9805-1,1
AbNormal,3G1XB159-1,3G1XB159-1,3G1XB159-1,3G1XB159-1,1
AbNormal,3K1X9887-2,3K1X9887-2,3K1X9887-2,3K1X9887-2,1


In [64]:
pd.DataFrame(
    train_data[[
        'Workorder_Dam',
        'Workorder_AutoClave',
        'Workorder_Fill1',
        'Workorder_Fill2',
]].value_counts(dropna=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
Workorder_Dam,Workorder_AutoClave,Workorder_Fill1,Workorder_Fill2,Unnamed: 4_level_1
3L1X8574-2,3L1X8574-2,3L1X8574-2,3L1X8574-2,272
3K1XC268-1,3K1XC268-1,3K1XC268-1,3K1XC268-1,227
3L1X9934-1,3L1X9934-1,3L1X9934-1,3L1X9934-1,217
3L1X8506-1,3L1X8506-1,3L1X8506-1,3L1X8506-1,217
3K1XA586-2,3K1XA586-2,3K1XA586-2,3K1XA586-2,189
...,...,...,...,...
4BPM0084-1,4BPM0084-1,4BPM0084-1,4BPM0084-1,2
4A1XE569-1,4A1XE569-1,4A1XE569-1,4A1XE569-1,2
3H1XB714-1,3H1XB714-1,3H1XB714-1,3H1XB714-1,1
3HPM0061-1,3HPM0061-1,3HPM0061-1,3HPM0061-1,1


In [65]:
pd.DataFrame(
    train_data[[
        'target',
        'Equipment_Dam',
        'Equipment_Fill1',
        'Equipment_Fill2',
]].value_counts(dropna=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,count
target,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Unnamed: 4_level_1
Normal,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,23545
Normal,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,14611
AbNormal,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,1466
AbNormal,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,850
AbNormal,Dam dispenser #1,Fill1 dispenser #2,Fill2 dispenser #2,13
AbNormal,Dam dispenser #2,Fill1 dispenser #1,Fill2 dispenser #1,10
AbNormal,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #2,6
AbNormal,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #1,5


In [66]:
cols_to_drop = train_data.columns[train_data.nunique(dropna=False) == 1]
train_data = train_data.drop(cols_to_drop, axis=1)

In [67]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 174 entries, Equipment_Dam to STAGE3_CLUSTER_1
dtypes: float64(75), int64(78), object(21)
memory usage: 53.8+ MB


In [68]:
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()
train_data[object_columns]

Unnamed: 0,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam,Model.Suffix_AutoClave,Workorder_AutoClave,Chamber Temp. Judge Value_AutoClave,GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave,GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave,...,Model.Suffix_Fill1,Workorder_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1,Equipment_Fill2,Model.Suffix_Fill2,Workorder_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2,target
0,Dam dispenser #1,AJX75334505,4F1XA938-1,,,AJX75334505,4F1XA938-1,OK,,,...,AJX75334505,4F1XA938-1,,,Fill2 dispenser #1,AJX75334505,4F1XA938-1,,,Normal
1,Dam dispenser #1,AJX75334505,3KPM0016-2,550.3,,AJX75334505,3KPM0016-2,OK,,,...,AJX75334505,3KPM0016-2,838.4,,Fill2 dispenser #1,AJX75334505,3KPM0016-2,835.5,,Normal
2,Dam dispenser #2,AJX75334501,4E1X9167-1,OK,OK,AJX75334501,4E1X9167-1,OK,OK,OK,...,AJX75334501,4E1X9167-1,OK,OK,Fill2 dispenser #2,AJX75334501,4E1X9167-1,OK,OK,Normal
3,Dam dispenser #2,AJX75334501,3K1X0057-1,162.4,,AJX75334501,3K1X0057-1,OK,,,...,AJX75334501,3K1X0057-1,837.7,,Fill2 dispenser #2,AJX75334501,3K1X0057-1,305,,Normal
4,Dam dispenser #1,AJX75334501,3HPM0007-1,549,,AJX75334501,3HPM0007-1,NG,,,...,AJX75334501,3HPM0007-1,838.4,,Fill2 dispenser #1,AJX75334501,3HPM0007-1,835.5,,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40501,Dam dispenser #1,AJX75334501,3J1XF434-2,550.3,,AJX75334501,3J1XF434-2,OK,,,...,AJX75334501,3J1XF434-2,838.4,,Fill2 dispenser #1,AJX75334501,3J1XF434-2,835.5,,Normal
40502,Dam dispenser #2,AJX75334501,4E1XC796-1,,,AJX75334501,4E1XC796-1,NG,,,...,AJX75334501,4E1XC796-1,,,Fill2 dispenser #2,AJX75334501,4E1XC796-1,,,Normal
40503,Dam dispenser #1,AJX75334501,4C1XD438-1,OK,OK,AJX75334501,4C1XD438-1,NG,OK,OK,...,AJX75334501,4C1XD438-1,OK,OK,Fill2 dispenser #1,AJX75334501,4C1XD438-1,OK,OK,Normal
40504,Dam dispenser #2,AJX75334501,3I1XA258-1,162.4,,AJX75334501,3I1XA258-1,OK,,,...,AJX75334501,3I1XA258-1,837.7,,Fill2 dispenser #2,AJX75334501,3I1XA258-1,305,,Normal


In [69]:
model_cols = [
    'Model.Suffix_Dam',
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2'
]

train_data['Model'] = train_data[model_cols[0]]
test_data['Model'] = test_data[model_cols[0]]

train_data = train_data.drop(model_cols, axis=1)
test_data = test_data.drop(model_cols, axis=1)

In [70]:
judge_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
]

train_data['Judge'] = (train_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})
test_data['Judge'] = (test_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})

train_data = train_data.drop(judge_cols, axis=1)
test_data = test_data.drop(judge_cols, axis=1)

In [71]:
workorder_cols = [
    'Workorder_Dam',
    'Workorder_AutoClave',
    'Workorder_Fill1',
    'Workorder_Fill2',
]

train_data['Workorder'] = train_data[workorder_cols[0]]
test_data['Workorder'] = test_data[workorder_cols[0]]

train_data = train_data.drop(workorder_cols, axis=1)
test_data = test_data.drop(workorder_cols, axis=1)

In [72]:
train_data['Chamber Temp. Judge Value_AutoClave'] = (train_data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
test_data['Chamber Temp. Judge Value_AutoClave'] = (test_data['Chamber Temp. Judge Value_AutoClave'] == "OK").replace({True: 1, False: 0})

In [73]:
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()
train_data[object_columns]

Unnamed: 0,Equipment_Dam,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam,Equipment_Fill1,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1,Equipment_Fill2,HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2,target,Model,Workorder
0,Dam dispenser #1,,Fill1 dispenser #1,,Fill2 dispenser #1,,Normal,AJX75334505,4F1XA938-1
1,Dam dispenser #1,550.3,Fill1 dispenser #1,838.4,Fill2 dispenser #1,835.5,Normal,AJX75334505,3KPM0016-2
2,Dam dispenser #2,OK,Fill1 dispenser #2,OK,Fill2 dispenser #2,OK,Normal,AJX75334501,4E1X9167-1
3,Dam dispenser #2,162.4,Fill1 dispenser #2,837.7,Fill2 dispenser #2,305,Normal,AJX75334501,3K1X0057-1
4,Dam dispenser #1,549,Fill1 dispenser #1,838.4,Fill2 dispenser #1,835.5,Normal,AJX75334501,3HPM0007-1
...,...,...,...,...,...,...,...,...,...
40501,Dam dispenser #1,550.3,Fill1 dispenser #1,838.4,Fill2 dispenser #1,835.5,Normal,AJX75334501,3J1XF434-2
40502,Dam dispenser #2,,Fill1 dispenser #2,,Fill2 dispenser #2,,Normal,AJX75334501,4E1XC796-1
40503,Dam dispenser #1,OK,Fill1 dispenser #1,OK,Fill2 dispenser #1,OK,Normal,AJX75334501,4C1XD438-1
40504,Dam dispenser #2,162.4,Fill1 dispenser #2,837.7,Fill2 dispenser #2,305,Normal,AJX75334501,3I1XA258-1


In [74]:
# 각 열 조합에 대한 일치 여부를 나타내는 컬럼 생성
train_data['Dam_vs_Fill1'] = train_data['Equipment_Dam'] == train_data['Equipment_Fill1']
train_data['Dam_vs_Fill2'] = train_data['Equipment_Dam'] == train_data['Equipment_Fill2']
train_data['Fill1_vs_Fill2'] = train_data['Equipment_Fill1'] == train_data['Equipment_Fill2']

# 세 열 모두 일치 여부를 나타내는 컬럼 생성
train_data['Dam_vs_Fill1'] = train_data['Dam_vs_Fill1'].replace({True: 1, False: 0})
train_data['Dam_vs_Fill2'] = train_data['Dam_vs_Fill2'].replace({True: 1, False: 0})
train_data['Fill1_vs_Fill2'] = train_data['Fill1_vs_Fill2'].replace({True: 1, False: 0})
train_data['All_Equal'] = (train_data['Dam_vs_Fill1'] & train_data['Dam_vs_Fill2'] & train_data['Fill1_vs_Fill2']).replace({True: 1, False: 0})

# 각 열 조합에 대한 일치 여부를 나타내는 컬럼 생성
test_data['Dam_vs_Fill1'] = test_data['Equipment_Dam'] == test_data['Equipment_Fill1']
test_data['Dam_vs_Fill2'] = test_data['Equipment_Dam'] == test_data['Equipment_Fill2']
test_data['Fill1_vs_Fill2'] = test_data['Equipment_Fill1'] == test_data['Equipment_Fill2']

# 세 열 모두 일치 여부를 나타내는 컬럼 생성
test_data['Dam_vs_Fill1'] = test_data['Dam_vs_Fill1'].replace({True: 1, False: 0})
test_data['Dam_vs_Fill2'] = test_data['Dam_vs_Fill2'].replace({True: 1, False: 0})
test_data['Fill1_vs_Fill2'] = test_data['Fill1_vs_Fill2'].replace({True: 1, False: 0})
test_data['All_Equal'] = (test_data['Dam_vs_Fill1'] & test_data['Dam_vs_Fill2'] & test_data['Fill1_vs_Fill2']).replace({True: 1, False: 0})

In [75]:
# 원-핫 인코딩을 적용할 컬럼 리스트
cat_cols = ['Equipment_Dam','Equipment_Fill1','Equipment_Fill2','Model', 'Workorder']
train_data[cat_cols]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Model,Workorder
0,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,4F1XA938-1
1,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,3KPM0016-2
2,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,4E1X9167-1
3,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,3K1X0057-1
4,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,3HPM0007-1
...,...,...,...,...,...
40501,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,3J1XF434-2
40502,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,4E1XC796-1
40503,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,4C1XD438-1
40504,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,3I1XA258-1


In [76]:
target_col = 'Equipment_Dam'
uniq_values = train_data[target_col].unique()
for val in uniq_values:
    train_data[f'{target_col}_{val}'] = (train_data[target_col] == val).replace({True: 1, False: 0})
    test_data[f'{target_col}_{val}'] = (test_data[target_col] == val).replace({True: 1, False: 0})
    
train_data = train_data.drop(target_col, axis=1)
test_data = test_data.drop(target_col, axis=1)

In [77]:
# 'Equipment_Fill1','Equipment_Fill2' 동기화

target_cols = ['Equipment_Fill1','Equipment_Fill2']
equip_uniq_value = ['Fill1 dispenser #1','Fill1 dispenser #2']
for col in target_cols:
    for val in equip_uniq_value:
        train_data[col+'_'+val] = (train_data[col]==val).replace({True: 1, False: 0})
        test_data[col+'_'+val] = (test_data[col]==val).replace({True: 1, False: 0})

train_data = train_data.drop(target_cols, axis=1)
test_data = test_data.drop(target_cols, axis=1)

In [78]:
target_col = 'Model'
target_values = train_data[target_col].unique()
for val in equip_uniq_value:
    train_data[target_col+'_'+val] = (train_data[target_col]==val).replace({True: 1, False: 0})
    test_data[target_col+'_'+val] = (test_data[target_col]==val).replace({True: 1, False: 0})

train_data = train_data.drop(target_col, axis=1)
test_data = test_data.drop(target_col, axis=1)

In [79]:
drop_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

train_data = train_data.drop(drop_cols, axis=1)
test_data = test_data.drop(drop_cols, axis=1)

In [80]:
# DataFrame에서 object 타입의 컬럼만 뽑아내기
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()

# 추출된 컬럼들 출력
train_data[object_columns]

Unnamed: 0,target,Workorder
0,Normal,4F1XA938-1
1,Normal,3KPM0016-2
2,Normal,4E1X9167-1
3,Normal,3K1X0057-1
4,Normal,3HPM0007-1
...,...,...
40501,Normal,3J1XF434-2
40502,Normal,4E1XC796-1
40503,Normal,4C1XD438-1
40504,Normal,3I1XA258-1


In [81]:
# 처리할 컬럼들
target_cols = ['Workorder']
for col in target_cols:
    train_data[col] = train_data[col].apply(lambda x: x.split('-')[0] + str(int(x.split('-')[1])))
    test_data[col] = test_data[col].apply(lambda x: x.split('-')[0] + str(int(x.split('-')[1])))
    
# 결과를 저장할 딕셔너리 초기화
result_dict = {}

# 각 문자열을 한 글자씩 분리하여 딕셔너리에 저장
for item in train_data[col].unique():
    result_dict[item] = list(item)
    
df_result = pd.DataFrame(result_dict.values())
df_result

new_train_cols = []
new_test_cols = []

for target_col in target_cols:
    for i, col in enumerate(df_result.columns):
        for char in df_result[col].unique():
            new_train_cols.append((f'{target_col}_{col}_{i}_{char}', (train_data[target_col].str[i]==char).astype(int)))
            new_test_cols.append((f'{target_col}_{col}_{i}_{char}', (test_data[target_col].str[i]==char).astype(int)))

# 새로운 열들을 각각의 DataFrame에 추가
new_train_df = pd.concat([train_data] + [pd.Series(v, name=k) for k, v in new_train_cols], axis=1)
new_test_df = pd.concat([test_data] + [pd.Series(v, name=k) for k, v in new_test_cols], axis=1)

# target_cols 열 삭제
new_train_df = new_train_df.drop(target_cols, axis=1)
new_test_df = new_test_df.drop(target_cols, axis=1)

train_data = new_train_df
test_data = new_test_df

In [82]:
train_data['target'] = train_data['target'].map({'AbNormal':1, 'Normal':0})

In [83]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 240 entries, CURE END POSITION X Collect Result_Dam to Workorder_8_8_3
dtypes: float64(75), int64(165)
memory usage: 74.2 MB


In [84]:
features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [85]:
# 기본 설정
n_splits = 5

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

In [86]:
# X_train = train_data.drop('target', axis=1)
# y_train = train_data['target']
# X_test = test_data[X_train.columns]

In [87]:
X_train = train_data[features].drop('target', axis=1)
y_train = train_data['target']
X_test = test_data[X_train.columns]

In [88]:
from sklearn.feature_selection import RFE

In [89]:
scores = []
models = []

In [90]:
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    model = CatBoostClassifier(random_state=101, eval_metric="F1")
    model.fit(X_tr, y_tr,
             eval_set=[(X_val, y_val)],
             early_stopping_rounds=100,
             verbose=500)
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])

Learning rate set to 0.074822
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 62.5ms	remaining: 1m 2s
500:	learn: 0.2151659	test: 0.1414538	best: 0.1414538 (499)	total: 5.4s	remaining: 5.37s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1556420233
bestIteration = 699

Shrink model to first 700 iterations.
Learning rate set to 0.074822
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 11.6ms	remaining: 11.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1192842942
bestIteration = 252

Shrink model to first 253 iterations.
Learning rate set to 0.074822
0:	learn: 0.0021254	test: 0.0000000	best: 0.0000000 (0)	total: 9.77ms	remaining: 9.76s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.0998003992
bestIteration = 367

Shrink model to first 368 iterations.
Learning rate set to 0.074822
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 9.96ms	remaining: 9.95s
Stopped by overfitting det

In [91]:
print(scores)
print(np.mean(scores))

[0.15564202334630348, 0.11928429423459243, 0.0998003992015968, 0.15891472868217055, 0.11177644710578843]
0.12908357851409033


In [92]:
# Threshold 값을 테스트할 범위 설정
thresholds = np.arange(0.0, 0.3, 0.01)  # 0.1에서 0.9까지 0.05 간격으로 테스트
best_threshold = 0
best_score = 0
best_scores = []

# 각 threshold에 대해 f1_score 계산
for threshold in thresholds:
    pred_list = []
    scores = []

    for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
        # 예측 확률 계산
        pred = models[fold].predict_proba(X_train.iloc[valid_index])
        
        # Threshold 적용하여 클래스 결정
        pred = np.where(pred[:,1] >= threshold, 1, 0)
        
        # f1_score 계산
        score = f1_score(y_train[valid_index], pred)
        scores.append(score)

    # 현재 threshold에 대한 평균 f1_score 계산
    mean_score = np.mean(scores)
    print(f"Threshold: {threshold}, F1 Score: {mean_score}")

    # 최적의 threshold 업데이트
    if mean_score > best_score:
        best_score = mean_score
        best_threshold = threshold
        best_scores = scores

# 최적의 threshold 및 해당 f1_score 출력
print(f"\nBest Threshold: {best_threshold}")
print(f"Best F1 Score: {best_score}")
print(f"Scores for Best Threshold: {best_scores}")


Threshold: 0.0, F1 Score: 0.1096695914279304
Threshold: 0.01, F1 Score: 0.109781277071924
Threshold: 0.02, F1 Score: 0.11183271595756517
Threshold: 0.03, F1 Score: 0.12174758654673537
Threshold: 0.04, F1 Score: 0.142166040930219
Threshold: 0.05, F1 Score: 0.16288472390019618
Threshold: 0.06, F1 Score: 0.17904915899318066
Threshold: 0.07, F1 Score: 0.19067025177773927
Threshold: 0.08, F1 Score: 0.1988794100279001
Threshold: 0.09, F1 Score: 0.20834133292652993
Threshold: 0.1, F1 Score: 0.21696396934017104
Threshold: 0.11, F1 Score: 0.21618477039446682
Threshold: 0.12, F1 Score: 0.2159109392939274
Threshold: 0.13, F1 Score: 0.20943781149230484
Threshold: 0.14, F1 Score: 0.20439738274198657
Threshold: 0.15, F1 Score: 0.1974354439268434
Threshold: 0.16, F1 Score: 0.19423364939363102
Threshold: 0.17, F1 Score: 0.1867678984680582
Threshold: 0.18, F1 Score: 0.18402829642553534
Threshold: 0.19, F1 Score: 0.17829445171981917
Threshold: 0.2, F1 Score: 0.17585560648225623
Threshold: 0.21, F1 Score

In [93]:
# 최적의 threshold를 사용해 X_test 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    pred = models[fold].predict_proba(X_test)
    
    # 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    pred = np.where(pred[:,1] >= best_threshold, 1, 0)
    
    # 예측 결과 저장
    final_predictions.append(pred)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출
# 각 폴드에서의 예측 결과를 평균내고, 이를 기준으로 최종 결론 도출
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= best_threshold, 1, 0)

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[1 0 1 ... 1 1 0]


In [94]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)

In [95]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)