# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from catboost import CatBoostClassifier
import lightgbm as lgb
from lightgbm import callback
import shap
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
random_seed = 110

# Load data
train_data = pd.read_csv("train.csv")
train_data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]

In [3]:
train_data['Chamber Temp. Judge Value_AutoClave'] = (train_data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
test_data['Chamber Temp. Judge Value_AutoClave'] = (test_data['Chamber Temp. Judge Value_AutoClave'] == "OK").replace({True: 1, False: 0})

In [4]:
# 원-핫 인코딩을 적용할 컬럼 리스트
cat_cols = ['Equipment_Dam','Equipment_Fill1','Equipment_Fill2','Model.Suffix_Dam', 'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2']
train_data[cat_cols]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Model.Suffix_Dam,Model.Suffix_AutoClave,Model.Suffix_Fill1,Model.Suffix_Fill2
0,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
1,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
2,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
3,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
4,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
...,...,...,...,...,...,...,...
40501,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40502,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40503,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40504,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501


In [5]:
model_cols = [
    'Model.Suffix_Dam',
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2'
]

train_data['Model'] = train_data[model_cols[0]]
test_data['Model'] = test_data[model_cols[0]]

train_data = train_data.drop(model_cols, axis=1)
test_data = test_data.drop(model_cols, axis=1)

In [6]:
judge_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
]

train_data['Judge'] = (train_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})
test_data['Judge'] = (test_data[judge_cols[0]]=='OK').replace({True: 1, False: 0})

train_data = train_data.drop(judge_cols, axis=1)
test_data = test_data.drop(judge_cols, axis=1)

In [7]:
workorder_cols = [
    'Workorder_Dam',
    'Workorder_AutoClave',
    'Workorder_Fill1',
    'Workorder_Fill2',
]

train_data['Workorder'] = train_data[workorder_cols[0]]
test_data['Workorder'] = test_data[workorder_cols[0]]

train_data = train_data.drop(workorder_cols, axis=1)
test_data = test_data.drop(workorder_cols, axis=1)

In [8]:
target_cols = [
    'Equipment_Dam',
    'Equipment_Fill1',
    'Equipment_Fill2',
]

train_data['Equipment'] = train_data['Equipment_Dam'] + '_' + train_data['Equipment_Fill1'] + '_' + train_data['Equipment_Fill2']
test_data['Equipment'] = test_data['Equipment_Dam'] + '_' + test_data['Equipment_Fill1'] + '_' + test_data['Equipment_Fill2']
train_data = train_data.drop(target_cols, axis=1)
test_data = test_data.drop(target_cols, axis=1)

In [9]:
drop_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

train_data = train_data.drop(drop_cols, axis=1)
test_data = test_data.drop(drop_cols, axis=1)

In [10]:
# DataFrame에서 object 타입의 컬럼만 뽑아내기
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()

# 추출된 컬럼들 출력
train_data[object_columns]

Unnamed: 0,target,Model,Workorder,Equipment
0,Normal,AJX75334505,4F1XA938-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
1,Normal,AJX75334505,3KPM0016-2,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
2,Normal,AJX75334501,4E1X9167-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
3,Normal,AJX75334501,3K1X0057-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
4,Normal,AJX75334501,3HPM0007-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
...,...,...,...,...
40501,Normal,AJX75334501,3J1XF434-2,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
40502,Normal,AJX75334501,4E1XC796-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
40503,Normal,AJX75334501,4C1XD438-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
40504,Normal,AJX75334501,3I1XA258-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...


In [11]:
train_data['target'] = train_data['target'].map({'AbNormal':1, 'Normal':0})

In [12]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 136 entries, CURE END POSITION X Collect Result_Dam to Equipment
dtypes: float64(68), int64(65), object(3)
memory usage: 42.0+ MB


In [13]:
# 기본 설정
n_splits = 10

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

In [14]:
X_train = train_data.drop('target', axis=1)
y_train = train_data['target']
X_test = test_data[X_train.columns]

In [15]:
object_columns = X_train.select_dtypes(include=['object']).columns.tolist()

In [16]:
X_train[object_columns] = X_train[object_columns].astype('category')
X_test[object_columns] = X_test[object_columns].astype('category')

In [17]:
def f1_metric(y_pred, data, threshold=0.5):
    y_true = data.get_label()
    y_pred = (y_pred >= threshold).astype(int)  # 임계값 적용
    return 'f1', f1_score(y_true, y_pred), True

In [18]:
scores = []
models = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    train_data = lgb.Dataset(X_tr, label=y_tr, categorical_feature=object_columns, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, categorical_feature=object_columns, reference=train_data, free_raw_data=False)

    params = {
        "objective": "binary",  # 이진 분류의 경우
        "boosting_type": "gbdt",
        "learning_rate": 0.1,
        "num_boost_round": 4000,  # boosting round를 지정
        "random_state": 101,
        "verbose": -1,
        "metric": "None"  # 기본 메트릭을 사용하지 않도록 설정
    }

    model = lgb.train(
        params,
        train_data,
        valid_sets=[valid_data],
        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),
        num_boost_round=4000,  # 부스팅 라운드 수
        callbacks=[
            lgb.early_stopping(stopping_rounds=1000),  # F1 스코어를 기준으로 조기 종료 설정
            lgb.log_evaluation(500)  # 학습 로그 출력 주기
        ]
    )

    models.append(model)
    scores.append(model.best_score['valid_0']['f1'])  # F1 스코어 저장

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.218182
[1000]	valid_0's f1: 0.214533
[1500]	valid_0's f1: 0.22069
[2000]	valid_0's f1: 0.214533
[2500]	valid_0's f1: 0.213058
Early stopping, best iteration is:
[1655]	valid_0's f1: 0.229965
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.185874
[1000]	valid_0's f1: 0.185714
Early stopping, best iteration is:
[343]	valid_0's f1: 0.193309
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.155556
[1000]	valid_0's f1: 0.156584
[1500]	valid_0's f1: 0.173611
[2000]	valid_0's f1: 0.160839
[2500]	valid_0's f1: 0.151724
Early stopping, best iteration is:
[1547]	valid_0's f1: 0.181818
Training until validation scores don't improve for 1000 rounds
[500]	valid_0's f1: 0.158273
[1000]	valid_0's f1: 0.189831
[1500]	valid_0's f1: 0.182432
[2000]	valid_0's f1: 0.173333
Early stopping, best iteration is:
[1106]	valid_0's f1: 0.195286
Training until

In [19]:
print(scores)
print(np.mean(scores))

[0.2299651567944251, 0.19330855018587362, 0.18181818181818182, 0.19528619528619526, 0.1751824817518248, 0.15613382899628253, 0.2268041237113402, 0.15384615384615385, 0.17142857142857143, 0.20350877192982458]
0.1887282015748673


In [20]:
# 각 모델별로 최적의 threshold와 관련된 정보를 저장할 리스트 초기화
thresholds = np.arange(0.0, 0.3, 0.02)
best_thresholds = []
fold_best_scores = []

# 전체 데이터를 위한 결과 저장용 DataFrame
results_df = pd.DataFrame()

# 각 threshold에 대해 f1_score 계산
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_val = X_train.iloc[valid_index]
    y_val = y_train.iloc[valid_index]
    
    best_threshold = 0
    best_score = 0
    
    for threshold in thresholds:
        # LightGBM 모델 예측 (확률값 반환)
        pred_proba = models[fold].predict(X_val)
        
        # Threshold 적용하여 클래스 결정
        pred = np.where(pred_proba >= threshold, 1, 0)
        
        # f1_score 계산
        score = f1_score(y_val, pred)
        
        # 최적의 threshold 업데이트
        if score > best_score:
            best_score = score
            best_threshold = threshold
    
    # 해당 Fold에 대한 최적의 threshold와 F1 스코어 저장
    best_thresholds.append(best_threshold)
    fold_best_scores.append(best_score)
    
    # 예측 결과 저장
    fold_result = pd.DataFrame({
        'fold': fold + 1,
        'true_label': y_val.values,
        'pred_proba': pred_proba,
        'pred_label': np.where(pred_proba >= best_threshold, 1, 0),
        'threshold': best_threshold
    })
    results_df = pd.concat([results_df, fold_result], axis=0)

# 각 Fold별 최적의 threshold와 F1 스코어 출력
print(f"\nBest Thresholds per fold: {best_thresholds}")
print(f"Best F1 Scores per fold: {fold_best_scores}")

# 예측 실패 사례 분석
misclassified_df = results_df[results_df['true_label'] != results_df['pred_label']]

print(f"\nMisclassified samples at each fold's best threshold:")
display(misclassified_df)


Best Thresholds per fold: [0.16, 0.16, 0.08, 0.16, 0.12, 0.2, 0.24, 0.12, 0.18, 0.26]
Best F1 Scores per fold: [0.23587223587223585, 0.22131147540983603, 0.1968503937007874, 0.2173913043478261, 0.2191780821917808, 0.22506393861892582, 0.2513089005235602, 0.20294599018003273, 0.22641509433962265, 0.23655913978494622]

Misclassified samples at each fold's best threshold:


Unnamed: 0,fold,true_label,pred_proba,pred_label,threshold
32,1,0,0.202658,1,0.16
35,1,0,0.201069,1,0.16
58,1,1,0.021831,0,0.16
60,1,0,0.469546,1,0.16
63,1,0,0.166153,1,0.16
...,...,...,...,...,...
3985,10,1,0.020781,0,0.26
3987,10,1,0.020467,0,0.26
4022,10,1,0.007387,0,0.26
4029,10,1,0.116633,0,0.26


In [21]:
X_train.iloc[misclassified_df.index].to_csv("missed.csv")

In [22]:
# Test 데이터에 대해 최적의 threshold를 적용한 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    pred_proba = models[fold].predict(X_test)
    
    # 각 fold에 해당하는 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    pred = np.where(pred_proba >= best_thresholds[fold], 1, 0)
    
    # 예측 결과 저장
    final_predictions.append(pred)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출 (평균)
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= 0.5, 1, 0)  # 평균이 0.5 이상인 경우 1로 결정

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[0 0 0 ... 0 0 0]


In [26]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)

# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)