In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from catboost import CatBoostClassifier
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configuration: data directory and the seed passed to the CV splitter.
ROOT_DIR = "data"
random_state = 110

# Load the training set.
# NOTE(review): train.csv is read from the working directory while test.csv
# is read from ROOT_DIR -- confirm this asymmetry is intentional.
train_data = pd.read_csv("train.csv")

# Load the test set and keep only the training columns, in training order.
test_path = os.path.join(ROOT_DIR, "test.csv")
test_data = pd.read_csv(test_path)[train_data.columns]

In [3]:
# Encode the autoclave chamber-temperature judgement as a 0/1 integer flag:
# 1 when the value is exactly "OK", 0 for anything else (missing values
# compare unequal to "OK" and therefore become 0).
# The boolean mask is cast explicitly instead of using
# replace({True: 1, False: 0}): replace relies on implicit downcasting of the
# result, which recent pandas versions deprecate (FutureWarning), and without
# that downcast the column would end up with object dtype.
chamber_judge_col = 'Chamber Temp. Judge Value_AutoClave'
train_data[chamber_judge_col] = (train_data[chamber_judge_col] == "OK").astype("int64")
test_data[chamber_judge_col] = (test_data[chamber_judge_col] == "OK").astype("int64")

In [4]:
# Columns earmarked for one-hot encoding (this cell only lists and displays
# them; no encoding is applied here).
cat_cols = [
    'Equipment_Dam',
    'Equipment_Fill1',
    'Equipment_Fill2',
    'Model.Suffix_Dam',
    'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1',
    'Model.Suffix_Fill2',
]
train_data[cat_cols]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Model.Suffix_Dam,Model.Suffix_AutoClave,Model.Suffix_Fill1,Model.Suffix_Fill2
0,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
1,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
2,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
3,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
4,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
...,...,...,...,...,...,...,...
40501,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40502,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40503,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40504,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501


In [5]:
# Per-process model-suffix columns. Only the first one (Dam) is kept, under
# the new name 'Model'; all four originals are then removed from both frames.
model_cols = [
    'Model.Suffix_Dam', 'Model.Suffix_AutoClave',
    'Model.Suffix_Fill1', 'Model.Suffix_Fill2',
]
model_source_col = model_cols[0]

train_data = train_data.assign(Model=train_data[model_source_col]).drop(columns=model_cols)
test_data = test_data.assign(Model=test_data[model_source_col]).drop(columns=model_cols)

In [6]:
# Judge / collect-result columns that are collapsed into one 'Judge' flag.
judge_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
    'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave',
    'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2'
]

# 'Judge' is derived from the first column only (the Dam judge value):
# 1 when it is exactly 'OK', 0 otherwise (missing values become 0).
# The boolean mask is cast explicitly instead of using
# replace({True: 1, False: 0}): replace relies on implicit downcasting of the
# result, which recent pandas versions deprecate (FutureWarning), and without
# that downcast the column would end up with object dtype.
train_data['Judge'] = (train_data[judge_cols[0]] == 'OK').astype("int64")
test_data['Judge'] = (test_data[judge_cols[0]] == 'OK').astype("int64")

# All five source columns are removed once the flag has been built.
train_data = train_data.drop(judge_cols, axis=1)
test_data = test_data.drop(judge_cols, axis=1)

In [7]:
# Per-process work-order columns. The first one (Dam) is copied into a single
# 'Workorder' column, after which all four originals are dropped.
workorder_cols = ['Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2']
workorder_source_col = workorder_cols[0]

train_data['Workorder'] = train_data[workorder_source_col]
train_data = train_data.drop(columns=workorder_cols)

test_data['Workorder'] = test_data[workorder_source_col]
test_data = test_data.drop(columns=workorder_cols)

In [8]:
# Equipment columns that are merged into one combined 'Equipment' key.
target_cols = ['Equipment_Dam', 'Equipment_Fill1', 'Equipment_Fill2']

# Join the three equipment names with '_' for the training frame, then
# remove the individual columns.
train_equipment = (
    train_data['Equipment_Dam']
    + '_' + train_data['Equipment_Fill1']
    + '_' + train_data['Equipment_Fill2']
)
train_data = train_data.assign(Equipment=train_equipment).drop(columns=target_cols)

# Same transformation for the test frame.
test_equipment = (
    test_data['Equipment_Dam']
    + '_' + test_data['Equipment_Fill1']
    + '_' + test_data['Equipment_Fill2']
)
test_data = test_data.assign(Equipment=test_equipment).drop(columns=target_cols)

In [9]:
# "Collect Result" columns for HEAD NORMAL COORDINATE X AXIS(Stage1) that are
# removed from both frames without being used.
drop_cols = [
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
    'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
]

train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

In [10]:
# Collect the names of every object-dtype column in the training frame.
object_columns = list(train_data.select_dtypes(include='object').columns)

# Display just those columns.
train_data[object_columns]

Unnamed: 0,target,Model,Workorder,Equipment
0,Normal,AJX75334505,4F1XA938-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
1,Normal,AJX75334505,3KPM0016-2,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
2,Normal,AJX75334501,4E1X9167-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
3,Normal,AJX75334501,3K1X0057-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
4,Normal,AJX75334501,3HPM0007-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
...,...,...,...,...
40501,Normal,AJX75334501,3J1XF434-2,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
40502,Normal,AJX75334501,4E1XC796-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...
40503,Normal,AJX75334501,4C1XD438-1,Dam dispenser #1_Fill1 dispenser #1_Fill2 disp...
40504,Normal,AJX75334501,3I1XA258-1,Dam dispenser #2_Fill1 dispenser #2_Fill2 disp...


In [11]:
# Encode the label as an integer: 'AbNormal' -> 1, 'Normal' -> 0.
# NOTE: values that are not keys of the mapping become NaN, so running this
# cell a second time (when the column already holds 0/1) wipes the labels.
target_encoding = {'AbNormal': 1, 'Normal': 0}
train_data['target'] = train_data['target'].map(target_encoding)

In [12]:
# Print a summary of the training frame (index range, column count, dtype
# counts, memory usage) as a sanity check on the preprocessing so far.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 136 entries, CURE END POSITION X Collect Result_Dam to Equipment
dtypes: float64(68), int64(65), object(3)
memory usage: 42.0+ MB


In [13]:
# Number of cross-validation folds.
n_splits = 5

# Shuffled, stratified K-fold splitter seeded with the notebook-wide random_state.
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=random_state,
)

In [14]:
# Separate the label from the features, then give the test frame exactly the
# same feature columns in the same order.
y_train = train_data['target']
X_train = train_data.drop(columns=['target'])
X_test = test_data[X_train.columns]

In [15]:
# Object-dtype feature columns; these are later passed to CatBoost as cat_features.
object_columns = list(X_train.select_dtypes(include='object').columns)

In [16]:
# Train one CatBoost classifier per stratified fold, keeping each fitted model
# and the best validation F1 it reached.
scores, models = [], []

for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    # Slice features and labels for this fold.
    X_tr, y_tr = X_train.iloc[train_index], y_train.iloc[train_index]
    X_val, y_val = X_train.iloc[valid_index], y_train.iloc[valid_index]

    # NOTE(review): the model seed (101) differs from random_state used by the
    # splitter -- confirm this is intentional.
    model = CatBoostClassifier(
        iterations=4000,
        early_stopping_rounds=1000,
        eval_metric="F1",
        cat_features=object_columns,
        random_state=101,
    )

    # The validation fold doubles as the early-stopping set.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=500)

    # Per the training log, the fitted model is shrunk to its best iteration,
    # so it can be stored as-is.
    models.append(model)
    fold_best = model.get_best_score()
    scores.append(fold_best["validation"]["F1"])

# Report the best validation F1 of every fold.
print(f"Best F1 Scores per fold: {scores}")


Learning rate set to 0.040938
0:	learn: 0.0010633	test: 0.0000000	best: 0.0000000 (0)	total: 75.9ms	remaining: 5m 3s
500:	learn: 0.1384083	test: 0.0965795	best: 0.0965795 (405)	total: 9.91s	remaining: 1m 9s
1000:	learn: 0.1973180	test: 0.1192843	best: 0.1230159 (802)	total: 19.9s	remaining: 59.6s
1500:	learn: 0.2525534	test: 0.1333333	best: 0.1333333 (1471)	total: 29.9s	remaining: 49.7s
2000:	learn: 0.2921960	test: 0.1481481	best: 0.1481481 (1963)	total: 39.9s	remaining: 39.9s
2500:	learn: 0.3291925	test: 0.1475728	best: 0.1514563 (2164)	total: 50s	remaining: 30s
3000:	learn: 0.3725151	test: 0.1544402	best: 0.1579961 (2979)	total: 1m	remaining: 20s
3500:	learn: 0.4204784	test: 0.1538462	best: 0.1579961 (2979)	total: 1m 10s	remaining: 10s
Stopped by overfitting detector  (1000 iterations wait)

bestTest = 0.1579961464
bestIteration = 2979

Shrink model to first 2980 iterations.
Learning rate set to 0.040939
0:	learn: 0.0178853	test: 0.0126582	best: 0.0126582 (0)	total: 22.8ms	remaining:

In [17]:
# Show the per-fold F1 scores followed by their mean, one per line.
print(scores, np.mean(scores), sep="\n")

[0.1579961464354528, 0.13307240704500978, 0.10756972111553785, 0.16153846153846155, 0.14145383104125736]
0.14032611343514384


In [18]:
# Candidate decision thresholds: 0.00 up to (but excluding) 0.30 in steps of 0.01.
thresholds = np.arange(0.0, 0.3, 0.01)
best_thresholds = []
best_scores = []

# For every fold, find the threshold that maximises F1 on that fold's
# validation split, using the model trained for the same fold. skf is seeded,
# so split() yields the same folds that were used for training.
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_val = X_train.iloc[valid_index]
    y_val = y_train.iloc[valid_index]

    best_threshold = 0
    best_score = 0

    # The predicted probabilities do not depend on the threshold, so they are
    # computed once per fold instead of once per candidate threshold.
    pred_proba = models[fold].predict_proba(X_val)

    for threshold in thresholds:
        # Label a sample positive when its class-1 probability reaches the threshold.
        pred = np.where(pred_proba[:, 1] >= threshold, 1, 0)

        # F1 of the thresholded predictions.
        score = f1_score(y_val, pred)

        # Keep the first threshold that achieves the highest score (strict >).
        if score > best_score:
            best_score = score
            best_threshold = threshold

    # Record this fold's best threshold and the F1 it achieved.
    best_thresholds.append(best_threshold)
    best_scores.append(best_score)

    print(f"Fold {fold+1}: Best Threshold: {best_threshold}, Best F1 Score: {best_score}")

# Summary across folds.
print(f"\nBest Thresholds per fold: {best_thresholds}")
print(f"Best F1 Scores per fold: {best_scores}")

Fold 1: Best Threshold: 0.13, Best F1 Score: 0.24163969795037757
Fold 2: Best Threshold: 0.11, Best F1 Score: 0.21718602455146363
Fold 3: Best Threshold: 0.1, Best F1 Score: 0.21874999999999997
Fold 4: Best Threshold: 0.12, Best F1 Score: 0.24554039874081848
Fold 5: Best Threshold: 0.11, Best F1 Score: 0.23047619047619045

Best Thresholds per fold: [0.13, 0.11, 0.1, 0.12, 0.11]
Best F1 Scores per fold: [0.24163969795037757, 0.21718602455146363, 0.21874999999999997, 0.24554039874081848, 0.23047619047619045]


In [19]:
# Predict the test set with every fold model, each using its own tuned
# threshold, and combine the resulting 0/1 votes.
fold_votes = []

for fold, fold_model in enumerate(models):
    # Class probabilities for the test rows from this fold's model.
    pred_proba = fold_model.predict_proba(X_test)

    # Turn the class-1 probability into a 0/1 vote with this fold's threshold.
    pred = (pred_proba[:, 1] >= best_thresholds[fold]).astype(int)
    fold_votes.append(pred)

# Majority vote: a row is labelled 1 when at least half of the models vote 1.
vote_share = np.mean(fold_votes, axis=0)
final_predictions = np.where(vote_share >= 0.5, 1, 0)

# Show the final predictions.
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[0 0 0 ... 0 0 0]


In [20]:
# Translate the 0/1 predictions back to the original string labels.
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)
arr_mapped = map_func(final_predictions)

# Read the submission template, fill in the predicted labels and write the
# result back to the same path (the template file is overwritten).
submission_path = "submission.csv"
df_sub = pd.read_csv(submission_path)
df_sub["target"] = arr_mapped
df_sub.to_csv(submission_path, index=False)