# 제품 이상여부 판별 프로젝트


## 1. 데이터 불러오기


### 필수 라이브러리


In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd

from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from tqdm import tqdm

### 데이터 읽어오기


In [2]:
ROOT_DIR = "data"
random_seed = 110

# Load data
train_data = pd.read_csv("train.csv")
train_data
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]

In [3]:
# 결측치가 있는지 확인
missing_values = train_data.isnull().sum()

drop_columns = missing_values[missing_values > 0].index.to_list()

In [4]:
train_data = train_data.drop(drop_columns, axis=1)
test_data = test_data.drop(drop_columns, axis=1)

In [5]:
train_data['Chamber Temp. Judge Value_AutoClave'] = (train_data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
test_data['Chamber Temp. Judge Value_AutoClave'] = (test_data['Chamber Temp. Judge Value_AutoClave'] == "OK").replace({True: 1, False: 0})

In [6]:
# 원-핫 인코딩을 적용할 컬럼 리스트
cat_cols = ['Equipment_Dam','Equipment_Fill1','Equipment_Fill2','Model.Suffix_Dam', 'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2']
train_data[cat_cols]

Unnamed: 0,Equipment_Dam,Equipment_Fill1,Equipment_Fill2,Model.Suffix_Dam,Model.Suffix_AutoClave,Model.Suffix_Fill1,Model.Suffix_Fill2
0,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
1,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334505,AJX75334505,AJX75334505,AJX75334505
2,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
3,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
4,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
...,...,...,...,...,...,...,...
40501,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40502,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40503,Dam dispenser #1,Fill1 dispenser #1,Fill2 dispenser #1,AJX75334501,AJX75334501,AJX75334501,AJX75334501
40504,Dam dispenser #2,Fill1 dispenser #2,Fill2 dispenser #2,AJX75334501,AJX75334501,AJX75334501,AJX75334501


In [7]:
# 'Equipment_Fill1','Equipment_Fill2' 동기화

target_cols = ['Equipment_Fill1','Equipment_Fill2']
equip_uniq_value = ['Equipment_Fill1','Equipment_Fill2']
for col in target_cols:
    for val in equip_uniq_value:
        train_data[col+'_'+val] = (train_data[col]==val).replace({True: 1, False: 0})
        test_data[col+'_'+val] = (test_data[col]==val).replace({True: 1, False: 0})
    train_data = train_data.drop(col, axis=1)
    test_data = test_data.drop(col, axis=1)

In [8]:
target_cols = ['Model.Suffix_Dam', 'Model.Suffix_AutoClave', 'Model.Suffix_Fill1', 'Model.Suffix_Fill2']
target_values = ['AJX75334505', 'AJX75334501', 'AJX75334502', 'AJX75334507', 'AJX75334506', 'AJX75334508', 'AJX75334503']
for col in target_cols:
    for val in equip_uniq_value:
        train_data[col+'_'+val] = (train_data[col]==val).replace({True: 1, False: 0})
        test_data[col+'_'+val] = (test_data[col]==val).replace({True: 1, False: 0})
    train_data = train_data.drop(col, axis=1)
    test_data = test_data.drop(col, axis=1)

In [9]:
target_cols = ['Equipment_Dam']
target_values = ['Dam dispenser #1', 'Dam dispenser #2']
for col in target_cols:
    for val in equip_uniq_value:
        train_data[col+'_'+val] = (train_data[col]==val).replace({True: 1, False: 0})
        test_data[col+'_'+val] = (test_data[col]==val).replace({True: 1, False: 0})
    train_data = train_data.drop(col, axis=1)
    test_data = test_data.drop(col, axis=1)

In [10]:
# DataFrame에서 object 타입의 컬럼만 뽑아내기
object_columns = train_data.select_dtypes(include=['object']).columns.tolist()

# 추출된 컬럼들 출력
train_data[object_columns]

Unnamed: 0,Workorder_Dam,Workorder_AutoClave,Workorder_Fill1,Workorder_Fill2,target
0,4F1XA938-1,4F1XA938-1,4F1XA938-1,4F1XA938-1,Normal
1,3KPM0016-2,3KPM0016-2,3KPM0016-2,3KPM0016-2,Normal
2,4E1X9167-1,4E1X9167-1,4E1X9167-1,4E1X9167-1,Normal
3,3K1X0057-1,3K1X0057-1,3K1X0057-1,3K1X0057-1,Normal
4,3HPM0007-1,3HPM0007-1,3HPM0007-1,3HPM0007-1,Normal
...,...,...,...,...,...
40501,3J1XF434-2,3J1XF434-2,3J1XF434-2,3J1XF434-2,Normal
40502,4E1XC796-1,4E1XC796-1,4E1XC796-1,4E1XC796-1,Normal
40503,4C1XD438-1,4C1XD438-1,4C1XD438-1,4C1XD438-1,Normal
40504,3I1XA258-1,3I1XA258-1,3I1XA258-1,3I1XA258-1,Normal


In [11]:
# 처리할 컬럼들
target_cols = ['Workorder_Dam', 'Workorder_AutoClave', 'Workorder_Fill1', 'Workorder_Fill2']
for col in target_cols:
    train_data[col] = train_data[col].apply(lambda x: x.split('-')[0] + str(int(x.split('-')[1])))
    test_data[col] = test_data[col].apply(lambda x: x.split('-')[0] + str(int(x.split('-')[1])))
    
# 결과를 저장할 딕셔너리 초기화
result_dict = {}

# 각 문자열을 한 글자씩 분리하여 딕셔너리에 저장
for item in train_data[col].unique():
    result_dict[item] = list(item)
    
df_result = pd.DataFrame(result_dict.values())
df_result

new_train_cols = []
new_test_cols = []

for target_col in target_cols:
    for i, col in enumerate(df_result.columns):
        for char in df_result[col].unique():
            new_train_cols.append((f'{target_col}_{col}_{i}_{char}', (train_data[target_col].str[i]==char).astype(int)))
            new_test_cols.append((f'{target_col}_{col}_{i}_{char}', (test_data[target_col].str[i]==char).astype(int)))

# 새로운 열들을 각각의 DataFrame에 추가
new_train_df = pd.concat([train_data] + [pd.Series(v, name=k) for k, v in new_train_cols], axis=1)
new_test_df = pd.concat([test_data] + [pd.Series(v, name=k) for k, v in new_test_cols], axis=1)

# target_cols 열 삭제
new_train_df = new_train_df.drop(target_cols, axis=1)
new_test_df = new_test_df.drop(target_cols, axis=1)

train_data = new_train_df
test_data = new_test_df

In [12]:
train_data['target'] = train_data['target'].map({'AbNormal':1, 'Normal':0})

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40506 entries, 0 to 40505
Columns: 434 entries, CURE END POSITION X Collect Result_Dam to Workorder_Fill2_8_8_3
dtypes: float64(68), int64(366)
memory usage: 134.1 MB


In [14]:
features = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [15]:
# 기본 설정
n_splits = 5

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

In [16]:
# X_train = train_data.drop('target', axis=1)
# y_train = train_data['target']
# X_test = test_data[X_train.columns]

In [17]:
X_train = train_data[features].drop('target', axis=1)
y_train = train_data['target']
X_test = test_data[X_train.columns]

In [18]:
from sklearn.feature_selection import RFE

In [19]:
scores = []
models = []

In [21]:
for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    X_tr, X_val = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    model = CatBoostClassifier(random_state=101, eval_metric="F1")
    model.fit(X_tr, y_tr,
             eval_set=[(X_val, y_val)],
             early_stopping_rounds=100,
             verbose=500)
    
    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])

Learning rate set to 0.074822
0:	learn: 0.0341262	test: 0.0416667	best: 0.0416667 (0)	total: 62.3ms	remaining: 1m 2s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.09311740891
bestIteration = 104

Shrink model to first 105 iterations.
Learning rate set to 0.074822
0:	learn: 0.0383420	test: 0.0247934	best: 0.0247934 (0)	total: 11.6ms	remaining: 11.6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.134122288
bestIteration = 382

Shrink model to first 383 iterations.
Learning rate set to 0.074822
0:	learn: 0.0372478	test: 0.0291060	best: 0.0291060 (0)	total: 11.3ms	remaining: 11.3s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.1037924152
bestIteration = 349

Shrink model to first 350 iterations.
Learning rate set to 0.074822
0:	learn: 0.0301142	test: 0.0573770	best: 0.0573770 (0)	total: 11.6ms	remaining: 11.6s
500:	learn: 0.1991383	test: 0.1559454	best: 0.1562500 (476)	total: 4.88s	remaining: 4.86s
Stopped by overfitting de

In [22]:
print(scores)
print(np.mean(scores))

[0.09311740890688261, 0.1341222879684418, 0.10379241516966067, 0.16988416988416988, 0.11553784860557768]
0.12329082610694653


In [28]:
# Threshold 값을 테스트할 범위 설정
thresholds = np.arange(0.0, 0.3, 0.01)  # 0.1에서 0.9까지 0.05 간격으로 테스트
best_threshold = 0
best_score = 0
best_scores = []

# 각 threshold에 대해 f1_score 계산
for threshold in thresholds:
    pred_list = []
    scores = []

    for fold, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
        # 예측 확률 계산
        pred = models[fold].predict_proba(X_train.iloc[valid_index])
        
        # Threshold 적용하여 클래스 결정
        pred = np.where(pred[:,1] >= threshold, 1, 0)
        
        # f1_score 계산
        score = f1_score(y_train[valid_index], pred)
        scores.append(score)

    # 현재 threshold에 대한 평균 f1_score 계산
    mean_score = np.mean(scores)
    print(f"Threshold: {threshold}, F1 Score: {mean_score}")

    # 최적의 threshold 업데이트
    if mean_score > best_score:
        best_score = mean_score
        best_threshold = threshold
        best_scores = scores

# 최적의 threshold 및 해당 f1_score 출력
print(f"\nBest Threshold: {best_threshold}")
print(f"Best F1 Score: {best_score}")
print(f"Scores for Best Threshold: {best_scores}")


Threshold: 0.0, F1 Score: 0.1096695914279304
Threshold: 0.01, F1 Score: 0.10980226351010139
Threshold: 0.02, F1 Score: 0.11149246166040787
Threshold: 0.03, F1 Score: 0.12171867218041596
Threshold: 0.04, F1 Score: 0.13949633116510174
Threshold: 0.05, F1 Score: 0.1596768185607508
Threshold: 0.06, F1 Score: 0.1760360374089019
Threshold: 0.07, F1 Score: 0.1868685707946576
Threshold: 0.08, F1 Score: 0.1927477513765698
Threshold: 0.09, F1 Score: 0.19821964979670342
Threshold: 0.1, F1 Score: 0.20716594911299646
Threshold: 0.11, F1 Score: 0.20388355715651169
Threshold: 0.12, F1 Score: 0.1983593909463877
Threshold: 0.13, F1 Score: 0.19781100448784758
Threshold: 0.14, F1 Score: 0.18989345689030032
Threshold: 0.15, F1 Score: 0.18379784528371185
Threshold: 0.16, F1 Score: 0.17840700369198498
Threshold: 0.17, F1 Score: 0.1738811023212193
Threshold: 0.18, F1 Score: 0.16908595473510207
Threshold: 0.19, F1 Score: 0.1664118431069535
Threshold: 0.2, F1 Score: 0.1633776099238417
Threshold: 0.21, F1 Score

In [32]:
# 최적의 threshold를 사용해 X_test 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    pred = models[fold].predict_proba(X_test)
    
    # 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    pred = np.where(pred[:,1] >= best_threshold, 1, 0)
    
    # 예측 결과 저장
    final_predictions.append(pred)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출
# 각 폴드에서의 예측 결과를 평균내고, 이를 기준으로 최종 결론 도출
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= best_threshold, 1, 0)

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)

Final Predictions for X_test:
[0 0 0 ... 0 0 0]


In [33]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)

In [34]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)