In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler, RobustScaler
import lightgbm as lgb
import warnings
# warnings.filterwarnings('ignore')

In [2]:
ROOT_DIR = "data"
random_state = 110

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_data['target'] = train_data['target'].map({"Normal":0, "AbNormal":1})
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
test_data = test_data[train_data.columns]
train_data.head()

Unnamed: 0,Wip Line_Dam,Process Desc._Dam,Equipment_Dam,Model.Suffix_Dam,Workorder_Dam,Insp. Seq No._Dam,Insp Judge Code_Dam,CURE END POSITION X Collect Result_Dam,CURE END POSITION X Unit Time_Dam,CURE END POSITION X Judge Value_Dam,...,Production Qty Collect Result_Fill2,Production Qty Unit Time_Fill2,Production Qty Judge Value_Fill2,Receip No Collect Result_Fill2,Receip No Unit Time_Fill2,Receip No Judge Value_Fill2,WorkMode Collect Result_Fill2,WorkMode Unit Time_Fill2,WorkMode Judge Value_Fill2,target
0,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,4F1XA938-1,1,OK,240.0,,,...,7,,,127,,,1,,,0
1,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334505,3KPM0016-2,1,OK,240.0,,,...,185,,,1,,,0,,,0
2,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,4E1X9167-1,1,OK,1000.0,,,...,10,,,73,,,1,,,0
3,IVI-OB6,Dam Dispenser,Dam dispenser #2,AJX75334501,3K1X0057-1,1,OK,1000.0,,,...,268,,,1,,,0,,,0
4,IVI-OB6,Dam Dispenser,Dam dispenser #1,AJX75334501,3HPM0007-1,1,OK,240.0,,,...,121,,,1,,,0,,,0


In [3]:
def data_recovery(data):
    mixed_columns = [
        "GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave",
        "GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave",
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill2',
        'Receip No Collect Result_Fill1',
        'Receip No Collect Result_Fill2',
    ]
    
    workmode_cols = [
        'WorkMode Collect Result_Dam',
        'WorkMode Collect Result_Fill1',
        'WorkMode Collect Result_Fill2',
    ]

    dam_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam',
        'HEAD Standby Position X Collect Result_Dam',
        'HEAD Standby Position Y Collect Result_Dam',
        'HEAD Standby Position Z Collect Result_Dam',
        'Head Clean Position X Collect Result_Dam',
        'Head Clean Position Y Collect Result_Dam',
        'Head Clean Position Z Collect Result_Dam',
        'Head Purge Position X Collect Result_Dam',
        'Head Purge Position Y Collect Result_Dam',
        'Head Purge Position Z Collect Result_Dam',
        'Head Zero Position X Collect Result_Dam',
        'Head Zero Position Y Collect Result_Dam',
        'Head Zero Position Z Collect Result_Dam',
        'Machine Tact time Collect Result_Dam',
        'PalletID Collect Result_Dam',
        'Production Qty Collect Result_Dam',
        'Receip No Collect Result_Dam',
        'Stage1 Circle1 Distance Speed Collect Result_Dam',
        'Stage1 Circle2 Distance Speed Collect Result_Dam',
        'Stage1 Circle3 Distance Speed Collect Result_Dam',
        'Stage1 Circle4 Distance Speed Collect Result_Dam',
        'Stage1 Line1 Distance Speed Collect Result_Dam',
        'Stage1 Line2 Distance Speed Collect Result_Dam',
        'Stage1 Line3 Distance Speed Collect Result_Dam',
        'Stage1 Line4 Distance Speed Collect Result_Dam',
        'Stage2 Circle1 Distance Speed Collect Result_Dam',
        'Stage2 Circle2 Distance Speed Collect Result_Dam',
        'Stage2 Circle3 Distance Speed Collect Result_Dam',
        'Stage2 Circle4 Distance Speed Collect Result_Dam',
        'Stage2 Line1 Distance Speed Collect Result_Dam',
        'Stage2 Line2 Distance Speed Collect Result_Dam',
        'Stage2 Line3 Distance Speed Collect Result_Dam',
        'Stage2 Line4 Distance Speed Collect Result_Dam',
        'Stage3 Circle1 Distance Speed Collect Result_Dam',
        'Stage3 Circle2 Distance Speed Collect Result_Dam',
        'Stage3 Circle3 Distance Speed Collect Result_Dam',
        'Stage3 Circle4 Distance Speed Collect Result_Dam',
        'Stage3 Line1 Distance Speed Collect Result_Dam',
        'Stage3 Line2 Distance Speed Collect Result_Dam',
        'Stage3 Line3 Distance Speed Collect Result_Dam',
        'Stage3 Line4 Distance Speed Collect Result_Dam',
        'THICKNESS 1 Collect Result_Dam',
        'THICKNESS 2 Collect Result_Dam',
        'THICKNESS 3 Collect Result_Dam',
        'WorkMode Collect Result_Dam',
    ]

    fill1_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1',
        'HEAD Standby Position X Collect Result_Fill1',
        'HEAD Standby Position Y Collect Result_Fill1',
        'HEAD Standby Position Z Collect Result_Fill1',
        'Head Clean Position X Collect Result_Fill1',
        'Head Clean Position Y Collect Result_Fill1',
        'Head Clean Position Z Collect Result_Fill1',
        'Head Purge Position X Collect Result_Fill1',
        'Head Purge Position Y Collect Result_Fill1',
        'Head Purge Position Z Collect Result_Fill1',
        'Machine Tact time Collect Result_Fill1',
        'PalletID Collect Result_Fill1',
        'Production Qty Collect Result_Fill1',
        'Receip No Collect Result_Fill1',
        'WorkMode Collect Result_Fill1',
    ]

    fill2_cols = [
        'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2',
        'HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2',
        'HEAD Standby Position X Collect Result_Fill2',
        'HEAD Standby Position Y Collect Result_Fill2',
        'HEAD Standby Position Z Collect Result_Fill2',
        'Head Clean Position X Collect Result_Fill2',
        'Head Clean Position Y Collect Result_Fill2',
        'Head Clean Position Z Collect Result_Fill2',
        'Head Purge Position X Collect Result_Fill2',
        'Head Purge Position Y Collect Result_Fill2',
        'Head Purge Position Z Collect Result_Fill2',
        'Machine Tact time Collect Result_Fill2',
        'PalletID Collect Result_Fill2',
        'Production Qty Collect Result_Fill2',
        'Receip No Collect Result_Fill2',
        'WorkMode Collect Result_Fill2',
    ]

    model_cols = [
        'Model.Suffix_Dam',
        'Model.Suffix_AutoClave',
        'Model.Suffix_Fill1',
        'Model.Suffix_Fill2'
    ]
    
    workorder_cols = [
        'Workorder_Dam',
        'Workorder_AutoClave',
        'Workorder_Fill1',
        'Workorder_Fill2',
    ]
    
    condition = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].replace("OK", np.nan).isna()

    data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
    data.loc[condition, fill1_cols] = data[condition][fill1_cols].shift(-1, axis=1)[fill1_cols]
    data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]

    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam'].astype(np.float64)
    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'].astype(np.float64)
    data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'] = data['HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2'].astype(np.float64)
    
    data['Model'] = data[model_cols[0]]
    data['Workorder'] = data[workorder_cols[0]]
    data['Chamber Temp. Judge Value_AutoClave'] = (data['Chamber Temp. Judge Value_AutoClave']=="OK").replace({True: 1, False: 0})
    
    data= data.drop(mixed_columns + workmode_cols + model_cols + workorder_cols, axis=1)
    
    
    return data

train_data = data_recovery(train_data)
test_data = data_recovery(test_data)

  data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
  data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]
  data.loc[condition, dam_cols] = data[condition][dam_cols].shift(-1, axis=1)[dam_cols]
  data.loc[condition, fill2_cols] = data[condition][fill2_cols].shift(-1, axis=1)[fill2_cols]


In [4]:
cols_to_drop = train_data.columns[train_data.nunique(dropna=False) <= 1]
train_data = train_data.drop(cols_to_drop, axis=1)

In [5]:
dam_cols = [
    "Model",
    "Workorder",
    "Equipment_Dam",
    "CURE END POSITION X Collect Result_Dam",
    "CURE END POSITION Z Collect Result_Dam",
    "CURE END POSITION Θ Collect Result_Dam",
    "CURE SPEED Collect Result_Dam",
    "CURE START POSITION X Collect Result_Dam",
    "CURE START POSITION Θ Collect Result_Dam",
    "DISCHARGED SPEED OF RESIN Collect Result_Dam",
    "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Dam",
    "DISCHARGED TIME OF RESIN(Stage2) Collect Result_Dam",
    "DISCHARGED TIME OF RESIN(Stage3) Collect Result_Dam",
    "Dispense Volume(Stage1) Collect Result_Dam",
    "Dispense Volume(Stage2) Collect Result_Dam",
    "Dispense Volume(Stage3) Collect Result_Dam",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Dam",
    "HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Dam",
    "HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Dam",
    "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Dam",
    "Head Clean Position Z Collect Result_Dam",
    "Head Purge Position Z Collect Result_Dam",
    "Head Zero Position Y Collect Result_Dam",
    "Head Zero Position Z Collect Result_Dam",
    "Machine Tact time Collect Result_Dam",
    "PalletID Collect Result_Dam",
    "Production Qty Collect Result_Dam",
    "Receip No Collect Result_Dam",
    "Stage1 Circle1 Distance Speed Collect Result_Dam",
    "Stage1 Circle2 Distance Speed Collect Result_Dam",
    "Stage1 Circle3 Distance Speed Collect Result_Dam",
    "Stage1 Circle4 Distance Speed Collect Result_Dam",
    "Stage1 Line1 Distance Speed Collect Result_Dam",
    "Stage1 Line2 Distance Speed Collect Result_Dam",
    "Stage1 Line3 Distance Speed Collect Result_Dam",
    "Stage1 Line4 Distance Speed Collect Result_Dam",
    "Stage2 Circle1 Distance Speed Collect Result_Dam",
    "Stage2 Circle2 Distance Speed Collect Result_Dam",
    "Stage2 Circle3 Distance Speed Collect Result_Dam",
    "Stage2 Circle4 Distance Speed Collect Result_Dam",
    "Stage2 Line1 Distance Speed Collect Result_Dam",
    "Stage2 Line2 Distance Speed Collect Result_Dam",
    "Stage2 Line3 Distance Speed Collect Result_Dam",
    "Stage2 Line4 Distance Speed Collect Result_Dam",
    "Stage3 Circle1 Distance Speed Collect Result_Dam",
    "Stage3 Circle2 Distance Speed Collect Result_Dam",
    "Stage3 Circle3 Distance Speed Collect Result_Dam",
    "Stage3 Circle4 Distance Speed Collect Result_Dam",
    "Stage3 Line1 Distance Speed Collect Result_Dam",
    "Stage3 Line2 Distance Speed Collect Result_Dam",
    "Stage3 Line3 Distance Speed Collect Result_Dam",
    "Stage3 Line4 Distance Speed Collect Result_Dam",
    "THICKNESS 1 Collect Result_Dam",
    "THICKNESS 2 Collect Result_Dam",
    "THICKNESS 3 Collect Result_Dam"
]


clave_cols = [
    "Model",
    "Workorder",
    "1st Pressure Collect Result_AutoClave",
    "1st Pressure 1st Pressure Unit Time_AutoClave",
    "2nd Pressure Collect Result_AutoClave",
    "2nd Pressure Unit Time_AutoClave",
    "3rd Pressure Collect Result_AutoClave",
    "3rd Pressure Unit Time_AutoClave",
    "Chamber Temp. Collect Result_AutoClave",
    "Chamber Temp. Unit Time_AutoClave",
    "Chamber Temp. Judge Value_AutoClave"
]

fill1_cols = [
    "Model",
    "Workorder",
    "Equipment_Fill1",
    "DISCHARGED SPEED OF RESIN Collect Result_Fill1",
    "DISCHARGED TIME OF RESIN(Stage1) Collect Result_Fill1",
    "DISCHARGED TIME OF RESIN(Stage2) Collect Result_Fill1",
    "DISCHARGED TIME OF RESIN(Stage3) Collect Result_Fill1",
    "Dispense Volume(Stage1) Collect Result_Fill1",
    "Dispense Volume(Stage2) Collect Result_Fill1",
    "Dispense Volume(Stage3) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill1",
    "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill1",
    "Head Purge Position Z Collect Result_Fill1",
    "Machine Tact time Collect Result_Fill1",
    "PalletID Collect Result_Fill1",
    "Production Qty Collect Result_Fill1"
]

fill2_cols = [
    "Model",
    "Workorder",
    "Equipment_Fill2",
    "CURE END POSITION X Collect Result_Fill2",
    "CURE END POSITION Z Collect Result_Fill2",
    "CURE SPEED Collect Result_Fill2",
    "CURE STANDBY POSITION Z Collect Result_Fill2",
    "CURE START POSITION X Collect Result_Fill2",
    "CURE START POSITION Z Collect Result_Fill2",
    "HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE X AXIS(Stage2) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE X AXIS(Stage3) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Y AXIS(Stage1) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Y AXIS(Stage2) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Y AXIS(Stage3) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Z AXIS(Stage1) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Z AXIS(Stage2) Collect Result_Fill2",
    "HEAD NORMAL COORDINATE Z AXIS(Stage3) Collect Result_Fill2",
    "Head Purge Position Z Collect Result_Fill2",
    "Machine Tact time Collect Result_Fill2",
    "PalletID Collect Result_Fill2",
    "Production Qty Collect Result_Fill2"
]


In [6]:
X_train_dam = train_data[dam_cols]
X_train_clave = train_data[clave_cols]
X_train_fill1 = train_data[fill1_cols]
X_train_fill2 = train_data[fill2_cols]

In [7]:
def drop_one_value_col(data):
    cols_to_drop = data.columns[data.nunique(dropna=False) <= 1]
    return data.drop(cols_to_drop, axis=1)

In [8]:
X_train_dam = drop_one_value_col(X_train_dam)
X_train_clave = drop_one_value_col(X_train_clave)
X_train_fill1 = drop_one_value_col(X_train_fill1)
X_train_fill2 = drop_one_value_col(X_train_fill2)

y_train = train_data['target']

X_test_dam = test_data[X_train_dam.columns]
X_test_clave = test_data[X_train_clave.columns]
X_test_fill1 = test_data[X_train_fill1.columns]
X_test_fill2 = test_data[X_train_fill2.columns]

In [9]:
# `object` 타입의 컬럼을 `category` 타입으로 변환
dam_object_columns = X_train_dam.select_dtypes(include=['object']).columns.tolist()
X_train_dam[dam_object_columns] = X_train_dam[dam_object_columns].astype(str).astype('category')
X_test_dam[dam_object_columns] = X_test_dam[dam_object_columns].astype(str).astype('category')
print(dam_object_columns)

['Model', 'Workorder', 'Equipment_Dam']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_dam[dam_object_columns] = X_test_dam[dam_object_columns].astype(str).astype('category')


In [10]:
# `object` 타입의 컬럼을 `category` 타입으로 변환

clave_object_columns = X_train_clave.select_dtypes(include=['object']).columns.tolist()

X_train_clave[clave_object_columns] = X_train_clave[clave_object_columns].astype(str).astype('category')

X_test_clave[clave_object_columns] = X_test_clave[clave_object_columns].astype(str).astype('category')

print(clave_object_columns)

['Model', 'Workorder']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_clave[clave_object_columns] = X_test_clave[clave_object_columns].astype(str).astype('category')


In [11]:
# `object` 타입의 컬럼을 `category` 타입으로 변환

fill1_object_columns = X_train_fill1.select_dtypes(include=['object']).columns.tolist()

X_train_fill1[fill1_object_columns] = X_train_fill1[fill1_object_columns].astype(str).astype('category')

X_test_fill1[fill1_object_columns] = X_test_fill1[fill1_object_columns].astype(str).astype('category')

print(fill1_object_columns)

['Model', 'Workorder', 'Equipment_Fill1']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fill1[fill1_object_columns] = X_test_fill1[fill1_object_columns].astype(str).astype('category')


In [12]:
# `object` 타입의 컬럼을 `category` 타입으로 변환

fill2_object_columns = X_train_fill2.select_dtypes(include=['object']).columns.tolist()

X_train_fill2[fill2_object_columns] = X_train_fill2[fill2_object_columns].astype(str).astype('category')

X_test_fill2[fill2_object_columns] = X_test_fill2[fill2_object_columns].astype(str).astype('category')

print(fill2_object_columns)

['Model', 'Workorder', 'Equipment_Fill2']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_fill2[fill2_object_columns] = X_test_fill2[fill2_object_columns].astype(str).astype('category')


In [13]:
# 기본 설정
n_splits = 10

# Stratified KFold 설정
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

In [14]:
def f1_metric(y_pred, data, threshold=0.5):
    y_true = data.get_label()
    y_pred = (y_pred >= threshold).astype(int)  # 임계값 적용
    return 'f1', f1_score(y_true, y_pred), True

In [None]:
multi_scores = []
multi_models = []

for fold, (train_index, valid_index) in enumerate(skf.split(X_train_dam, y_train)):
    models = [ ]
    scores = { }
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]

    X_tr_dam, X_val_dam = X_train_dam.iloc[train_index], X_train_dam.iloc[valid_index]
    X_tr_clave, X_val_clave = X_train_clave.iloc[train_index], X_train_clave.iloc[valid_index]
    X_tr_fill1, X_val_fill1 = X_train_fill1.iloc[train_index], X_train_fill1.iloc[valid_index]
    X_tr_fill2, X_val_fill2 = X_train_fill2.iloc[train_index], X_train_fill2.iloc[valid_index]
    
    train_set_dam = lgb.Dataset(X_tr_dam, label=y_tr, categorical_feature=dam_object_columns, free_raw_data=False)
    valid_set_dam = lgb.Dataset(X_val_dam, label=y_val, categorical_feature=dam_object_columns, reference=train_set_dam, free_raw_data=False)
    train_set_clave = lgb.Dataset(X_tr_clave, label=y_tr, categorical_feature=clave_object_columns, free_raw_data=False)
    valid_set_clave = lgb.Dataset(X_val_clave, label=y_val, categorical_feature=clave_object_columns, reference=train_set_clave, free_raw_data=False)
    train_set_fill1 = lgb.Dataset(X_tr_fill1, label=y_tr, categorical_feature=fill1_object_columns, free_raw_data=False)
    valid_set_fill1 = lgb.Dataset(X_val_fill1, label=y_val, categorical_feature=fill1_object_columns, reference=train_set_fill1, free_raw_data=False)
    train_set_fill2 = lgb.Dataset(X_tr_fill2, label=y_tr, categorical_feature=fill2_object_columns, free_raw_data=False)
    valid_set_fill2 = lgb.Dataset(X_val_fill2, label=y_val, categorical_feature=fill2_object_columns, reference=train_set_fill2, free_raw_data=False)
    
    params = {
        "objective": "binary",  # 이진 분류의 경우
        "boosting_type": "gbdt",
#         "learning_rate": 0.05,
        "random_state": random_state,
        "verbose": -1,
        "metric": "None",  # 기본 메트릭을 사용하지 않도록 설정
        'importance_type': "gain",
        "n_jobs":-1,
    }

    # DAM 모델 훈련

    model_dam = lgb.train(

        params,

        train_set_dam,

        valid_sets=[valid_set_dam],

        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),

        num_boost_round=4000,

        callbacks=[

            lgb.early_stopping(stopping_rounds=1000),

            lgb.log_evaluation(500)

        ]

    )

    models.append(model_dam)

    scores['dam'] = model_dam.best_score['valid_0']['f1']  # F1 스코어 저장

    

    # Clave 모델 훈련

    model_clave = lgb.train(

        params,

        train_set_clave,

        valid_sets=[valid_set_clave],

        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),

        num_boost_round=4000,

        callbacks=[

            lgb.early_stopping(stopping_rounds=1000),

            lgb.log_evaluation(500)

        ]

    )

    models.append(model_clave)

    scores['clave'] = model_clave.best_score['valid_0']['f1']  # F1 스코어 저장

    # Fill1 모델 훈련

    model_fill1 = lgb.train(

        params,

        train_set_fill1,

        valid_sets=[valid_set_fill1],

        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),

        num_boost_round=4000,

        callbacks=[

            lgb.early_stopping(stopping_rounds=1000),

            lgb.log_evaluation(500)

        ]

    )

    models.append(model_fill1)

    scores['fill1'] = model_fill1.best_score['valid_0']['f1']  # F1 스코어 저장

    # Fill2 모델 훈련

    model_fill2 = lgb.train(

        params,

        train_set_fill2,

        valid_sets=[valid_set_fill2],

        feval=lambda y_pred, data: f1_metric(y_pred, data, threshold=0.5),

        num_boost_round=4000,

        callbacks=[

            lgb.early_stopping(stopping_rounds=1000),

            lgb.log_evaluation(500)

        ]

    )

    models.append(model_fill2)
    scores['fill2'] = model_fill2.best_score['valid_0']['f1']
    
    multi_scores .append(scores)
    multi_models.append(models)

Training until validation scores don't improve for 1000 rounds


In [None]:
print(multi_scores)

In [None]:
values_2d_array = np.array([[d['dam'], d['clave'], d['fill1'], d['fill2']] for d in multi_scores])

score_per_model = np.mean(values_2d_array, axis=0)
weight_per_model = score_per_model / score_per_model.sum()

In [None]:
# 각 모델별로 최적의 threshold와 관련된 정보를 저장할 리스트 초기화
thresholds = np.arange(-0.01, 1.0, 0.01)
multi_best_thresholds = []
multi_fold_best_scores = []
f1_scores_per_group = []
total_best_thresholds = []
total_best_scores = []

# 전체 데이터를 위한 결과 저장용 DataFrame
results_df = pd.DataFrame()

# 각 threshold에 대해 f1_score 계산
for fold, (train_index, valid_index) in enumerate(skf.split(X_train_dam, y_train)):
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[valid_index]
    X_tr_dam, X_val_dam = X_train_dam.iloc[train_index], X_train_dam.iloc[valid_index]
    X_tr_clave, X_val_clave = X_train_clave.iloc[train_index], X_train_clave.iloc[valid_index]
    X_tr_fill1, X_val_fill1 = X_train_fill1.iloc[train_index], X_train_fill1.iloc[valid_index]
    X_tr_fill2, X_val_fill2 = X_train_fill2.iloc[train_index], X_train_fill2.iloc[valid_index]
    
    dam_best_threshold = 0
    dam_best_score = 0
    clave_best_threshold = 0
    clave_best_score = 0
    fill1_best_threshold = 0
    fill1_best_score = 0
    fill2_best_threshold = 0
    fill2_best_score = 0
    
    dam_pred_proba = multi_models[fold][0].predict(X_val_dam)
    clave_pred_proba = multi_models[fold][1].predict(X_val_clave)
    fill1_pred_proba = multi_models[fold][2].predict(X_val_fill1)
    fill2_pred_proba = multi_models[fold][3].predict(X_val_fill2)
    
    best_thresholds = []
    fold_best_scores = []
    for threshold in thresholds:
        # Threshold 적용하여 클래스 결정
        dam_pred = np.where(dam_pred_proba >= threshold, 1, 0)
        clave_pred = np.where(clave_pred_proba >= threshold, 1, 0)
        fill1_pred = np.where(fill1_pred_proba >= threshold, 1, 0)
        fill2_pred = np.where(fill2_pred_proba >= threshold, 1, 0)
        
        # f1_score 계산
        dam_score = f1_score(y_val, dam_pred)
        clave_score = f1_score(y_val, clave_pred)
        fill1_score = f1_score(y_val, fill1_pred)
        fill2_score = f1_score(y_val, fill2_pred)

        # 최적의 threshold 업데이트
        if dam_score > dam_best_score:
            dam_best_score = dam_score
            dam_best_threshold = threshold
        if clave_score > clave_best_score:
            clave_best_score = clave_score
            clave_best_threshold = threshold
        if fill1_score > fill1_best_score:
            fill1_best_score = fill1_score
            fill1_best_threshold = threshold
        if fill2_score > fill2_best_score:
            fill2_best_score = fill2_score
            fill2_best_threshold = threshold
    
    # 해당 Fold에 대한 최적의 threshold와 F1 스코어 저장
    multi_best_thresholds.append([dam_best_threshold, clave_best_threshold, fill1_best_threshold, fill2_best_threshold])
    multi_fold_best_scores.append([dam_best_score, clave_best_score, fill1_best_score, fill2_best_score])
    
    y_pred_dam = (dam_pred_proba >= dam_best_threshold).astype(int)
    y_pred_clave = (clave_pred_proba >= clave_best_threshold).astype(int)
    y_pred_fill1 = (fill1_pred_proba >= fill1_best_threshold).astype(int)
    y_pred_fill2 = (fill2_pred_proba >= fill2_best_threshold).astype(int)
    
    total_best_score = 0
    total_best_threshold = 0
    for threshold in thresholds:
        final_prediction = (
            weight_per_model[0] * y_pred_dam +
            weight_per_model[1] * y_pred_clave +
            weight_per_model[2] * y_pred_fill1 +
            weight_per_model[3] * y_pred_fill2
        ) >= threshold
        final_prediction = final_prediction.astype(int)
        score = f1_score(y_val, final_prediction)
        
        if score > total_best_score:
            total_best_score = score
            total_best_threshold = threshold

    total_best_thresholds.append(total_best_threshold)
    total_best_scores.append(total_best_score)
    
# 각 Fold별 최적의 threshold와 F1 스코어 출력
print(f"\nBest Thresholds per fold: {multi_best_thresholds}")
# print(f"Best F1 Scores per fold: {multi_fold_best_scores}")
print(f"Mean F1 Score : {np.mean(multi_fold_best_scores, axis=0)}")
print(f"Total Mean F1 Score : {np.mean(f1_scores_per_group)}")
print(f"\nBest Thresholds per fold: {total_best_thresholds}")
print(f"Best F1 Scores per fold: {total_best_scores}")

In [None]:
# Test 데이터에 대해 최적의 threshold를 적용한 예측
final_predictions = []

for fold in range(len(models)):
    # X_test에 대한 예측 확률 계산
    dam_pred_proba = multi_models[fold][0].predict(X_test_dam)
    clave_pred_proba = multi_models[fold][1].predict(X_test_clave)
    fill1_pred_proba = multi_models[fold][2].predict(X_test_fill1)
    fill2_pred_proba = multi_models[fold][3].predict(X_test_fill2)
    
    dam_best_threshold, clave_best_threshold, fill1_best_threshold, fill2_best_threshold = multi_best_thresholds[fold]
    # 각 fold에 해당하는 최적의 threshold 적용하여 클래스 결정 (0 또는 1)
    y_pred_dam = (dam_pred_proba >= dam_best_threshold).astype(int)
    y_pred_clave = (clave_pred_proba >= clave_best_threshold).astype(int)
    y_pred_fill1 = (fill1_pred_proba >= fill1_best_threshold).astype(int)
    y_pred_fill2 = (fill2_pred_proba >= fill2_best_threshold).astype(int)
    
    final_prediction = (
        weight_per_model[0] * y_pred_dam +
        weight_per_model[1] * y_pred_clave +
        weight_per_model[2] * y_pred_fill1 +
        weight_per_model[3] * y_pred_fill2
    ) >= total_best_thresholds[fold]
    final_prediction = final_prediction.astype(int)
    
    # 예측 결과 저장
    final_predictions.append(final_prediction)

# 모든 폴드의 예측 결과를 합쳐서 최종 결론 도출 (평균)
final_predictions = np.mean(final_predictions, axis=0)
final_predictions = np.where(final_predictions >= 0.5, 1, 0)  # 평균이 0.5 이상인 경우 1로 결정

# 최종 예측 결과 출력
print("Final Predictions for X_test:")
print(final_predictions)
print(final_predictions.sum())

In [None]:
# 매핑 함수 정의
mapping = {1: 'AbNormal', 0: 'Normal'}
map_func = np.vectorize(mapping.get)

# 배열에 매핑 함수 적용
arr_mapped = map_func(final_predictions)
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["target"] = arr_mapped

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)