In [38]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

import optuna
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from functools import partial
from scipy.optimize import minimize

#### 데이터 불러오기

In [5]:
path = '/kaggle/input/playground-series-s4e3/'

train_data = pd.read_csv(path + 'train.csv')
test_data = pd.read_csv(path + 'test.csv')
original_data = pd.read_csv('/kaggle/input/faulty-steel-plates/' + 'faults.csv')

TARGETS = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

train_data.drop(['id'],axis = 1,inplace = True)
test_data.drop(['id'],inplace = True,axis = 1)

# 원본 데이터 합치기
train_data = pd.concat([train_data,original_data],axis = 0)
train_data.reset_index(drop=True, inplace=True)

print(train_data.shape)
print(test_data.shape)

(21160, 34)

(12814, 27)


#### XGB 따로 처리

In [6]:
train_data = train_data[train_data[TARGETS].sum(axis=1) <= 1]
# XGB 쓰려면 해야됨
train_data['Outside_Global_Index'] = np.where(train_data['Outside_Global_Index']==0.7, 0.5, train_data['Outside_Global_Index'])
targets_bin = train_data[TARGETS]
y_xgb = targets_bin

#### Label encoding

In [7]:
# 7개의 결함 중에서 어느것에 속하는 지 'Target'으로 표시하고 만약 어느것에도 해당하지 아니하면 
# 0이 부여된다.
train_data['Target'] = np.argmax(train_data[TARGETS].values, axis=1) + 1
train_data.loc[train_data[TARGETS].sum(axis=1) == 0, 'Target'] = 0
train_data.drop(TARGETS, inplace=True,axis =1)

#### Derived features

In [9]:
train_data['Ratio_Length_Thickness'] = train_data['Length_of_Conveyer'] / train_data['Steel_Plate_Thickness']
train_data['Average_Luminosity'] = train_data['Sum_of_Luminosity'] / train_data['Pixels_Areas']
train_data['X_Range*Pixels_Areas'] = (train_data['X_Maximum'] - train_data['X_Minimum']) * train_data['Pixels_Areas']

test_data['Ratio_Length_Thickness'] = test_data['Length_of_Conveyer'] / test_data['Steel_Plate_Thickness']
test_data['Average_Luminosity'] = test_data['Sum_of_Luminosity'] / test_data['Pixels_Areas']
test_data['X_Range*Pixels_Areas'] = (test_data['X_Maximum'] - test_data['X_Minimum']) * train_data['Pixels_Areas']

#### 클러스터링

In [12]:
# 클러스터링에 사용할 특성 선택
features = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Sum_of_Luminosity', 'Steel_Plate_Thickness']
# 클러스터링 모델 생성 및 학습
kmeans = KMeans(n_clusters=4)

kmeans.fit(train_data[features])
# train 데이터에 클러스터링 결과 추가
train_data['Cluster'] = kmeans.labels_
# test 데이터에 클러스터링 결과 추가
test_data['Cluster'] = kmeans.predict(test_data[features])

#### 로그 스케일링 (Steel Plate Thickness는 제거함)

In [13]:
# log scaling 
for col in ['X_Perimeter', 'Pixels_Areas']:
    train_data[col] = np.log1p(train_data[col])
    test_data[col] = np.log1p(test_data[col])

#### 피쳐 제거

In [14]:
features_to_drop = ['Y_Minimum', 'Steel_Plate_Thickness', 'Sum_of_Luminosity', 'Edges_X_Index', 'SigmoidOfAreas', 'Luminosity_Index']

train_data = train_data.drop(features_to_drop,axis = 1)
test_data = test_data.drop(features_to_drop,axis = 1)

#### Train-Test set 분리, StratifiedKFold

In [15]:
X = train_data.drop(['Target'], axis=1)  
y = train_data['Target']  
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

#### 하이퍼파라미터

In [16]:
lgbm_params = {
    "objective": "multiclass",
    'num_class': 8,
    "boosting_type": "gbdt",
    "verbosity": -1,
    "early_stopping_rounds": 50,
    'n_estimators': 3000,
    'learning_rate': 0.00711725041768046,
    'lambda_l1': 0.00440985033109004,
    'lambda_l2': 1.0606279361419868e-08,
    'max_depth': 8,
    'colsample_bytree': 0.33443191322905963,
    'subsample': 0.6551114943335209,
    'min_child_weight': 1,
    'num_leaves': 1019,
    'min_child_samples': 94,
}

cat_params = {
    "loss_function": "MultiClass",
    "eval_metric": "MultiClass",
    "bootstrap_type": "Bernoulli",
    "early_stopping_rounds": 100,
    "verbose": False,
    'n_estimators': 2000,
    'learning_rate': 0.018384945397206656,
    'l2_leaf_reg': 0.8746200477430335,
    'depth': 6,
    'colsample_bylevel': 0.1250483657354198,
    'subsample': 0.8794673581253512
}

xgb_params = {
    'grow_policy': 'depthwise',
    'multi_strategy': 'multi_output_tree',
    'n_estimators': 3000,
    'early_stopping_rounds': 50,
    'learning_rate': 0.011991933688453596,
    'gamma': 0.037522403671044376,
    'subsample': 0.8810951837647127,
    'colsample_bytree': 0.36803100807865063,
    'max_depth': 7,
    'min_child_weight': 2,
    'lambda': 0.38010636016035176,
    'alpha': 0.047233571811149226,
}

hgbc_params = {
    'learning_rate': 0.038914759891422,
    'max_iter': 1381,
    'max_depth': 4,
    'l2_regularization': 9.698385966882728e-07,
    'min_samples_leaf': 281,
    'max_bins': 100
}

In [25]:
def compute_xgb(X, y, model, cv):
    cv_oof = pd.DataFrame(np.zeros((len(y), len(TARGETS))), columns=TARGETS)
    prob_predictions_test = []
    
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_xgb_train_fold, y_xgb_valid_fold = y_xgb.iloc[train_idx], y_xgb.iloc[valid_idx]

        model.fit(X_train_fold, y_xgb_train_fold, eval_set=[(X_valid_fold, y_xgb_valid_fold)],verbose=False)
        cv_oof.iloc[valid_idx, :] = model.predict_proba(X_valid_fold)
        prob_predictions_test.append(model.predict_proba(test_data))
    preds = np.mean(prob_predictions_test, axis=0)
    return cv_oof, preds

In [26]:
xgb_oof, xgb_preds = compute_xgb(X, y, XGBClassifier(**xgb_params), cv)

In [27]:
def compute_cat(X, y, model, cv):
    cv_oof = pd.DataFrame(np.zeros((len(y), len(TARGETS))), columns=TARGETS)
    prob_predictions_test = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)])
        cv_oof.iloc[valid_idx, :] = model.predict_proba(X_valid_fold)[:, 1:]
        prob_predictions_test.append(model.predict_proba(test_data)[:, 1:])

    preds = np.mean(prob_predictions_test, axis=0)
    return cv_oof, preds

In [28]:
cat_oof, cat_preds = compute_cat(X, y, CatBoostClassifier(**cat_params), cv)

In [29]:
def compute_lgbm(X, y, model, cv):
    cv_oof = pd.DataFrame(np.zeros((len(y), len(TARGETS))), columns=TARGETS)
    prob_predictions_test = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)])
        cv_oof.iloc[valid_idx, :] = model.predict_proba(X_valid_fold)[:, 1:]
        prob_predictions_test.append(model.predict_proba(test_data)[:, 1:])

    preds = np.mean(prob_predictions_test, axis=0)
    return cv_oof, preds

In [30]:
lgbm_oof, lgbm_preds = compute_lgbm(X, y, LGBMClassifier(**lgbm_params), cv)

In [33]:
def compute_hgbc(X, y, model, cv):
    cv_oof = pd.DataFrame(np.zeros((len(y), len(TARGETS))), columns=TARGETS)
    prob_predictions_test = []
    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model.fit(X_train_fold, y_train_fold)
        cv_oof.iloc[valid_idx, :] = model.predict_proba(X_valid_fold)[:, 1:]
        prob_predictions_test.append(model.predict_proba(test_data)[:, 1:])

    preds = np.mean(prob_predictions_test, axis=0)
    return cv_oof, preds

In [34]:
hgbc_oof, hgbc_preds = compute_hgbc(X, y, HistGradientBoostingClassifier(**hgbc_params), cv)

#### PD to Numpy array

In [35]:
xgb_oof = np.array(xgb_oof)
cat_oof = np.array(cat_oof)
lgbm_oof = np.array(lgbm_oof)
hgbc_oof = np.array(hgbc_oof)

xgb_preds = np.array(xgb_preds)
cat_preds = np.array(cat_preds)
lgbm_preds = np.array(lgbm_preds)
hgbc_preds = np.array(hgbc_preds)

#### 각 모델 성능 확인

In [36]:
xgb_score = roc_auc_score(targets_bin, xgb_oof, multi_class='ovr')
lgbm_score = roc_auc_score(targets_bin, lgbm_oof, multi_class='ovr')
cat_score = roc_auc_score(targets_bin, cat_oof, multi_class='ovr')
hgbc_score = roc_auc_score(targets_bin, hgbc_oof, multi_class='ovr')

print(f"oof roc-auc score for XGB  model: {xgb_score:0.5f}")
print(f"oof roc-auc score for LGBM model: {lgbm_score:0.5f}")
print(f"oof roc-auc score for CAT  model: {cat_score:0.5f}")
print(f"oof roc-auc score for HGBC model: {hgbc_score:0.5f}")

oof roc-auc score for XGB  model: 0.89876

oof roc-auc score for LGBM model: 0.89906

oof roc-auc score for CAT  model: 0.89867

oof roc-auc score for HGBC model: 0.89602


#### Ensemble

In [40]:
def calculate_roc_auc(weights, oof_1, oof_2, oof_3, oof_4, target):
    # Normalize weights
    weights /= np.sum(weights)
    weighted_sum = oof_1 * weights[0] + oof_2 * weights[1] + oof_3 * weights[2] + oof_4 * weights[3]
    # Calculate ROC AUC score
    score = roc_auc_score(target, weighted_sum, multi_class='ovr')
    return -score

Optimal weights for target 0: [0.186, 0.240, 0.574, 0.000]

Optimal weights for target 1: [0.446, 0.390, 0.000, 0.163]

Optimal weights for target 2: [0.329, 0.425, 0.091, 0.155]

Optimal weights for target 3: [0.045, 0.460, 0.493, 0.001]

Optimal weights for target 4: [0.080, 0.583, 0.336, 0.000]

Optimal weights for target 5: [0.313, 0.353, 0.334, 0.000]

Optimal weights for target 6: [0.408, 0.247, 0.211, 0.134]


In [None]:
ensemble = np.zeros((xgb_oof.shape[0], xgb_oof.shape[1]))
preds = np.zeros((xgb_preds.shape[0], xgb_preds.shape[1]))
initial_weights = np.array([0.25, 0.25, 0.25, 0.25])
bounds = [(0, None), (0, None), (0, None), (0, None)]


for k in range(len(TARGETS)):
    result = minimize(partial(calculate_roc_auc,oof_1=xgb_oof[:, k],oof_2=lgbm_oof[:, k],
                              oof_3=cat_oof[:, k],oof_4=hgbc_oof[:, k],target=targets_bin.iloc[:, k]), 
                      initial_weights, 
                      method='Nelder-Mead',
                      bounds=bounds)
    optimal_weights = result.x / np.sum(result.x)
    # Update print statement and calculation for four models
    ensemble[:, k] = (xgb_oof[:, k] * optimal_weights[0] + lgbm_oof[:, k] * optimal_weights[1] +
                   cat_oof[:, k] * optimal_weights[2] + hgbc_oof[:, k] * optimal_weights[3])
    preds[:, k] = (xgb_preds[:, k] * optimal_weights[0] + lgbm_preds[:, k] * optimal_weights[1] +
                   cat_preds[:, k] * optimal_weights[2] + hgbc_preds[:, k] * optimal_weights[3])
    
    formatted_weights = ', '.join(f"{weight:.3f}" for weight in optimal_weights)
    print(f"Optimal weights for target {k}: [{formatted_weights}]")

In [49]:
ensemble_score = roc_auc_score(targets_bin, ensemble, multi_class='ovr')
print(f"roc-auc score for ensemble model: {ensemble_score:0.5f}")

oof roc-auc score for blend of models: 0.90064


#### Submission

In [50]:
submission = pd.read_csv(path + 'sample_submission.csv')
submission.iloc[:, 1:] = preds

In [51]:
submission.to_csv("submission.csv", index=False)

In [52]:
submission

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.487746,0.002019,0.003899,0.000124,0.031033,0.137308,0.363444
1,19220,0.263607,0.016861,0.007353,0.000262,0.137746,0.190877,0.341212
2,19221,0.002069,0.038993,0.051891,0.000682,0.005637,0.319911,0.524919
3,19222,0.141271,0.003744,0.001519,0.001267,0.007119,0.400107,0.409799
4,19223,0.002474,0.004327,0.001934,0.001721,0.003757,0.680521,0.272246
...,...,...,...,...,...,...,...,...
12809,32028,0.076400,0.079212,0.005897,0.000227,0.018152,0.187160,0.510282
12810,32029,0.145548,0.005775,0.019582,0.007301,0.152096,0.214529,0.406209
12811,32030,0.000209,0.000478,0.911281,0.000070,0.000166,0.000696,0.067486
12812,32031,0.418061,0.007533,0.019296,0.000175,0.063564,0.150062,0.312594
