In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

import optuna
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

from functools import partial
from scipy.optimize import minimize

In [2]:
path = '/kaggle/input/playground-series-s4e3/'

# Names of the seven binary fault-indicator columns.
TARGETS = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

# Read the competition splits and drop their 'id' column straight away.
train_data = pd.read_csv(path + 'train.csv').drop(columns=['id'])
test_data = pd.read_csv(path + 'test.csv').drop(columns=['id'])
original_data = pd.read_csv('/kaggle/input/faulty-steel-plates/' + 'faults.csv')

# Append the original data to the training rows and renumber the index 0..n-1.
train_data = pd.concat([train_data, original_data], axis=0, ignore_index=True)

print(train_data.shape)
print(test_data.shape)

(21160, 34)
(12814, 27)


In [3]:
# Keep only rows with at most one fault flag set. The explicit `.copy()` makes
# the filtered frame independent of the frame it was sliced from, so the column
# assignments performed on it afterwards do not raise SettingWithCopyWarning.
train_data = train_data[train_data[TARGETS].sum(axis=1) <= 1].copy()

# Needed in order to use XGB (original author's note): recode the value 0.7 in
# 'Outside_Global_Index' as 0.5.
train_data.loc[train_data['Outside_Global_Index'] == 0.7, 'Outside_Global_Index'] = 0.5

# Snapshot of the fault-indicator columns for the kept rows; `y_xgb` is the
# same object under the name used as the XGBoost training target.
targets_bin = train_data[TARGETS]
y_xgb = targets_bin

In [4]:
# Collapse the seven fault indicators into a single 'Target' label:
# 1..7 is the 1-based position (within TARGETS) of the flagged fault,
# and 0 is assigned to rows where no indicator is set.
fault_flags = train_data[TARGETS]
no_fault = (fault_flags.sum(axis=1) == 0).to_numpy()
train_data['Target'] = np.where(no_fault, 0, fault_flags.values.argmax(axis=1) + 1)

# The indicator columns are no longer needed once 'Target' exists.
train_data = train_data.drop(columns=TARGETS)

In [5]:
# Derived features, computed the same way for the train and the test frame.
#
# Fix: the test frame's 'X_Range*Pixels_Areas' used to be multiplied by
# train_data['Pixels_Areas']. pandas aligns that product on index labels, so
# test rows received pixel areas of unrelated training rows (or NaN where the
# label was missing from train_data). Each frame now uses only its own columns.
for frame in (train_data, test_data):
    frame['Ratio_Length_Thickness'] = frame['Length_of_Conveyer'] / frame['Steel_Plate_Thickness']
    frame['Average_Luminosity'] = frame['Sum_of_Luminosity'] / frame['Pixels_Areas']
    frame['X_Range*Pixels_Areas'] = (frame['X_Maximum'] - frame['X_Minimum']) * frame['Pixels_Areas']

In [6]:
# Columns used as input to the clustering step.
features = ['X_Minimum', 'Y_Minimum', 'Pixels_Areas', 'Sum_of_Luminosity', 'Steel_Plate_Thickness']

# Build and fit the clustering model on the training rows.
# `random_state` is fixed (same seed as the CV splitter) because KMeans
# initialisation is random: without it the cluster ids written to the
# 'Cluster' column can change from one kernel run to the next.
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(train_data[features])

# Cluster id of every training row (assignment made during fitting).
train_data['Cluster'] = kmeans.labels_
# Cluster id of every test row, assigned by the model fitted on train.
test_data['Cluster'] = kmeans.predict(test_data[features])



In [7]:
# Replace the listed columns by log(1 + x) in both frames.
log_columns = ['X_Perimeter', 'Pixels_Areas']
for frame in (train_data, test_data):
    for col in log_columns:
        frame[col] = np.log1p(frame[col])

In [8]:
# Columns removed from both frames before modelling.
features_to_drop = [
    'Y_Minimum',
    'Steel_Plate_Thickness',
    'Sum_of_Luminosity',
    'Edges_X_Index',
    'SigmoidOfAreas',
    'Luminosity_Index',
]

train_data, test_data = (
    frame.drop(columns=features_to_drop) for frame in (train_data, test_data)
)

In [9]:
# Label vector and feature matrix for the multiclass models.
y = train_data['Target']
X = train_data.drop(columns=['Target'])

# Seeded, shuffled 10-fold stratified splitter used by every Optuna objective.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

#### XGB

In [None]:
def objective(trial):
    """Optuna objective for the XGBoost study.

    Samples a parameter set from ``trial``, then for every fold produced by the
    notebook-level splitter ``cv`` fits an ``XGBClassifier`` on the indicator
    targets ``y_xgb`` and scores its predicted probabilities with
    macro-averaged ROC AUC against ``targets_bin``.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.
    """
    # Search space. Only `grow_policy` is sampled; the commented-out entries
    # are currently disabled.
    param = {
        'grow_policy': trial.suggest_categorical('grow_policy', ["depthwise", "lossguide"]),
#         'multi_strategy': trial.suggest_categorical('multi_strategy', ["one_output_per_tree"]),
#         'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
#         'gamma' : trial.suggest_float('gamma', 1e-5, 0.5, log=True),
#         'subsample': trial.suggest_float('subsample', 0.3, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
#         'max_depth': trial.suggest_int('max_depth', 3, 15),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
#         'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
#         'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
#         'n_estimators': 3000,
#         'early_stopping_rounds': 50,
#         'booster': 'gbtree',
#         'tree_method': 'hist'
    }

    def score_fold(fit_rows, held_out_rows):
        """Fit on one fold's training rows and return the AUC on its held-out rows."""
        X_fit, X_held = X.iloc[fit_rows], X.iloc[held_out_rows]
        y_fit, y_held = y_xgb.iloc[fit_rows], y_xgb.iloc[held_out_rows]

        booster = XGBClassifier(**param)
        booster.fit(X_fit, y_fit, eval_set=[(X_held, y_held)], verbose=False)

        # One probability column per fault indicator.
        held_prob = booster.predict_proba(X_held)
        return roc_auc_score(
            targets_bin.iloc[held_out_rows], held_prob, multi_class="ovr", average="macro"
        )

    # Average the fold scores into the single value Optuna maximises.
    return np.mean([score_fold(fit_rows, held_out_rows)
                    for fit_rows, held_out_rows in cv.split(X, y)])


# Run the XGBoost study; `objective` must be the XGBoost objective defined in
# this cell (later cells redefine the same name for other models).
study = optuna.create_study(direction='maximize',study_name = "xgb_model_training")
study.optimize(objective, n_trials=100)  # Adjust the number of trials as necessary

# Output the optimization results
print(f"Best trial average AUC: {study.best_value:.4f}")
print(study.best_params)
for key, value in study.best_params.items():
    # Fix: a stray trailing `/` after this call made the cell a SyntaxError.
    print(f"{key}: {value}")

[I 2024-06-20 07:42:15,414] A new study created in memory with name: xgb_model_training
[I 2024-06-20 07:42:58,511] Trial 0 finished with value: 0.8914799128399071 and parameters: {'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8914799128399071.
[I 2024-06-20 07:43:40,694] Trial 1 finished with value: 0.8914799128399071 and parameters: {'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.8914799128399071.
[I 2024-06-20 07:44:22,900] Trial 2 finished with value: 0.8914799128399071 and parameters: {'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8914799128399071.
[I 2024-06-20 07:45:05,174] Trial 3 finished with value: 0.8914799128399071 and parameters: {'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8914799128399071.
[I 2024-06-20 07:45:47,666] Trial 4 finished with value: 0.8914799128399071 and parameters: {'grow_policy': 'lossguide'}. Best is trial 0 with value: 0.8914799128399071.
[I 2024-06-20 07:46:28,454] Trial 5 finished with value: 0.891

#### CatBoost

In [None]:
def objective(trial):
    """Optuna objective for the CatBoost study.

    Samples a learning rate from ``trial``, then for every fold of the
    notebook-level splitter ``cv`` fits a multiclass ``CatBoostClassifier`` on
    the integer labels ``y`` and scores it with macro-averaged ROC AUC.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.
    """
    # Search space. Only the learning rate is sampled; the commented-out
    # entries are currently disabled.
    param = {
        "loss_function": "MultiClass",
        "eval_metric": "MultiClass",
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
#         'n_estimators': 2000,
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0.001, 10.0, log=True),
#         "depth": trial.suggest_int("depth", 4, 10),
#         "subsample": trial.suggest_float("subsample", 0.4, 1.0),
#         "bootstrap_type": "Bernoulli",
#         "early_stopping_rounds": 100,
#         "task_type": 'CPU',
    }

    def score_fold(fit_rows, held_out_rows):
        """Fit on one fold's training rows and return the AUC on its held-out rows."""
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_out_rows], y.iloc[held_out_rows]

        classifier = CatBoostClassifier(**param)
        classifier.fit(X_fit, y_fit, eval_set=[(X_held, y_held)])

        class_prob = classifier.predict_proba(X_held)
        # Skip the first probability column (label 0, "no fault") so the
        # remaining columns line up with the indicator columns in targets_bin.
        return roc_auc_score(
            targets_bin.iloc[held_out_rows], class_prob[:, 1:], multi_class="ovr", average="macro"
        )

    # Average the fold scores into the single value Optuna maximises.
    return np.mean([score_fold(fit_rows, held_out_rows)
                    for fit_rows, held_out_rows in cv.split(X, y)])

# Run Optuna optimization; `objective` must be the CatBoost objective defined
# in this cell.
catboost_study = optuna.create_study(direction='maximize', study_name="catboost_model_training")
catboost_study.optimize(objective, n_trials=100)  # Adjust the number of trials as necessary

# Output the optimization results.
# Fix: these lines used to read from `study` (the XGBoost study), so they
# reported the XGBoost result instead of the CatBoost one.
print(f"Best average AUC: {catboost_study.best_value:.4f}")
print(catboost_study.best_params)
for key, value in catboost_study.best_params.items():
    print(f"{key}: {value}")

#### LGBM

In [None]:
def objective(trial):
    """Optuna objective for the LightGBM study.

    Samples a learning rate from ``trial``, then for every fold of the
    notebook-level splitter ``cv`` fits a multiclass ``LGBMClassifier`` on the
    integer labels ``y`` and scores it with macro-averaged ROC AUC.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.
    """
    # Imported here so the cell does not depend on an extra top-level import.
    from lightgbm import log_evaluation

    param = {
    'objective': 'multiclass',  # Equivalent to multi:softmax but needs num_class as well
    'num_class': 8,  # Labels 0..7: "no fault" plus the seven fault types
    'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
    'n_estimators': 3000,
#     'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
#     'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
#     'max_depth': trial.suggest_int('max_depth', 3, 15),
#     'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
#     'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#     'min_child_weight': trial.suggest_int('min_child_weight', 1, 8),
#     'device_type': 'cpu',
#     'num_leaves': trial.suggest_int('num_leaves', 4, 2048),
#     "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
#     "verbosity": -1,
#     "early_stopping_rounds": 50,
    }

    auc_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold = X.iloc[train_idx]
        X_valid_fold = X.iloc[valid_idx]
        y_train_fold = y.iloc[train_idx]
        y_valid_fold = y.iloc[valid_idx]

        # Create and fit the model.
        # Fix: `fit(verbose=False)` is no longer accepted by LightGBM >= 4.0
        # (it raises TypeError). The `log_evaluation(period=0)` callback is the
        # supported way to keep per-iteration evaluation output silent.
        model = LGBMClassifier(**param)
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            callbacks=[log_evaluation(period=0)],
        )

        # Predict class probabilities
        y_prob = model.predict_proba(X_valid_fold)

        # Skip the first probability column (label 0, "no fault") so the
        # remaining columns line up with the indicator columns in targets_bin,
        # then compute the macro-averaged AUC.
        average_auc = roc_auc_score(targets_bin.iloc[valid_idx], y_prob[:, 1:], multi_class="ovr", average="macro")
        auc_scores.append(average_auc)

    # Return the average AUC score across all folds
    return np.mean(auc_scores)

# Run the LightGBM study; `objective` must be the LightGBM objective defined
# in this cell.
lgbm_study = optuna.create_study(direction='maximize',study_name = "lgbm_model_training")
lgbm_study.optimize(objective, n_trials=100)  # Adjust the number of trials as necessary

# Output the optimization results.
# Fix: these lines used to read from `study` (the XGBoost study), so they
# reported the XGBoost result instead of the LightGBM one.
print(f"Best average AUC: {lgbm_study.best_value:.4f}")
print(lgbm_study.best_params)
for key, value in lgbm_study.best_params.items():
    print(f"{key}: {value}")

#### HGBM

In [None]:
def objective(trial):
    """Optuna objective for the HistGradientBoostingClassifier study.

    Samples six hyperparameters from ``trial``, then for every fold of the
    notebook-level splitter ``cv`` fits a ``HistGradientBoostingClassifier`` on
    the integer labels ``y`` and scores it with macro-averaged ROC AUC.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean of the per-fold AUC scores.
    """
    # Sampled in the same order as before so trial parameters are recorded identically.
    param = dict(
        learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
        max_iter=trial.suggest_int('max_iter', 100, 2500),  # Equivalent to n_estimators
        max_depth=trial.suggest_int('max_depth', 3, 20),
        l2_regularization=trial.suggest_float('l2_regularization', 1e-8, 10.0, log=True),  # Equivalent to reg_lambda
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 20, 300),
        max_bins=trial.suggest_int('max_bins', 25, 255),
    )

    def score_fold(fit_rows, held_out_rows):
        """Fit on one fold's training rows and return the AUC on its held-out rows."""
        classifier = HistGradientBoostingClassifier(**param)
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        class_prob = classifier.predict_proba(X.iloc[held_out_rows])
        # Skip the first probability column (label 0, "no fault") so the
        # remaining columns line up with the indicator columns in targets_bin.
        return roc_auc_score(
            targets_bin.iloc[held_out_rows], class_prob[:, 1:], multi_class="ovr", average="macro"
        )

    # Average the fold scores into the single value Optuna maximises.
    return np.mean([score_fold(fit_rows, held_out_rows)
                    for fit_rows, held_out_rows in cv.split(X, y)])


# Run the HistGradientBoosting study; `objective` must be the HGBM objective
# defined in this cell.
hgbc_study = optuna.create_study(direction='maximize', study_name="HistGradientBoostingClassifier_model_training")
hgbc_study.optimize(objective, n_trials=100)  # Adjust the number of trials as necessary

# Output the optimization results.
# Fix: these lines used to read from `study` (the XGBoost study), so they
# reported the XGBoost result instead of the HistGradientBoosting one.
print(f"Best average AUC: {hgbc_study.best_value:.4f}")
print(hgbc_study.best_params)
for key, value in hgbc_study.best_params.items():
    print(f"{key}: {value}")