# 데이터 불러오기 및 전처리

In [None]:
# Mount Google Drive (Colab only) so the dataset folder under /content/drive is readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
import wandb

from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, classification_report
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import pairwise_distances

In [None]:
# Folder on the mounted Drive that holds the train / test / unlabeled CSV files.
DATA_PATH = "/content/drive/MyDrive/멋쟁이사자처럼/DataThon/dataset/Moulding/"
# Seed passed as random_state to the split and to every classifier below.
SEED = 42
# Set the WANDB_SILENT environment variable so Weights & Biases keeps its console output quiet.
os.environ["WANDB_SILENT"] = "true"

In [None]:
# Read the labeled train/test files and the unlabeled pool.
train = pd.read_csv(f"{DATA_PATH}train_cn7.csv")    # cn7 / rg3
test = pd.read_csv(f"{DATA_PATH}test_cn7.csv")  # cn7 / rg3
unlab = pd.read_csv(f"{DATA_PATH}unlab_cn7.csv").drop(columns=["Unnamed: 0"])    # cn7 / rg3

# The serial number is an identifier, so store it as object in all three frames
# (this makes it a categorical column for the one-hot encoder later on).
for frame in (train, test, unlab):
    frame['PART_FACT_SERIAL'] = frame['PART_FACT_SERIAL'].astype(object)

labeled = pd.concat([train, test], axis=0)
serial = labeled["PART_FACT_SERIAL"].unique().tolist()

# Keep only unlabeled rows from the same machine, with a serial number that also
# occurs in the labeled data, and with Switch_Over_Position equal to 0.
keep_mask = (
    (unlab["EQUIP_NAME"] == "650톤-우진2호기")
    & unlab["PART_FACT_SERIAL"].isin(serial)
    & (unlab["Switch_Over_Position"] == 0)
)
unlab = unlab[keep_mask]

# Feature frames: the first len(train) rows of `labeled` are the training rows,
# the remainder are the test rows.
n_train = train.shape[0]
train_ft = labeled[:n_train].copy()
test_ft = labeled[n_train:].copy()
unlab_ft = unlab.copy()

# Identifier / label columns and the temperature channels excluded from the features.
label_drop_cols = ["_id", "TimeStamp", "PART_FACT_PLAN_DATE", "EQUIP_CD", "EQUIP_NAME", "PassOrFail", "Reason", 'Barrel_Temperature_7',
             'Mold_Temperature_1', 'Mold_Temperature_2', 'Mold_Temperature_5', 'Mold_Temperature_6', 'Mold_Temperature_7',
             'Mold_Temperature_8', 'Mold_Temperature_9', 'Mold_Temperature_10', 'Mold_Temperature_11', 'Mold_Temperature_12']
# For the unlabeled frame: drop columns the labeled data does not have, plus
# whichever of the columns above it actually contains.
unlab_only_cols = [col for col in unlab_ft.columns if col not in train_ft.columns]
shared_drop_cols = [col for col in label_drop_cols if col in unlab_ft.columns]
unlab_drop_cols = unlab_only_cols + shared_drop_cols

train_ft = train_ft.drop(columns=label_drop_cols).copy()
test_ft = test_ft.drop(columns=label_drop_cols).copy()
unlab_ft = unlab_ft.drop(columns=unlab_drop_cols).copy()

# Binary labels: 1 where PassOrFail is 'N', 0 otherwise.
y_train = train['PassOrFail'].eq('N').astype(int)

target = test['PassOrFail'].eq('N').astype(int)

def preprocess(df, test=False, enc=None, scaler=None):
    """One-hot encode the categorical columns of ``df`` and robust-scale every feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is copied first, so the caller's frame is left untouched.
    test : bool, default False
        ``False`` fits a fresh ``OneHotEncoder`` and ``RobustScaler`` on ``df``.
        ``True`` reuses the already fitted ``enc`` and ``scaler``.
    enc, scaler : fitted transformers, optional
        Required when ``test`` is true; ignored (and replaced) otherwise.

    Returns
    -------
    tuple
        ``(features, enc, scaler)``. ``features`` is whatever ``scaler.transform``
        returns for the frame made of the non-categorical columns followed by
        the one-hot columns.

    Raises
    ------
    ValueError
        If ``test`` is true but ``enc`` or ``scaler`` was not supplied.
    """
    if test and (enc is None or scaler is None):
        raise ValueError("preprocess(test=True) requires a fitted `enc` and a fitted `scaler`")

    # Work on a copy: the column assignment and drop below must not leak into
    # the frame the caller passed in.
    df = df.copy()

    if test:
        # Encode exactly the columns the encoder was fitted on, even if a dtype
        # differs in this frame. Fall back to dtype detection for encoders that
        # carry no feature names.
        fitted_cols = getattr(enc, "feature_names_in_", None)
        obj_col = list(fitted_cols) if fitted_cols is not None else list(df.select_dtypes('object').columns)
    else:
        obj_col = list(df.select_dtypes('object').columns)
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(df[obj_col])

    # Append the one-hot columns, then remove the raw categorical columns.
    df[enc.get_feature_names_out()] = enc.transform(df[obj_col]).toarray()
    df = df.drop(columns=obj_col)

    if not test:
        scaler = RobustScaler()
        scaler.fit(df)
    df = scaler.transform(df)

    return df, enc, scaler

# Fit the encoder and scaler on the training features only, then reuse them
# unchanged for the test features and the unlabeled pool.
train_ft, enc, scaler = preprocess(train_ft)
test_ft, _, _ = preprocess(test_ft, test=True, enc=enc, scaler=scaler)
unlab_ft, _, _ = preprocess(unlab_ft, test=True, enc=enc, scaler=scaler)

# Display the resulting shapes as the cell output.
train_ft.shape, test_ft.shape, unlab_ft.shape, y_train.shape, target.shape

((2400, 35), (1574, 35), (10048, 35), (2400,), (1574,))

In [None]:
# Split the labeled test set, stratified on the label: 35% becomes the
# validation part and 65% stays as the final test part.
# NOTE(review): this cell rebinds `test_ft` and `target`, so running it a second
# time splits the already reduced test part again; re-run the preprocessing cell first.
valid_ft, test_ft, valid_target, target = train_test_split(test_ft, target, test_size=0.65, random_state=SEED, shuffle=True, stratify=target)
valid_ft.shape, test_ft.shape, valid_target.shape, target.shape

((550, 35), (1024, 35), (550,), (1024,))

# WandB - DBSCAN

In [None]:
# Authenticate with Weights & Biases before creating the sweep.
wandb.login()

True

In [None]:
# wandb.init(
#     project="datathon-dbscan",  # 프로젝트 이름
#     entity="sr279-team" # 팀 이름
# )

In [None]:
# Bayesian sweep over the two DBSCAN hyper-parameters, maximising the logged "f1_score".
sweep_config = {
    "name" : "dbscan_tunning_01",
    "method": "bayes",
    "metric": {
        "name": "f1_score",
        "goal": "maximize"
    },
    "parameters": {
        # Neighbourhood radius, read by the agent as config.eps.
        "eps": {
            "distribution": "uniform",
            "min": 8.0,
            "max": 16.0
        },
        # Minimum neighbours for a core point, read by the agent as config.min_samples.
        "min_samples": {
            "distribution": "int_uniform",
            "min": 25,
            "max": 40
        }
    }
}

# Register the sweep; the returned id is what wandb.agent() consumes.
sweep_id = wandb.sweep(sweep_config, project="datathon-dbscan", entity="sr279-team")

Create sweep with ID: g0c77e5v
Sweep URL: https://wandb.ai/sr279-team/datathon-dbscan/sweeps/g0c77e5v


In [None]:
class AgentFunction:
    """Sweep objective: cluster the evaluation features with DBSCAN and treat noise points as defects.

    Each call opens a W&B run, reads ``eps`` and ``min_samples`` from the run
    config, clusters ``y`` and logs F1, precision, recall, ROC-AUC and the
    number of predicted outliers against ``target``.

    Parameters
    ----------
    x : array-like
        Training features. Stored for interface compatibility only: DBSCAN has
        no separate predict step, so ``fit_predict`` on ``y`` clusters ``y``
        from scratch and ``x`` does not influence the prediction.
    y : array-like
        Features that are clustered and scored.
    target : array-like
        Ground-truth labels for ``y`` (1 = defect); must have one label per row of ``y``.
    seed : int
        Stored but not used by DBSCAN.

    Raises
    ------
    ValueError
        If ``y`` and ``target`` do not have the same number of rows.

    NOTE(review): a later cell of this notebook defines another class with the
    same name; whichever cell ran last wins.
    """

    def __init__(self, x, y, target, seed):
        # Fail before the sweep starts rather than inside every single run.
        if len(y) != len(target):
            raise ValueError(
                f"y has {len(y)} rows but target has {len(target)} labels"
            )
        self.x, self.y, self.target, self.seed = x, y, target, seed

    def __call__(self, config=None):
        with wandb.init(config=config):
            config = wandb.config
            model = DBSCAN(eps=config.eps,
                           min_samples=config.min_samples,
                           metric='euclidean',
                           n_jobs=-1)

            # fit_predict() refits the model on the data it receives, so a
            # preceding fit on the training features would simply be discarded.
            pred = model.fit_predict(self.y)
            # DBSCAN marks noise with -1; map noise to the positive (defect) class.
            pred = np.where(pred == -1, 1, 0)

            # zero_division=0 yields the same 0.0 score as the default but
            # without a warning when no positives are predicted.
            f1 = f1_score(self.target, pred, zero_division=0)
            precision = precision_score(self.target, pred, zero_division=0)
            recall = recall_score(self.target, pred, zero_division=0)
            roc_auc = roc_auc_score(self.target, pred)
            outlier_cnt = int(np.sum(pred))

            wandb.log({"f1_score": f1,
                       "precision": precision,
                       "recall": recall,
                       "roc_auc": roc_auc,
                       "outlier_cnt": outlier_cnt})

In [None]:
# Score the DBSCAN sweep on the validation split. The labels must be
# `valid_target` (one per row of `valid_ft`); after the split above `target`
# holds the labels of the remaining test rows and has a different length.
agent_func = AgentFunction(train_ft, valid_ft, valid_target, SEED)
wandb.agent(sweep_id, agent_func, count=2000)

In [None]:
# Close any W&B run that is still open after the sweep.
wandb.finish()

# WandB - Pseudo Labeling with DBSCAN

In [None]:
# Authenticate with Weights & Biases before creating the pseudo-labeling sweep.
wandb.login()

True

In [None]:
# wandb.init(
#     project="pseudo-dbscan",  # 프로젝트 이름
#     entity="sr279-team" # 팀 이름
# )

In [None]:
# Bayesian sweep over the DBSCAN settings (eps, min_samples) and the three
# ratios that steer the pseudo-labeling loop (usage_size, concat_size, abnormal_rate).
# NOTE(review): the optimised metric is "f1_score", while the agent class in this
# section logs per-model keys such as "Logistic Regression f1_score" — confirm that
# a plain "f1_score" key is logged as well, otherwise the optimiser has nothing to read.
sweep_config = {
    "name" : "pseudo_dbscan_tunning_01",
    "method": "bayes",
    "metric": {
        "name": "f1_score",
        "goal": "maximize"
    },
    "parameters": {
        # Neighbourhood radius, read by the agent as config.eps.
        "eps": {
            "distribution": "uniform",
            "min" : 2.0,
            "max" : 20.0
        },
        # Minimum neighbours for a core point, read as config.min_samples.
        "min_samples": {
            "distribution": "int_uniform",
            "min" : 5,
            "max" : 36
        },
        # The loop stops once fewer than (1 - usage_size) of the unlabeled rows remain.
        "usage_size": {
            "distribution": "uniform",
            "min": 0.1,
            "max": 1.0
        },
        # Share of each cluster that is moved into the training set per round.
        "concat_size": {
            "distribution": "uniform",
            "min": 0.001,
            "max": 0.1
        },
        # Share of the noise points that is pseudo-labeled as outliers per round.
        "abnormal_rate": {
            "distribution": "uniform",
            "min": 0.01,
            "max": 1.0
        }
    }
}

# Register the sweep; the returned id is what wandb.agent() consumes.
sweep_id = wandb.sweep(sweep_config, project="pseudo-dbscan", entity="sr279-team")

Create sweep with ID: ozzoanmp
Sweep URL: https://wandb.ai/sr279-team/pseudo-dbscan/sweeps/ozzoanmp


In [None]:
class AgentFunction:
    """Sweep objective: DBSCAN pseudo-labeling followed by supervised evaluation.

    Each call opens a W&B run, repeatedly clusters the unlabeled pool with
    DBSCAN to harvest pseudo-labeled normal and outlier rows, trains four
    classifiers on the enlarged training set and logs their scores on
    ``test_ft`` / ``target``.

    Parameters
    ----------
    train_ft : numpy.ndarray
        Labeled training features; every row is treated as normal (label 0).
    unlab_ft : numpy.ndarray
        Unlabeled feature pool the pseudo-labels are drawn from.
    test_ft, target :
        Features and ground-truth labels used for scoring.
    seed : int
        ``random_state`` for the classifiers.

    NOTE(review): an earlier cell of this notebook defines a different class
    with this same name; this definition replaces it once the cell has run.
    """

    def __init__(self, train_ft, unlab_ft, test_ft, target, seed):
        self.x, self.y, self.test_ft, self.target, self.seed = train_ft, unlab_ft, test_ft, target, seed

    def _pseudo_label(self, config):
        """Run the DBSCAN pseudo-labeling loop and return ``(features, labels)``."""
        # Keep going until fewer than (1 - usage_size) of the pool rows remain.
        stop_point = self.y.shape[0] * (1 - config.usage_size)
        not_labeled = self.y.copy()
        normal_chunks = [self.x.copy()]
        outlier_chunks = []

        # The explicit "> 0" guard stops DBSCAN from being called on an empty
        # pool, which would otherwise happen when stop_point is 0.
        while not_labeled.shape[0] > 0 and not_labeled.shape[0] >= stop_point:
            model = DBSCAN(eps=config.eps, min_samples=config.min_samples)
            # fit_predict() refits on the pool, so no separate fit on the
            # training rows is needed (it would be discarded anyway).
            clustered_labels = model.fit_predict(not_labeled)

            core_samples = model.components_  # core sample data
            if core_samples.shape[0] == 0:
                # No dense region left: there is nothing to measure distances
                # against, so stop harvesting instead of crashing.
                break

            # Distance from every pool row to its nearest core sample.
            core_distances = np.min(pairwise_distances(not_labeled, core_samples), axis=1)

            outliers_idx = np.where(clustered_labels == -1)[0]  # noise (-1)

            # From every cluster take the rows closest to a core sample.
            normal_top_idx = []
            for cluster_id in np.unique(clustered_labels[clustered_labels != -1]):
                cluster_points = np.where(clustered_labels == cluster_id)[0]
                n_normal_top = max(1, int(len(cluster_points) * config.concat_size))
                top_idx = cluster_points[np.argsort(core_distances[cluster_points])[:n_normal_top]]
                normal_top_idx.extend(top_idx)
            normal_top_idx = np.asarray(normal_top_idx, dtype=int)

            # NOTE(review): this keeps the noise points *nearest* to a core
            # sample (ascending sort); confirm that the farthest ones were not intended.
            n_outliers_top = max(1, int(len(outliers_idx) * config.abnormal_rate))
            outliers_top_idx = outliers_idx[np.argsort(core_distances[outliers_idx])[:n_outliers_top]]

            # Move the selected rows out of the pool.
            normal_chunks.append(not_labeled[normal_top_idx, :])
            outlier_chunks.append(not_labeled[outliers_top_idx, :])
            not_labeled = np.delete(not_labeled, np.concatenate([normal_top_idx, outliers_top_idx]), axis=0)

        X_train = np.vstack(normal_chunks)
        outliers_data = np.vstack(outlier_chunks) if outlier_chunks else np.empty((0, self.y.shape[1]))

        concat_ft = np.concatenate([X_train, outliers_data], axis=0)
        concat_target = np.concatenate(
            [np.zeros(X_train.shape[0]), np.ones(outliers_data.shape[0])], axis=0
        )
        return concat_ft, concat_target

    def __call__(self, config=None):
        with wandb.init(config=config):
            config = wandb.config

            concat_ft, concat_target = self._pseudo_label(config)
            outliers_cnt = np.sum(concat_target)

            wandb.log(
                {
                    "outliers_cnt" : outliers_cnt
                    }
                )

            # Supervised Learning
            models = {
                "Logistic Regression": LogisticRegression(random_state=self.seed),
                "XGBoost": XGBClassifier(random_state=self.seed),
                "Random Forest": RandomForestClassifier(random_state=self.seed),
                "LightGBM": LGBMClassifier(random_state=self.seed),
            }

            f1_by_model = {}
            for model_name, model in models.items():
                model.fit(concat_ft, concat_target)
                pred = model.predict(self.test_ft)

                f1 = f1_score(self.target, pred, zero_division=0)
                precision = precision_score(self.target, pred, zero_division=0)
                recall = recall_score(self.target, pred, zero_division=0)
                roc_auc = roc_auc_score(self.target, pred)

                metrics = {
                    f"{model_name} f1_score": f1,
                    f"{model_name} precision": precision,
                    f"{model_name} recall": recall,
                    f"{model_name} roc_auc": roc_auc,
                    f"{model_name} outlier_cnt": np.sum(pred),
                }
                wandb.log(metrics)
                f1_by_model[model_name] = f1

            # The sweep in this notebook optimises a metric called "f1_score",
            # which none of the per-model keys above matches. Log the best
            # per-model F1 under that name so the optimiser has an objective.
            wandb.log({"f1_score": max(f1_by_model.values())})

In [None]:
# Launch the pseudo-labeling sweep.
# NOTE(review): the runs are scored on `test_ft` / `target`, the same split the
# final classification report further down uses, while `valid_ft` / `valid_target`
# are not used here — confirm this is intended, since it lets the test split
# influence the hyper-parameter choice.
agent_func = AgentFunction(train_ft, unlab_ft, test_ft, target, SEED)
wandb.agent(sweep_id, agent_func, count=2000)

In [None]:
# Close any W&B run that is still open after the sweep.
wandb.finish()

# Logistic Coefficient 확인

In [None]:
def pseudo_labeling_dbscan(train_ft, unlab_ft, abnormal_rate=0.25903, concat_size=0.028164, usage_size=0.11064, eps=4.58987, min_samples=35):
    """Grow the training set with DBSCAN pseudo-labels drawn from the unlabeled pool.

    In every round the remaining pool is clustered with DBSCAN. From each
    cluster the rows closest to a core sample are added as normal rows, and a
    share of the noise points is set aside as outliers. Selected rows leave
    the pool, and the loop repeats until the pool has shrunk below
    ``(1 - usage_size)`` of its original size, is empty, or no longer
    contains any core sample.

    Parameters
    ----------
    train_ft : numpy.ndarray
        Labeled training features; every row is treated as normal (label 0).
    unlab_ft : numpy.ndarray
        Unlabeled feature pool with the same number of columns.
    abnormal_rate : float
        Share of a round's noise points that is pseudo-labeled as outliers.
    concat_size : float
        Share of each cluster that is pseudo-labeled as normal (at least one row).
    usage_size : float
        Share of the pool that may be consumed before the loop stops.
    eps, min_samples :
        Passed straight to ``DBSCAN``.

    Returns
    -------
    tuple
        ``(concat_ft, concat_target, outliers_cnt)``: the training rows followed
        by the pseudo-normal rows and then the pseudo-outlier rows, the matching
        0/1 labels, and the number of outlier rows.
    """
    stop_point = unlab_ft.shape[0] * (1 - usage_size)
    not_labeled = unlab_ft.copy()
    normal_chunks = [train_ft.copy()]
    outlier_chunks = []

    # The explicit "> 0" guard stops DBSCAN from being called on an empty pool,
    # which would otherwise happen when stop_point is 0.
    while not_labeled.shape[0] > 0 and not_labeled.shape[0] >= stop_point:
        model = DBSCAN(eps=eps, min_samples=min_samples)
        # fit_predict() refits on the pool, so no separate fit on the training
        # rows is needed (it would be discarded anyway).
        clustered_labels = model.fit_predict(not_labeled)

        core_samples = model.components_  # core sample data
        if core_samples.shape[0] == 0:
            # No dense region left: nothing to measure distances against, so
            # stop harvesting instead of crashing.
            break

        # Distance from every pool row to its nearest core sample.
        core_distances = np.min(pairwise_distances(not_labeled, core_samples), axis=1)

        outliers_idx = np.where(clustered_labels == -1)[0]  # noise (-1)

        # From every cluster take the rows closest to a core sample.
        normal_top_idx = []
        for cluster_id in np.unique(clustered_labels[clustered_labels != -1]):
            cluster_points = np.where(clustered_labels == cluster_id)[0]
            n_normal_top = max(1, int(len(cluster_points) * concat_size))
            top_idx = cluster_points[np.argsort(core_distances[cluster_points])[:n_normal_top]]
            normal_top_idx.extend(top_idx)
        normal_top_idx = np.asarray(normal_top_idx, dtype=int)

        # NOTE(review): this keeps the noise points *nearest* to a core sample
        # (ascending sort); confirm that the farthest ones were not intended.
        n_outliers_top = max(1, int(len(outliers_idx) * abnormal_rate))
        outliers_top_idx = outliers_idx[np.argsort(core_distances[outliers_idx])[:n_outliers_top]]

        # Move the selected rows out of the pool.
        normal_chunks.append(not_labeled[normal_top_idx, :])
        outlier_chunks.append(not_labeled[outliers_top_idx, :])
        not_labeled = np.delete(not_labeled, np.concatenate([normal_top_idx, outliers_top_idx]), axis=0)

    X_train = np.vstack(normal_chunks)
    outliers_data = np.vstack(outlier_chunks) if outlier_chunks else np.empty((0, unlab_ft.shape[1]))

    concat_ft = np.concatenate([X_train, outliers_data], axis=0)
    target_normal, target_outliers = np.zeros(X_train.shape[0]), np.ones(outliers_data.shape[0])
    concat_target = np.concatenate([target_normal, target_outliers], axis=0)
    outliers_cnt = np.sum(concat_target)

    return concat_ft, concat_target, outliers_cnt

In [None]:
# Build the pseudo-labeled training set with the function's default parameters
# and display its shapes together with the number of outlier rows.
concat_ft, concat_target, outliers_cnt = pseudo_labeling_dbscan(train_ft, unlab_ft)
concat_ft.shape, concat_target.shape, outliers_cnt

((3562, 35), (3562,), 107.0)

In [None]:
# 1번 - abnormal_rate=0.26661, concat_size=0.080206, usage_size=0.28521, eps=7.5285, min_samples=25
# f1, precision, recall = 0.76 / 1 / 0.61

# 2번 - abnormal_rate=0.16131, concat_size=0.021646, usage_size=0.13654, eps=7.6763, min_samples=13
# f1, precision, recall = 0.71 / 1 / 0.56

# 3번 - abnormal_rate=0.76389, concat_size=0.040528, usage_size=0.2329, eps=7.83883, min_samples=21
# f1, precision, recall = 0.76 / 1 / 0.61

# 4번 - abnormal_rate=0.25903, concat_size=0.028164, usage_size=0.11064, eps=4.58987, min_samples=35
# f1, precision, recall = 0.73 / 0.92 / 0.61

In [None]:
# Train a logistic regression on the pseudo-labeled set and report its scores
# on the held-out test part.
# NOTE(review): the saved output of this cell shows the solver stopping at its
# iteration limit; consider a larger max_iter before relying on the fitted weights.
model = LogisticRegression(random_state=SEED)
model.fit(concat_ft, concat_target)
pred = model.predict(test_ft)

print(classification_report(target, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1006
           1       0.92      0.61      0.73        18

    accuracy                           0.99      1024
   macro avg       0.95      0.81      0.86      1024
weighted avg       0.99      0.99      0.99      1024



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Names for the 35 feature columns, used below to label the model coefficients.
# NOTE(review): this list is written by hand; presumably it mirrors the column
# order produced by preprocess() (numeric columns first, then the one-hot
# columns) — verify against enc.get_feature_names_out() if the features change.
cols = [
    'Injection_Time', 'Filling_Time', 'Plasticizing_Time', 'Cycle_Time',
    'Clamp_Close_Time', 'Cushion_Position', 'Switch_Over_Position',
    'Plasticizing_Position', 'Clamp_Open_Position', 'Max_Injection_Speed',
    'Max_Screw_RPM', 'Average_Screw_RPM', 'Max_Injection_Pressure',
    'Max_Switch_Over_Pressure', 'Max_Back_Pressure',
    'Average_Back_Pressure', 'Barrel_Temperature_1', 'Barrel_Temperature_2',
    'Barrel_Temperature_3', 'Barrel_Temperature_4', 'Barrel_Temperature_5',
    'Barrel_Temperature_6', 'Hopper_Temperature', 'Mold_Temperature_3',
    'Mold_Temperature_4', 'PART_FACT_SERIAL_9', 'PART_FACT_SERIAL_10',
    'PART_FACT_SERIAL_13', 'PART_FACT_SERIAL_14', 'PART_FACT_SERIAL_21',
    'PART_FACT_SERIAL_22', 'PART_FACT_SERIAL_23', 'PART_FACT_SERIAL_24',
    "PART_NAME_CN7 W/S SIDE MLD'G LH", "PART_NAME_CN7 W/S SIDE MLD'G RH"
]

In [None]:
# Wrap the feature matrix in a DataFrame so each coefficient can be matched to a column name.
concat_coeff = pd.DataFrame(concat_ft, columns=cols)

# Refit the logistic regression on the named features.
model = LogisticRegression(random_state=SEED)
model.fit(concat_coeff, concat_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Pair every fitted weight with its feature name and list them from the most
# positive coefficient down to the most negative one.
coefficients = model.coef_[0]
coef_df = pd.Series(coefficients, index=cols, name='Coefficient').to_frame()
coef_df_sorted = coef_df.sort_values('Coefficient', ascending=False)
coef_df_sorted

Unnamed: 0,Coefficient
Max_Back_Pressure,1.401329
Max_Injection_Speed,1.38706
Filling_Time,1.380212
Plasticizing_Position,0.8784
Mold_Temperature_3,0.506031
PART_NAME_CN7 W/S SIDE MLD'G LH,0.498488
Average_Screw_RPM,0.465418
PART_FACT_SERIAL_24,0.301896
Cycle_Time,0.135872
Plasticizing_Time,0.125295
