# 데이터 불러오기 및 전처리

In [None]:
# Mount Google Drive at /content/drive (Google Colab only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import wandb

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler, RobustScaler
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, roc_auc_score, recall_score, classification_report, silhouette_score, precision_score, accuracy_score
from sklearn.cluster import DBSCAN, KMeans
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold, cross_val_score, train_test_split
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# import matplotlib
# from matplotlib import font_manager

# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv

# fontpaths = ["/usr/share/fonts/truetype/nanum/"]
# font_files = font_manager.findSystemFonts(fontpaths=fontpaths)

# for ff in font_files:
#     font_manager.fontManager.addfont(ff)

# matplotlib.rc('font', family="NanumGothic")
# matplotlib.rcParams['axes.unicode_minus'] = False

In [None]:
# Directory prefix for the dataset CSV files; it lives under /content/drive, so Google Drive must be mounted there.
DATA_PATH = "/content/drive/MyDrive/멋쟁이사자처럼/DataThon/dataset/Moulding/"
# Random seed handed to the data split and to the models.
SEED = 42

In [None]:
# Load the labeled (train / test) and unlabeled CSV files for one product.
# The file names carry the product code: cn7 / rg3.
train = pd.read_csv(f"{DATA_PATH}train_cn7.csv")
test = pd.read_csv(f"{DATA_PATH}test_cn7.csv")
unlab = pd.read_csv(f"{DATA_PATH}unlab_cn7.csv")

# The unlabeled CSV carries an extra "Unnamed: 0" column (presumably a saved index); remove it.
unlab = unlab.drop(columns=["Unnamed: 0"])

# Store the serial as object dtype so it is picked up as a categorical column later.
train['PART_FACT_SERIAL'] = train['PART_FACT_SERIAL'].astype(object)
test['PART_FACT_SERIAL'] = test['PART_FACT_SERIAL'].astype(object)
unlab['PART_FACT_SERIAL'] = unlab['PART_FACT_SERIAL'].astype(object)

labeled = pd.concat([train, test], axis=0)
serial = labeled["PART_FACT_SERIAL"].unique().tolist()

# Keep unlabeled rows only for this equipment, for serials that occur in the
# labeled data, and with Switch_Over_Position equal to 0.
unlab = unlab[
    (unlab["EQUIP_NAME"] == "650톤-우진2호기")
    & unlab["PART_FACT_SERIAL"].isin(serial)
    & (unlab["Switch_Over_Position"] == 0)
]

# Feature frames: cut the concatenated labeled frame back into its train / test rows by position.
train_ft = labeled.iloc[:len(train)].copy()
test_ft = labeled.iloc[len(train):].copy()
unlab_ft = unlab.copy()

# Columns removed from the labeled frames before modelling (identifiers, dates, labels
# and a set of temperature channels).
label_drop_cols = [
    "_id", "TimeStamp", "PART_FACT_PLAN_DATE", "EQUIP_CD", "EQUIP_NAME", "PassOrFail", "Reason",
    'Barrel_Temperature_7',
    'Mold_Temperature_1', 'Mold_Temperature_2', 'Mold_Temperature_5', 'Mold_Temperature_6',
    'Mold_Temperature_7', 'Mold_Temperature_8', 'Mold_Temperature_9', 'Mold_Temperature_10',
    'Mold_Temperature_11', 'Mold_Temperature_12',
]
# For the unlabeled frame: drop whatever the labeled frame does not have, plus the
# columns above that are present in it.
unlab_drop_cols = (
    [col for col in unlab_ft.columns if col not in train_ft.columns]
    + [col for col in label_drop_cols if col in unlab_ft.columns]
)

train_ft = train_ft.drop(columns=label_drop_cols)
test_ft = test_ft.drop(columns=label_drop_cols)
unlab_ft = unlab_ft.drop(columns=unlab_drop_cols)

# Training labels (1 where PassOrFail is 'N') => adjust this code per product
y_train = (train['PassOrFail'] == 'N').astype(int)

# Test labels (1 where PassOrFail is 'N') => adjust this code per product
target = (test['PassOrFail'] == 'N').astype(int)

def preprocess(df, test=False, enc=None, scaler=None):
    """One-hot encode the object columns of ``df`` and scale all resulting features.

    The input frame is left untouched; all work happens on a new frame.

    Args:
        df: Feature DataFrame.
        test: When False, a new ``OneHotEncoder`` and ``RobustScaler`` are fitted
            on ``df``. When True, the supplied ``enc`` and ``scaler`` are reused.
        enc: Already-fitted encoder; required when ``test`` is True.
        scaler: Already-fitted scaler; required when ``test`` is True.

    Returns:
        Tuple ``(features, enc, scaler)``. ``features`` is the value returned by
        ``scaler.transform``; its columns are the non-object columns of ``df``
        followed by the one-hot columns.

    Raises:
        ValueError: If ``test`` is True but ``enc`` or ``scaler`` is missing.
    """
    if test and (enc is None or scaler is None):
        raise ValueError("preprocess(test=True) requires fitted `enc` and `scaler`.")

    obj_col = df.select_dtypes('object').columns

    if not test:
        enc = OneHotEncoder(handle_unknown='ignore')
        enc.fit(df[obj_col])

    # Build the encoded block as its own frame instead of writing into `df`,
    # so the caller's DataFrame is not modified as a side effect.
    encoded = pd.DataFrame(
        enc.transform(df[obj_col]).toarray(),
        columns=enc.get_feature_names_out(),
        index=df.index,
    )
    features = pd.concat([df.drop(columns=obj_col), encoded], axis=1)

    if not test:
        scaler = RobustScaler()
        scaler.fit(features)
    features = scaler.transform(features)

    return features, enc, scaler

# Fit the encoder and scaler on the training features, then reuse the fitted
# objects for the test and unlabeled features.
train_ft, enc, scaler = preprocess(train_ft)
test_ft, _, _ = preprocess(test_ft, test=True, enc=enc, scaler=scaler)
unlab_ft, _, _ = preprocess(unlab_ft, test=True, enc=enc, scaler=scaler)

# Show the resulting shapes as a quick sanity check.
train_ft.shape, test_ft.shape, unlab_ft.shape, y_train.shape, target.shape

((2400, 35), (1574, 35), (10048, 35), (2400,), (1574,))

In [None]:
# Split the labeled test data into a validation part (35%) and a test part (65%),
# stratified on the label.
# NOTE(review): this rebinds `test_ft` and `target`, so running the cell a second
# time splits the already-reduced test part again.
valid_ft, test_ft, valid_target, target = train_test_split(test_ft, target, test_size=0.65, random_state=SEED, shuffle=True, stratify=target)
valid_ft.shape, test_ft.shape, valid_target.shape, target.shape

((550, 35), (1024, 35), (550,), (1024,))

# WandB - DBSCAN

In [None]:
import wandb

In [None]:
# Log in to Weights & Biases for this session.
wandb.login()

True

In [None]:
# wandb.init(
#     project="datathon-dbscan",  # 프로젝트 이름
#     entity="sr279-team" # 팀 이름
# )

In [None]:
# Bayesian sweep over the DBSCAN hyperparameters; the objective is to maximize
# the logged "f1_score".
sweep_config = dict(
    name="dbscan_tunning_01",
    method="bayes",
    metric=dict(name="f1_score", goal="maximize"),
    parameters=dict(
        # neighbourhood radius, sampled uniformly from [8.0, 16.0]
        eps=dict(distribution="uniform", min=8.0, max=16.0),
        # integer sampled uniformly from [25, 40]
        min_samples=dict(distribution="int_uniform", min=25, max=40),
    ),
)

# Register the sweep and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="datathon-dbscan", entity="sr279-team")

Create sweep with ID: g0c77e5v
Sweep URL: https://wandb.ai/sr279-team/datathon-dbscan/sweeps/g0c77e5v


In [None]:
class AgentFunction:
    """Weights & Biases sweep callable that scores DBSCAN as an outlier detector.

    DBSCAN is fitted on ``x``. A row of ``y`` is predicted as an outlier (1) when
    it lies farther than ``eps`` from every core sample found in ``x``, otherwise
    as normal (0). The predictions are compared with ``target`` and the metrics
    are logged to the run.

    DBSCAN has no out-of-sample ``predict``: ``fit_predict(y)`` refits the model
    on ``y`` and throws away whatever was learned from ``x``. The nearest-core-
    sample rule used here is what makes the fit on ``x`` actually drive the
    predictions.

    Args:
        x: Feature matrix DBSCAN is fitted on.
        y: Feature matrix to score.
        target: Ground-truth binary labels for the rows of ``y``.
        seed: Stored on the instance; not used by this agent.
    """

    def __init__(self, x, y, target, seed):
        self.x, self.y, self.target, self.seed = x, y, target, seed

    def __call__(self, config=None):
        """Run one sweep trial: fit, predict, and log the metrics.

        Args:
            config: Optional default config forwarded to ``wandb.init``; the
                values actually used are read from ``wandb.config``.
        """
        with wandb.init(config=config):
            config = wandb.config
            model = DBSCAN(eps=config.eps,
                           min_samples=config.min_samples,
                           metric='euclidean',
                           n_jobs=-1)
            model.fit(self.x)

            pred = self._predict_outliers(model, self.y, config.eps)

            # zero_division=0 keeps the returned value (0) but silences the
            # warning when a trial predicts no outliers at all.
            f1 = f1_score(self.target, pred, zero_division=0)
            precision = precision_score(self.target, pred, zero_division=0)
            recall = recall_score(self.target, pred, zero_division=0)
            roc_auc = roc_auc_score(self.target, pred)
            outlier_cnt = int(np.sum(pred))

            wandb.log({"f1_score": f1,
                       "precision": precision,
                       "recall": recall,
                       "roc_auc": roc_auc,
                       "outlier_cnt": outlier_cnt})

    @staticmethod
    def _predict_outliers(model, points, eps):
        """Return 1 for each row of ``points`` farther than ``eps`` from all core samples, else 0."""
        core_samples = model.components_
        if core_samples.shape[0] == 0:
            # No dense region was found, so nothing can be assigned to a cluster.
            return np.ones(len(points), dtype=int)
        neighbors = NearestNeighbors(n_neighbors=1, metric='euclidean').fit(core_samples)
        nearest_dist, _ = neighbors.kneighbors(points)
        return (nearest_dist[:, 0] > eps).astype(int)

In [None]:
# Launch the sweep agent for up to 2000 trials.
# NOTE(review): trials are scored on test_ft/target, while the valid_ft/valid_target
# split is not used in the cells visible here — confirm that tuning on the test split is intended.
agent_func = AgentFunction(train_ft, test_ft, target, SEED)
wandb.agent(sweep_id, agent_func, count=2000)

In [None]:
# Close any W&B run that is still open in this session.
wandb.finish()

# WandB - KMeans

In [None]:
# wandb.init(
#     project="datathon-kmeans",  # 프로젝트 이름
#     entity="sr279-team" # 팀 이름
# )

In [None]:
# Bayesian sweep over the KMeans hyperparameters and the outlier percentile
# threshold; the objective is to maximize the logged "f1_score".
sweep_config = {
    "name": "kmeans_tunning_01",    # change this name for each new tuning run
    "method": "bayes",
    "metric": {
        "name": "f1_score",
        "goal": "maximize"
    },
    "parameters": {
        "n_clusters": {
            "distribution": "int_uniform",
            "min": 2,
            "max": 10
        },
        "init": {
            "values": ["k-means++", "random"]
        },
        "max_iter": {
            "distribution": "int_uniform",
            "min": 100,
            "max": 500
        },
        "tol": {
            # "log_uniform" interprets min/max as exponents (samples between
            # exp(min) and exp(max)), which would give tol of roughly 1.0.
            # "log_uniform_values" takes the bounds as the actual values.
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-2
        },
        "threshold" : {
            "distribution": "int_uniform",
            "min": 90,
            "max": 99
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project="datathon-kmeans", entity="sr279-team")

In [None]:
class AgentFunction:
    """Weights & Biases sweep callable that scores KMeans as an outlier detector.

    KMeans is fitted on ``x``. Each row of ``y`` is scored by its distance to the
    nearest cluster centre, and rows whose score exceeds the configured
    percentile of those scores are predicted as outliers (1). The predictions
    are compared with ``target`` and the metrics are logged to the run.

    Args:
        x: Feature matrix KMeans is fitted on.
        y: Feature matrix to score.
        target: Ground-truth binary labels for the rows of ``y``.
        seed: ``random_state`` for KMeans.
    """

    def __init__(self, x, y, target, seed):
        self.x, self.y, self.target, self.seed = x, y, target, seed

    def __call__(self, config=None):
        """Run one sweep trial: fit, predict, and log the metrics.

        Args:
            config: Optional default config forwarded to ``wandb.init``; the
                values actually used are read from ``wandb.config``.
        """
        with wandb.init(config=config):
            config = wandb.config
            model = KMeans(n_clusters=config.n_clusters,
                           init=config.init,
                           max_iter=config.max_iter,
                           tol=config.tol,
                           random_state=self.seed)
            model.fit(self.x)

            # transform() returns one column of distances per cluster centre.
            pred = self._flag_outliers(model.transform(self.y), config.threshold)

            # zero_division=0 keeps the returned value (0) but silences the
            # warning when a trial predicts no outliers at all.
            f1 = f1_score(self.target, pred, zero_division=0)
            precision = precision_score(self.target, pred, zero_division=0)
            recall = recall_score(self.target, pred, zero_division=0)
            roc_auc = roc_auc_score(self.target, pred)
            outlier_cnt = int(np.sum(pred))

            wandb.log({"f1_score": f1,
                       "precision": precision,
                       "recall": recall,
                       "roc_auc": roc_auc,
                       "outlier_cnt": outlier_cnt})

    @staticmethod
    def _flag_outliers(distances, percentile):
        """Flag rows whose nearest-centre distance exceeds the given percentile.

        Args:
            distances: 2-D array, one row per sample and one column per cluster centre.
            percentile: Percentile (0-100) of the nearest-centre distances used as cut-off.

        Returns:
            Integer array with 1 for flagged rows and 0 otherwise.
        """
        # Minimum across centres; column 0 alone would be the distance to
        # cluster 0 only, whichever cluster that happens to be.
        min_distances = np.min(distances, axis=1)
        threshold = np.percentile(min_distances, percentile)
        return np.where(min_distances > threshold, 1, 0)

In [None]:
# Launch the sweep agent for up to 2000 trials.
# NOTE(review): trials are scored on test_ft/target, while the valid_ft/valid_target
# split is not used in the cells visible here — confirm that tuning on the test split is intended.
agent_func = AgentFunction(train_ft, test_ft, target, SEED)
wandb.agent(sweep_id, agent_func, count=2000)

In [None]:
# Close any W&B run that is still open in this session.
wandb.finish()

# WandB - Pseudo Labeling DBSCAN

In [None]:
# Log in to Weights & Biases for this session.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33meodnjs190[0m ([33msr279-team[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# wandb.init(
#     project="pseudo-dbscan",  # 프로젝트 이름
#     entity="sr279-team" # 팀 이름
# )

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [None]:
# Bayesian sweep over the pseudo-labeling hyperparameters; the objective is to
# maximize a logged metric named exactly "f1_score".
sweep_config = dict(
    name="pseudo_dbscan_tunning_02",
    method="bayes",
    metric=dict(name="f1_score", goal="maximize"),
    parameters=dict(
        # DBSCAN neighbourhood radius
        eps=dict(distribution="uniform", min=2.0, max=16.0),
        # DBSCAN min_samples
        min_samples=dict(distribution="int_uniform", min=10, max=36),
        # fraction of the unlabeled rows to consume before the labeling loop stops
        usage_size=dict(distribution="uniform", min=0.0, max=1.0),
        # fraction of each cluster taken as pseudo-labeled normal rows per round
        concat_size=dict(distribution="uniform", min=0.001, max=0.1),
        # fraction of the noise rows taken as pseudo-labeled outliers per round
        abnormal_rate=dict(distribution="uniform", min=0.01, max=1.0),
    ),
)

# Register the sweep and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="pseudo-dbscan", entity="sr279-team")

Create sweep with ID: plq7wbme
Sweep URL: https://wandb.ai/sr279-team/pseudo-dbscan/sweeps/plq7wbme


In [None]:
class AgentFunction:
    """Weights & Biases sweep callable: DBSCAN pseudo-labeling followed by supervised models.

    Each round fits DBSCAN on the current training pool and assigns every
    remaining unlabeled row to the cluster of its nearest core sample (noise if
    that core sample is farther than ``eps``). A fraction of each cluster is
    moved into the training pool as pseudo-labeled normal rows and a fraction of
    the noise rows is set aside as pseudo-labeled outliers. Afterwards four
    classifiers are trained on pool (label 0) + outliers (label 1) and evaluated
    on ``test_ft`` against ``target``.

    DBSCAN has no out-of-sample ``predict``: ``fit_predict(unlabeled)`` would
    refit on the unlabeled rows and ignore the training pool, so the nearest-
    core-sample rule is used to let the growing pool drive each round.

    Args:
        train_ft: Initial training feature matrix (NumPy array).
        unlab_ft: Unlabeled feature matrix (NumPy array) to pseudo-label.
        test_ft: Feature matrix the classifiers are evaluated on.
        target: Ground-truth binary labels for the rows of ``test_ft``.
        seed: ``random_state`` for the classifiers.
    """

    def __init__(self, train_ft, unlab_ft, test_ft, target, seed):
        self.x, self.y, self.test_ft, self.target, self.seed = train_ft, unlab_ft, test_ft, target, seed

    def __call__(self, config=None):
        """Run one sweep trial and log the metrics.

        Args:
            config: Optional default config forwarded to ``wandb.init``; the
                values actually used are read from ``wandb.config``.
        """
        with wandb.init(config=config):
            config = wandb.config

            # Labeling continues while at least this many rows remain unlabeled.
            stop_point = self.y.shape[0] * (1 - config.usage_size)
            X_train = self.x.copy()
            not_labeled = self.y.copy()
            outliers_data = np.empty((0, self.y.shape[1]))

            # The `> 0` guard matters when usage_size is 1.0: stop_point is then 0
            # and `>= stop_point` alone stays true for an empty array.
            while not_labeled.shape[0] > 0 and not_labeled.shape[0] >= stop_point:
                model = DBSCAN(eps=config.eps, min_samples=config.min_samples)
                model.fit(X_train)
                clustered_labels, core_distances = self._assign_to_clusters(
                    model, not_labeled, config.eps)

                outliers_idx = np.where(clustered_labels == -1)[0]  # noise (-1)

                # Per cluster, take the rows closest to a core sample.
                normal_top_idx = []
                for cluster_id in np.unique(clustered_labels[clustered_labels != -1]):
                    cluster_points = np.where(clustered_labels == cluster_id)[0]
                    n_normal_top = max(1, int(len(cluster_points) * config.concat_size))
                    top_idx = cluster_points[np.argsort(core_distances[cluster_points])[:n_normal_top]]
                    normal_top_idx.extend(top_idx)
                # Integer dtype even when empty, so it can be used as an index below.
                normal_top_idx = np.asarray(normal_top_idx, dtype=int)

                # NOTE(review): ascending order picks the noise rows *closest* to a
                # core sample; confirm that the farthest ones were not intended.
                n_outliers_top = max(1, int(len(outliers_idx) * config.abnormal_rate))
                outliers_top_idx = outliers_idx[np.argsort(core_distances[outliers_idx])[:n_outliers_top]]

                # Extract the selected normal and outlier rows.
                pseudo_labeled_normal = not_labeled[normal_top_idx, :]
                pseudo_labeled_outliers = not_labeled[outliers_top_idx, :]

                # Grow the pool with the normal rows, set the outliers aside, and
                # remove both from the unlabeled set. Every remaining row is either
                # in a cluster or noise, so at least one row is removed per round.
                X_train = np.vstack([X_train, pseudo_labeled_normal])
                not_labeled = np.delete(not_labeled, np.concatenate([normal_top_idx, outliers_top_idx]), axis=0)
                outliers_data = np.vstack([outliers_data, pseudo_labeled_outliers])

            concat_ft = np.concatenate([X_train, outliers_data], axis=0)
            target_normal, target_outliers = np.zeros(X_train.shape[0]), np.ones(outliers_data.shape[0])
            concat_target = np.concatenate([target_normal, target_outliers], axis=0)
            outliers_cnt = np.sum(concat_target)

            wandb.log({
                "outliers_cnt": outliers_cnt
            })
            # Supervised Learning
            models = {
                "Logistic Regression": LogisticRegression(random_state=self.seed),
                "XGBoost": XGBClassifier(random_state=self.seed),
                "Random Forest": RandomForestClassifier(random_state=self.seed),
                "LightGBM": LGBMClassifier(random_state=self.seed),
            }

            f1_by_model = {}
            for model_name, model in models.items():
                model.fit(concat_ft, concat_target)
                pred = model.predict(self.test_ft)

                # Score against the labels given to this instance, not a notebook global.
                f1 = f1_score(self.target, pred, zero_division=0)
                precision = precision_score(self.target, pred, zero_division=0)
                recall = recall_score(self.target, pred, zero_division=0)
                roc_auc = roc_auc_score(self.target, pred)

                metrics = {
                    f"{model_name} f1_score": f1,
                    f"{model_name} precision": precision,
                    f"{model_name} recall": recall,
                    f"{model_name} roc_auc": roc_auc,
                    f"{model_name} outlier_cnt": np.sum(pred),
                }
                wandb.log(metrics)
                f1_by_model[model_name] = f1

            # A sweep that optimizes "f1_score" needs a metric logged under exactly
            # that key; the per-model keys above do not match it.
            # NOTE(review): best-of-models is used as the objective — confirm, or
            # switch to a specific model's F1.
            wandb.log({"f1_score": max(f1_by_model.values())})

    @staticmethod
    def _assign_to_clusters(model, points, eps):
        """Assign ``points`` to the clusters of a fitted DBSCAN via the nearest core sample.

        Args:
            model: Fitted DBSCAN estimator.
            points: 2-D array of rows to assign.
            eps: Maximum distance to a core sample for a row to join its cluster.

        Returns:
            Tuple ``(labels, distances)``: the cluster label per row (-1 for noise)
            and the distance to the nearest core sample (``inf`` when the model
            has no core samples).
        """
        n_points = points.shape[0]
        core_samples = model.components_
        if core_samples.shape[0] == 0:
            return np.full(n_points, -1), np.full(n_points, np.inf)
        core_labels = model.labels_[model.core_sample_indices_]
        neighbors = NearestNeighbors(n_neighbors=1).fit(core_samples)
        distances, nearest = neighbors.kneighbors(points)
        distances, nearest = distances[:, 0], nearest[:, 0]
        labels = np.where(distances <= eps, core_labels[nearest], -1)
        return labels, distances

In [None]:
# Launch the sweep agent for up to 2000 trials.
# NOTE(review): the classifiers are scored on test_ft/target, while the valid_ft/valid_target
# split is not used in the cells visible here — confirm that tuning on the test split is intended.
agent_func = AgentFunction(train_ft, unlab_ft, test_ft, target, SEED)
wandb.agent(sweep_id, agent_func, count=2000)

In [None]:
# Close any W&B run that is still open in this session.
wandb.finish()