In [1]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from tqdm import tqdm
ROOT_DIR = "."
RANDOM_STATE = 110

# Name of the label column in train.csv.
TARGET_COL = "target"
# Feature columns with at most this many distinct non-null values are
# converted to the pandas 'category' dtype.
MAX_CATEGORICAL_UNIQUE = 10
# Column whose "OK" entries are blanked out before the dtype conversion.
OK_VALUE_COL = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'

# Load data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
df = train_data.copy()

# Drop columns that are entirely NaN
df = df.dropna(axis=1, how='all')

# Drop constant columns. nunique() ignores NaN, so a column holding a single
# value plus missing entries is dropped as well.
cols_to_remove = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=cols_to_remove)

print("Columns removed due to having only 1 unique value:", cols_to_remove)

# Replace the "OK" marker with NaN
df[OK_VALUE_COL] = df[OK_VALUE_COL].replace("OK", np.nan)

# Encode the label BEFORE the dtype conversion below, so that the target stays
# a plain integer column instead of being turned into a categorical by the
# low-cardinality rule. Unknown labels raise instead of silently becoming NaN.
target_labels = {'Normal': 0, 'AbNormal': 1}
encoded_target = df[TARGET_COL].map(target_labels)
if encoded_target.isna().any():
    unexpected = df.loc[encoded_target.isna(), TARGET_COL].unique().tolist()
    raise ValueError(f"Unexpected values in '{TARGET_COL}': {unexpected}")
df[TARGET_COL] = encoded_target.astype(int)

# Convert low-cardinality feature columns to categorical (target excluded)
feature_cols = [col for col in df.columns if col != TARGET_COL]
for col in feature_cols:
    unique_count = df[col].nunique()
    if unique_count <= MAX_CATEGORICAL_UNIQUE:
        print(
            f"Column '{col}' has {unique_count} unique values "
            f"(<= {MAX_CATEGORICAL_UNIQUE}). Converting to categorical."
        )
        df[col] = df[col].astype('category')
    else:
        print(f"Column '{col}' has {unique_count} unique values (> {MAX_CATEGORICAL_UNIQUE}).")

# Remaining 'object' columns (high-cardinality strings) also become categorical
object_cols = df.select_dtypes(include=['object']).columns
print("Object columns to be converted:", object_cols)

for col in object_cols:
    df[col] = df[col].astype('category')

# Final training frame consumed by the modelling cell
train_df = df.copy()
print("\nFinal DataFrame:")
# DataFrame.info() prints by itself and returns None, so it is not wrapped in print().
df.info()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Columns removed due to having only 1 unique value: ['Wip Line_Dam', 'Process Desc._Dam', 'Insp. Seq No._Dam', 'Insp Judge Code_Dam', 'CURE STANDBY POSITION X Collect Result_Dam', 'CURE STANDBY POSITION Z Collect Result_Dam', 'CURE STANDBY POSITION Θ Collect Result_Dam', 'CURE START POSITION Z Collect Result_Dam', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Dam', 'Wip Line_AutoClave', 'Process Desc._AutoClave', 'Equipment_AutoClave', 'Insp. Seq No._AutoClave', 'Insp Judge Code_AutoClave', '1st Pressure Judge Value_AutoClave', '2nd Pressure Judge Value_AutoClave', '3rd Pressure Judge Value_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Collect Result_AutoClave', 'GMES_ORIGIN_INSP_JUDGE_CODE Judge Value_AutoClave', 'Wip Line_Fill1', 'Process Desc._Fill1', 'Insp. Seq No._Fill1', 'Insp Judge Code_Fill1', 'HEAD NORMAL COORDINATE X AXIS(Stage1) Judge Value_Fill1', 'Wip Line_Fill2', 'Process Desc._Fill2', 'Insp. Seq No._Fill2', 'Insp Judge Code_Fill2', 'CURE END POSITION Θ Collect Result_Fill

In [2]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, recall_score, accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

def lgb_f1_score(y_true, y_pred):
    """Custom LightGBM evaluation metric: F1 of the positive class.

    'f1' is not one of LightGBM's built-in metric names, so it has to be
    supplied as a callable to ``fit(eval_metric=...)``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        Ground-truth binary labels.
    y_pred : array-like of shape (n_samples,)
        Predicted probability of the positive class.

    Returns
    -------
    tuple
        ``(eval_name, eval_result, is_higher_better)`` as expected by the
        LightGBM scikit-learn API.
    """
    predicted_labels = (np.asarray(y_pred) > 0.5).astype(int)
    # zero_division=0 avoids a warning on rounds where no positive is predicted.
    return "f1", f1_score(y_true, predicted_labels, zero_division=0), True


# Prepare data
X = train_df.drop(columns=['target'])  # features
y = train_df['target']  # labels

# Stratified train / hold-out split, seeded with the notebook-wide RANDOM_STATE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# LGBMClassifier configuration
lgbm_model = lgb.LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    is_unbalance=True,  # re-weight classes automatically for the imbalanced target
    learning_rate=0.01,
    num_leaves=500,
    max_depth=-1,
    # scikit-learn wrapper names for feature_fraction / bagging_fraction /
    # bagging_freq; using them avoids alias-conflict warnings.
    colsample_bytree=0.9,
    subsample=0.8,
    subsample_freq=5,
    n_estimators=1000,  # number of boosting rounds
    random_state=RANDOM_STATE,  # bagging / feature sampling are stochastic
)

# Train the model. The hold-out set is only monitored here (no early stopping),
# so it does not influence the fitted model.
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=lgb_f1_score,
)

lgbm_model.booster_.save_model(os.path.join(ROOT_DIR, "lgbm_model.txt"))

# Predict on the hold-out set
y_pred = lgbm_model.predict(X_test)

# Evaluate
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

OSError: dlopen(/Users/coldbrew/miniconda3/envs/beaver/lib/python3.10/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <D3923ACB-D836-32D3-A031-CF91999FDAFC> /Users/coldbrew/miniconda3/envs/beaver/lib/python3.10/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/Users/coldbrew/miniconda3/envs/beaver/lib/python3.10/lib-dynload/../../libomp.dylib' (no such file), '/Users/coldbrew/miniconda3/envs/beaver/bin/../lib/libomp.dylib' (no such file)

In [None]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.impute import SimpleImputer

# Settings
ROOT_DIR = "."
RANDOM_STATE = 110
TARGET_COL = "target"
# Same low-cardinality rule as the training preprocessing.
MAX_CATEGORICAL_UNIQUE = 10
OK_VALUE_COL = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
# Probability above which a row is labelled as the positive class.
DECISION_THRESHOLD = 0.5

# --- 1. Re-derive the feature schema from the TRAINING data -----------------
# Which columns are dropped and which are categorical must be decided from
# train.csv, not from test.csv: computing these statistics on the test set can
# yield a different column set / categorical set than the model was trained
# on. The schema is rebuilt here (rather than taken from kernel state) so this
# cell also works in a fresh kernel.
train_ref = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))
train_ref = train_ref.dropna(axis=1, how='all')  # columns that are entirely NaN
constant_cols = [col for col in train_ref.columns if train_ref[col].nunique() == 1]
train_ref = train_ref.drop(columns=constant_cols)
train_ref[OK_VALUE_COL] = train_ref[OK_VALUE_COL].replace("OK", np.nan)

# Feature columns in training order (the label is not a feature).
feature_cols = [col for col in train_ref.columns if col != TARGET_COL]

# Categorical features: low-cardinality columns plus remaining object columns.
object_cols = set(train_ref.select_dtypes(include=['object']).columns)
categorical_cols = [
    col for col in feature_cols
    if train_ref[col].nunique() <= MAX_CATEGORICAL_UNIQUE or col in object_cols
]

# --- 2. Load the test data and apply the training schema --------------------
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))

missing_cols = [col for col in feature_cols if col not in test_data.columns]
if missing_cols:
    raise KeyError(f"test.csv is missing feature columns used for training: {missing_cols}")

df = test_data[feature_cols].copy()

# Replace the "OK" marker with NaN, as done for training
df[OK_VALUE_COL] = df[OK_VALUE_COL].replace("OK", np.nan)

# Use the training categories; values unseen in training become NaN (missing).
for col in categorical_cols:
    train_categories = train_ref[col].astype('category').cat.categories
    df[col] = pd.Categorical(df[col], categories=train_categories)

X_infer = df

# --- 3. Load the trained model and predict ----------------------------------
model = lgb.Booster(model_file=os.path.join(ROOT_DIR, "lgbm_model.txt"))

# Predicted probability of the positive class
y_pred = model.predict(X_infer, num_iteration=model.best_iteration)

# Apply the decision threshold
y_pred_binary = (y_pred > DECISION_THRESHOLD).astype(int)

# --- 4. Save the results -----------------------------------------------------
output = pd.DataFrame({'id': X_infer.index, 'prediction': y_pred_binary})
output.to_csv(os.path.join(ROOT_DIR, "predictions.csv"), index=False)

print("추론 완료 및 결과 저장 완료.")