In [4]:
import os

import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score
from sklearn.model_selection import train_test_split

# Configuration
ROOT_DIR = "."
RANDOM_STATE = 110

# Load the raw training data
train_data = pd.read_csv(os.path.join(ROOT_DIR, "train.csv"))

# Preprocess a copy so the raw frame stays available for inspection
df = train_data.copy()
df = df.dropna(axis=1, how='all')  # drop columns in which every value is NaN

# Drop constant columns (exactly one distinct non-null value)
cols_to_remove = [col for col in df.columns if df[col].nunique() == 1]
df = df.drop(columns=cols_to_remove)

# Replace the "OK" marker in this column with NaN
ok_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
df[ok_col] = df[ok_col].replace("OK", np.nan)

# Encode the label ('Normal' -> 0, 'AbNormal' -> 1) BEFORE the categorical cast
# below, and fail loudly on anything else: .map() would otherwise turn an
# unexpected or missing label into NaN without any warning.
label_map = {'Normal': 0, 'AbNormal': 1}
unexpected_labels = set(df['target'].dropna().unique()) - set(label_map)
if unexpected_labels or df['target'].isna().any():
    raise ValueError(
        f"'target' must contain only {sorted(label_map)}; "
        f"found unexpected values {sorted(map(str, unexpected_labels))} "
        f"and {int(df['target'].isna().sum())} missing value(s)"
    )
df['target'] = df['target'].map(label_map)

# Cast low-cardinality feature columns (<= 10 distinct values) to 'category'.
# The label is excluded so it stays a plain 0/1 integer column.
for col in df.columns.drop('target'):
    if df[col].nunique() <= 10:
        df[col] = df[col].astype('category')

# Hold out a stratified 20% validation split (seeded for reproducibility)
features = df.drop(columns=['target'])
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=labels,
)

# AutoGluon expects the label column inside the training frame
train_data_processed = pd.concat([X_train, y_train], axis=1)

# Configure the predictor, then fit it with the 'best_quality' preset
unfitted_predictor = TabularPredictor(
    label='target',
    path='agModels-predictor',
    eval_metric='f1',
    problem_type='binary',
)
predictor = unfitted_predictor.fit(
    train_data=train_data_processed,
    presets='best_quality',
)

# Evaluate the fitted model on the held-out validation split.
# Both series are re-indexed to 0..n-1 so they share the same index; the
# predictions otherwise keep X_test's shuffled index while y_test does not.
y_pred = predictor.predict(X_test).reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Metrics use sklearn's default positive class, i.e. label 1
f1 = f1_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"F1 Score: {f1}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Load the test set and preprocess it the same way as the training features
test_data = pd.read_csv(os.path.join(ROOT_DIR, "test.csv"))
df_test = test_data.copy()
df_test = df_test.drop('Set ID', axis=1)

# Replace the "OK" marker in this column with NaN, as was done for train
ok_col = 'HEAD NORMAL COORDINATE X AXIS(Stage1) Collect Result_Fill1'
df_test[ok_col] = df_test[ok_col].replace("OK", np.nan)

# Keep exactly the columns the model was trained on. Column selection must come
# from the training split: recomputing "all-NaN" / "constant" columns on the
# test set can drop features the predictor requires. Training columns that are
# absent here are added as all-NaN; extra test-only columns are discarded.
df_test = df_test.reindex(columns=X_train.columns)

# Mirror the training-time categorical cast, reusing the training categories.
# Values not present among the training categories become NaN.
for col in X_train.select_dtypes(include='category').columns:
    df_test[col] = df_test[col].astype(X_train[col].dtype)

# Predict on the aligned test features
predictions = predictor.predict(df_test)

# Save the raw predictions keyed by the test frame's row index
output = pd.DataFrame({'id': df_test.index, 'prediction': predictions})
output.to_csv(os.path.join(ROOT_DIR, "predictions.csv"), index=False)

print("추론 완료 및 결과 저장 완료.")

# Read the submission template and fill in the predicted labels
df_sub = pd.read_csv(os.path.join(ROOT_DIR, "submission.csv"))

# Index-aligned assignment would silently write NaN for unmatched rows, so
# require matching row counts and assign positionally instead.
if len(df_sub) != len(output):
    raise ValueError(
        f"submission.csv has {len(df_sub)} rows but there are {len(output)} predictions"
    )

# Map the 0/1 predictions back to the original label strings
target_labels = output['prediction'].map({0: "Normal", 1: "AbNormal"})
if target_labels.isna().any():
    raise ValueError("predictions contain values other than 0 and 1")

df_sub["target"] = target_labels.to_numpy()
df_sub.to_csv(os.path.join(ROOT_DIR, "submission0812.csv"), index=False)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.13
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.5.0: Wed May  1 20:13:18 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T6030
CPU Count:          11
Memory Avail:       4.71 GB / 18.00 GB (26.2%)
Disk Space Avail:   74.61 GB / 460.43 GB (16.2%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacke

KeyboardInterrupt: 