In [1]:
import pandas as pd
import joblib
from autogluon.tabular import TabularPredictor

# 1. Read the train and test CSV files; the 'ID' column becomes the row index.
train, test = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

# 2. Split the 'SUBCLASS' target (y) from the remaining feature columns (X).
y = train['SUBCLASS']
X = train.drop('SUBCLASS', axis=1)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reload the previously processed artifacts.
# The files carry a '.csv' suffix but are read back with joblib.load.
X_encoded_pca = joblib.load('./joblib/X_encoded_pca.csv')
test_encoded_pca = joblib.load('./joblib/test_encoded_pca.csv')

# Wrap both loaded objects in DataFrames.
X_encoded_pca_df = pd.DataFrame(X_encoded_pca)
test_encoded_pca_df = pd.DataFrame(test_encoded_pca)

# Turn the target Series into a one-column frame and replace its 'ID' index
# with a fresh positional index (presumably so it lines up row-by-row with
# X_encoded_pca_df when the two are concatenated -- verify the row order matches).
y_df = pd.DataFrame(y).reset_index(drop=True)

In [3]:
# 3. Train with AutoML (prediction on the test set happens in a later cell).
# Original author's note: AutoGluon handles categorical encoding automatically.
predictor = TabularPredictor(
    label='SUBCLASS',
    problem_type='multiclass',
    path='AutoGluonModels',
)
# Features and target are joined column-wise into a single training frame.
predictor = predictor.fit(
    train_data=pd.concat([X_encoded_pca_df, y_df], axis=1),
    presets='best_quality',
    ag_args_fit={'num_gpus': 1},  # GPU usage setting
)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.8.19
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #169-Ubuntu SMP Tue Jun 6 22:23:09 UTC 2023
CPU Count:          8
Memory Avail:       463.23 GB / 503.56 GB (92.0%)
Disk Space Avail:   1798.24 GB / 1862.65 GB (96.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfitting.
	Running DyStack for up to 900s of the 3

In [4]:
# 4. Run prediction on the preprocessed test features.
predictions = predictor.predict(data=test_encoded_pca_df)

In [5]:
# Save the model: serialize the fitted predictor object with joblib.
# NOTE(review): the predictor was created with path='AutoGluonModels'; confirm
# whether this pickle alone is enough to restore the trained models, or whether
# that directory must also be kept alongside it.
joblib.dump(value=predictor, filename='./joblib/automl_model.joblib')

['./joblib/automl_model.joblib']

In [6]:
# 5. Write the predictions to automl_submission.csv, pairing each test 'ID'
#    with its predicted 'SUBCLASS'. index=False keeps the frame's own row index
#    out of the file, so only the two named columns are written.
submission = pd.DataFrame({'ID': test.index, 'SUBCLASS': predictions})
submission.to_csv('automl_submission.csv', index=False)

# 6. Model performance evaluation (optional).
#    silent=True asks AutoGluon not to print the table itself; the frame is
#    returned and shown below instead.
leaderboard = predictor.leaderboard(silent=True)

# Leave the frame as the cell's final expression so Jupyter renders it as a
# single table. print() on a wide DataFrame wraps the columns into several
# separate text blocks, which makes the rows hard to read across.
leaderboard

                    model  score_val eval_metric  pred_time_val     fit_time  \
0     WeightedEnsemble_L3   0.441380    accuracy      21.168896  2446.497848   
1       LightGBMXT_BAG_L2   0.410256    accuracy      20.405257  2265.798948   
2  NeuralNetFastAI_BAG_L2   0.400258    accuracy      20.721702  1862.998975   
3         LightGBM_BAG_L2   0.340268    accuracy      20.347774  1892.362649   
4     WeightedEnsemble_L2   0.301887    accuracy       1.154048  1464.495476   
5       LightGBMXT_BAG_L1   0.290760    accuracy       0.454375  1296.732733   
6  NeuralNetFastAI_BAG_L1   0.228028    accuracy       0.697115   167.332039   
7         LightGBM_BAG_L1   0.181584    accuracy       0.267574   215.200279   
8   KNeighborsUnif_BAG_L1   0.166425    accuracy       9.334132     1.833763   
9   KNeighborsDist_BAG_L1   0.136752    accuracy       9.207412     1.827935   

   pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  \
0                0.002545           0.626674   