<a href="https://colab.research.google.com/github/3000minsu/2023-LG-Aimers/blob/main/5_tabnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [150]:
pip install pytorch-tabnet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [151]:
import numpy as np
import pandas as pd
import random
import os

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from matplotlib import pyplot as plt

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from pytorch_tabnet.tab_model import TabNetClassifier

In [152]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and all CUDA devices), and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The model trained below is a PyTorch model, so torch's generators must be
    # seeded as well; otherwise torch-side randomness is left unseeded by this helper.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # silently ignored when CUDA is unavailable
    # Ask cuDNN for deterministic kernels instead of auto-tuned ones.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(37)  # fix the seed

In [153]:
from google.colab import drive

# Make Google Drive visible under /content/drive so the CSVs can be read.
drive.mount('/content/drive')

# Read the train and test tables from the Drive root, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/train.csv',
        '/content/drive/MyDrive/test.csv',
    )
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [154]:
# Model inputs: drop the identifier/timestamp columns and both target columns,
# then replace every missing value with 0.
train = (
    train_df
    .drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
    .fillna(0)
)
# Classification target.
train_y = train_df['Y_Class']

# The test table has no target columns, so only the identifier/timestamp go.
test = (
    test_df
    .drop(columns=['PRODUCT_ID', 'TIMESTAMP'])
    .fillna(0)
)

In [155]:

# Tag each row as "train" (p=0.8) or "valid" (p=0.2). The guard keeps an
# existing "Set" column, so re-running the cell does not reshuffle the split.
if "Set" not in train.columns:
    train["Set"] = np.random.choice(
        ["train", "valid"],
        size=(len(train),),
        p=[.8, .2],
    )

# Index labels of the rows belonging to each split.
train_indices = train.index[(train["Set"] == "train").to_numpy()]
valid_indices = train.index[(train["Set"] == "valid").to_numpy()]

In [156]:
nunique = train.nunique()  # not used below in this cell; kept in case later cells read it
types = train.dtypes

# Label-encode every object-typed feature column (the "Set" split marker is
# skipped) and remember each column's number of classes.
categorical_columns = []
categorical_dims = {}
for col in train.columns:
    if types[col] == 'object' and col != 'Set':
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = l_enc.fit_transform(train[col].values)
        # The encoder is fitted on train only and then applied to test.
        # NOTE(review): LabelEncoder.transform is documented to fail on labels it
        # has not seen during fit — confirm test has no train-unseen categories.
        test[col] = l_enc.transform(test[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)


# Collect the positions and cardinalities of the categorical variables;
# they are passed to TabNet for its categorical embeddings.
unused_feat = ['Set']
features = [col for col in train.columns if col not in unused_feat]
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]


# train_indices / valid_indices hold index *labels*, so select with .loc.
# Indexing `.values[...]` with them would treat the labels as positional
# offsets, which is only correct while the index is a default RangeIndex.
X_train = train.loc[train_indices, features].values
y_train = train_y.loc[train_indices].values

X_valid = train.loc[valid_indices, features].values
y_valid = train_y.loc[valid_indices].values


LINE 6
PRODUCT_CODE 3


In [157]:
# Sanity check: shapes of the training matrix, training labels, and the full
# train/test frames, one per line.
print(X_train.shape, y_train.shape, train.shape, test.shape, sep='\n')

(485, 2877)
(485,)
(598, 2878)
(310, 2877)


In [158]:
# TabNet classifier: categorical columns are described by cat_idxs/cat_dims
# and embedded with cat_emb_dim=10; trained with Adam (lr=1e-2) and a StepLR
# schedule (step_size=50, gamma=0.9).
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=10,
    mask_type='sparsemax',  # alternative: "entmax"
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 1e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params=dict(step_size=50, gamma=0.9),
)



In [159]:
# Print the name of every column that still contains a missing value
# (no output means there are none).
has_null = train.isnull().any().to_numpy()
for col in train.columns[has_null]:
    print(col)

In [160]:
max_epochs = 15

# Fit on the training split and report metrics for both splits each epoch
# under the names 'train' and 'valid'.
# NOTE(review): patience (20) is larger than max_epochs (15), so early
# stopping presumably never triggers — confirm this is intended.
clf.fit(
    X_train=X_train,
    y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    max_epochs=max_epochs,
    patience=20,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False,
)

epoch 0  | loss: 1.40059 | train_accuracy: 0.29897 | valid_accuracy: 0.27434 |  0:00:00s
epoch 1  | loss: 1.3025  | train_accuracy: 0.56082 | valid_accuracy: 0.59292 |  0:00:00s
epoch 2  | loss: 1.23139 | train_accuracy: 0.65979 | valid_accuracy: 0.70796 |  0:00:00s
epoch 3  | loss: 1.30452 | train_accuracy: 0.61649 | valid_accuracy: 0.68142 |  0:00:00s
epoch 4  | loss: 1.25745 | train_accuracy: 0.61649 | valid_accuracy: 0.62832 |  0:00:00s
epoch 5  | loss: 1.25753 | train_accuracy: 0.59794 | valid_accuracy: 0.64602 |  0:00:01s
epoch 6  | loss: 1.18954 | train_accuracy: 0.60412 | valid_accuracy: 0.64602 |  0:00:01s
epoch 7  | loss: 1.22344 | train_accuracy: 0.60206 | valid_accuracy: 0.65487 |  0:00:01s
epoch 8  | loss: 1.2048  | train_accuracy: 0.27216 | valid_accuracy: 0.27434 |  0:00:01s
epoch 9  | loss: 1.1597  | train_accuracy: 0.54021 | valid_accuracy: 0.61947 |  0:00:01s
epoch 10 | loss: 1.15369 | train_accuracy: 0.56082 | valid_accuracy: 0.56637 |  0:00:02s
epoch 11 | loss: 1.22



In [161]:
# Predict on the test set. Select the columns through `features` so the test
# matrix has exactly the columns, in exactly the order, the model was trained
# on; a missing column raises KeyError instead of silently misaligning.
preds = clf.predict(test[features].values)

In [162]:
# Load the sample submission, which serves as the template for the output file.
submit = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/sample_submission.csv')

In [163]:
# Store the model's predictions in the submission's Y_Class column.
# assumes the rows of `submit` are in the same order as the rows of `test` — TODO confirm
submit['Y_Class'] = preds

In [164]:
# Write the submission to the current working directory without the index column.
submit.to_csv(path_or_buf='./baseline_submission.csv', index=False)