In [1]:
!pip install pytorch-tabnet
!pip install wandb



In [2]:
import argparse
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

import wandb
# Open the Weights & Biases run that receives the config and metrics logged below.
wandb.init(project="DACON_235877")

# Hyper-parameters are declared through argparse so they can be handed to
# wandb as a single namespace. Each row is (flag, default, type); the trailing
# notes are the author's original tuning hints, kept verbatim.
_ARG_SPECS = [
    ('--validation_split', 0.1, float),
    ('--n_d', 8, int),                    # 8, 8~64
    ('--n_a', 8, int),                    # 8, n_d=n_a
    ('--n_steps', 3, int),                # 3, 3~10
    ('--gamma', 1.3, float),              # 1.3, 1.0~2.0
    ('--batch_size', 64, int),            # 1024
    ('--virtual_batch_size', 8, int),     # 128
    ('--mask_type', "entmax", str),       # sparsemax or entmax
    ('--seed', 1011, int),
]

parser = argparse.ArgumentParser(description='TabNet')
for _flag, _default, _type in _ARG_SPECS:
    parser.add_argument(_flag, default=_default, type=_type)

# An explicit empty argument list keeps argparse from reading the kernel's own
# argv, so every option resolves to its default.
args = parser.parse_args('')

wandb.config.update(args)

# Seed every random number generator used in this notebook with the same value.
random_seed = args.seed
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,  # if use multi-GPU
):
    _seed_fn(random_seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load the competition data from the mounted Google Drive folder.
train = pd.read_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/test.csv")

train.head()

[34m[1mwandb[0m: Currently logged in as: [33mgnoeyheat[0m (use `wandb login --relogin` to force relogin)


Unnamed: 0,id,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Target
0,1,M,0.605,0.47,0.115,1.114,0.3925,0.291,0.31,15
1,2,I,0.43,0.315,0.095,0.378,0.175,0.08,0.1045,8
2,3,I,0.58,0.49,0.195,1.3165,0.5305,0.254,0.41,18
3,4,M,0.535,0.405,0.175,1.2705,0.548,0.3265,0.337,13
4,5,I,0.31,0.235,0.09,0.127,0.048,0.031,0.04,6


In [3]:
# Print column dtypes and non-null counts for the training frame as loaded
# (before any encoding is applied).
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1253 non-null   int64  
 1   Gender          1253 non-null   object 
 2   Lenght          1253 non-null   float64
 3   Diameter        1253 non-null   float64
 4   Height          1253 non-null   float64
 5   Whole Weight    1253 non-null   float64
 6   Shucked Weight  1253 non-null   float64
 7   Viscra Weight   1253 non-null   float64
 8   Shell Weight    1253 non-null   float64
 9   Target          1253 non-null   int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 98.0+ KB


In [4]:
# Integer-encode every object-dtype column and remember how many distinct
# classes each one has; both are later passed to TabNetRegressor as
# cat_idxs / cat_dims.
#
# NOTE: `train` and `test` are modified in place. Re-running this cell on the
# already-encoded frames finds no object columns and leaves the lists empty,
# so re-run the data-loading cell first.
categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    # Reuse the mapping learned on train. Refitting on test could assign
    # different integers to the same category whenever the two category sets
    # differ. `transform` raises ValueError on a label unseen in train, which
    # is preferable to silently mis-encoding it.
    test[col] = l_enc.transform(test[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

target = 'Target'
unused_feat = ['id']
# Model inputs: every column except the identifier and the target.
features = [col for col in train.columns if col not in unused_feat+[target]] 

# Positions (within `features`) and cardinalities of the categorical columns.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for f in features if f in categorical_columns]

train.info()

Gender 3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1253 entries, 0 to 1252
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              1253 non-null   int64  
 1   Gender          1253 non-null   int64  
 2   Lenght          1253 non-null   float64
 3   Diameter        1253 non-null   float64
 4   Height          1253 non-null   float64
 5   Whole Weight    1253 non-null   float64
 6   Shucked Weight  1253 non-null   float64
 7   Viscra Weight   1253 non-null   float64
 8   Shell Weight    1253 non-null   float64
 9   Target          1253 non-null   int64  
dtypes: float64(7), int64(3)
memory usage: 98.0 KB


In [5]:
## Data Preprocessing

X = train.drop(["id", "Target"], axis=1)
y = train.Target

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=args.validation_split, random_state=random_seed)
X_test = test.drop(["id"], axis=1)

X_train = X_train.values
X_valid = X_valid.values
X_test = X_test.values

y_train = y_train.values.reshape(-1, 1)
y_valid = y_valid.values.reshape(-1, 1)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape

((1127, 8), (126, 8), (2924, 8), (1127, 1), (126, 1))

In [6]:
class NMAE(Metric):
    """Normalized mean absolute error: mean(|true - pred| / |true|).

    Samples whose true value is exactly 0 are excluded, since the relative
    error is undefined for them.
    """

    def __init__(self):
        # Name under which the metric is reported (e.g. "val_0_nmae" in the
        # training log) and the optimisation direction: lower is better.
        self._name = "nmae"
        self._maximize = False

    def __call__(self, y_true, y_score):
        """Return the NMAE between `y_true` and `y_score`.

        Both inputs are flattened first, so an (n, 1) target array can be
        scored against either an (n, 1) or an (n,) prediction array.
        Returns NaN if every true value is 0 (mean of an empty selection).
        """
        true = np.asarray(y_true).ravel()
        pred = np.asarray(y_score).ravel()

        nonzero = true != 0
        true = true[nonzero]
        pred = pred[nonzero]

        # Normalise by |true| so a negative target cannot flip the sign of
        # its error term; identical to the plain ratio for positive targets.
        score = np.mean(np.abs(true - pred) / np.abs(true))
        return score

* Supervised learning

In [7]:
# Constructor arguments for the regressor: architecture settings come from
# `args`, the categorical column layout from the encoding cell.
tabnet_params = dict(
    n_d=args.n_d,
    n_a=args.n_a,
    n_steps=args.n_steps,
    gamma=args.gamma,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    seed=random_seed,
    optimizer_fn=torch.optim.Adam,
    # scheduler_params={"step_size":10,
    #                   "gamma":0.9},
    # scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type=args.mask_type,
)
clf = TabNetRegressor(**tabnet_params)

# Training options: the validation split is the only evaluation set, scored
# with the custom NMAE metric, while the model itself is optimised with L1 loss.
fit_params = dict(
    eval_set=[(X_valid, y_valid)],
    eval_metric=[NMAE],
    max_epochs=200,
    patience=15,
    loss_fn=torch.nn.L1Loss(),
    batch_size=args.batch_size,
    virtual_batch_size=args.virtual_batch_size,
)
clf.fit(X_train, y_train, **fit_params)

Device used : cuda
epoch 0  | loss: 4.6752  | val_0_nmae: 0.29235 |  0:00:02s
epoch 1  | loss: 2.39458 | val_0_nmae: 0.27195 |  0:00:04s
epoch 2  | loss: 2.05311 | val_0_nmae: 0.22638 |  0:00:07s
epoch 3  | loss: 1.91807 | val_0_nmae: 0.26958 |  0:00:09s
epoch 4  | loss: 1.88354 | val_0_nmae: 0.27949 |  0:00:12s
epoch 5  | loss: 1.90029 | val_0_nmae: 0.25084 |  0:00:15s
epoch 6  | loss: 1.83983 | val_0_nmae: 0.27871 |  0:00:19s
epoch 7  | loss: 1.86827 | val_0_nmae: 0.254   |  0:00:21s
epoch 8  | loss: 1.88398 | val_0_nmae: 0.24637 |  0:00:23s
epoch 9  | loss: 1.82943 | val_0_nmae: 0.26411 |  0:00:24s
epoch 10 | loss: 1.90485 | val_0_nmae: 0.24514 |  0:00:25s
epoch 11 | loss: 1.82485 | val_0_nmae: 0.22725 |  0:00:26s
epoch 12 | loss: 1.84835 | val_0_nmae: 0.22774 |  0:00:27s
epoch 13 | loss: 1.75671 | val_0_nmae: 0.22016 |  0:00:28s
epoch 14 | loss: 1.73102 | val_0_nmae: 0.26057 |  0:00:29s
epoch 15 | loss: 1.74043 | val_0_nmae: 0.27465 |  0:00:31s
epoch 16 | loss: 1.74722 | val_0_nmae

* Semi-supervised pre-training

In [8]:
# pretraining_ratio = 0.8 # 0.5, 0~1
# wandb.log({"pretraining_ratio": pretraining_ratio})

# unsupervised_model = TabNetPretrainer(
#     n_d=args.n_d, n_a=args.n_a, n_steps=args.n_steps, gamma=args.gamma,
#     cat_idxs = cat_idxs,
#     cat_dims = cat_dims,
#     seed=random_seed,
#     optimizer_fn=torch.optim.Adam,
#     mask_type=args.mask_type
# )

# unsupervised_model.fit(
#     X_train=X_train,
#     eval_set=[X_valid],
#     max_epochs=200,
#     patience=15,
#     batch_size=args.batch_size,
#     virtual_batch_size=args.virtual_batch_size,
#     pretraining_ratio=pretraining_ratio,
# )

# clf = TabNetRegressor(
#     n_d=args.n_d, n_a=args.n_a, n_steps=args.n_steps, gamma=args.gamma,
#     cat_idxs = cat_idxs,
#     cat_dims = cat_dims,
#     seed=random_seed,
#     optimizer_fn=torch.optim.Adam,
#     scheduler_params={"step_size":10,
#                       "gamma":0.9},
#     scheduler_fn=torch.optim.lr_scheduler.StepLR,
#     mask_type=args.mask_type
# )

# clf.fit(
#     X_train=X_train, y_train=y_train,
#     eval_set=[(X_train, y_train), (X_valid, y_valid)],
#     eval_name=['train', 'valid'],
#     eval_metric=[NMAE],
#     max_epochs=200,
#     patience=15,
#     loss_fn=torch.nn.L1Loss(),
#     batch_size=args.batch_size,
#     virtual_batch_size=args.virtual_batch_size,
#     from_unsupervised=unsupervised_model
# )

In [9]:
# Score the fitted regressor on the held-out validation split with NMAE.
nmae_metric = NMAE()
preds = clf.predict(X_valid)
valid_name = nmae_metric(y_valid, preds)
print(valid_name)

# Record the validation score in the active wandb run.
wandb.log({"valid_nmae": valid_name})

0.12847126969872744


In [10]:
# saving_path_name = "./tabnet_model_test_1"
# saved_filepath = clf.save_model(saving_path_name)

# loaded_clf = TabNetRegressor()
# loaded_clf.load_model(saved_filepath)

In [11]:
# Predict on the test features and round to the nearest whole number.
# The prediction is flattened to 1-D so the column assignment below does not
# depend on pandas' implicit handling of an (n, 1) array
# (presumably the shape returned here, since targets were fitted as (n, 1)).
submission = clf.predict(X_test).round().ravel()

sample_submission = pd.read_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/sample_submission.csv")
# Bracket assignment always writes a real column; attribute assignment
# (`df.Target = ...`) silently sets a plain Python attribute instead when the
# column does not already exist.
sample_submission["Target"] = submission
sample_submission.to_csv("/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/submission.csv", index=False)