In [None]:
!pip install catboost
!pip install pycaret
!pip install optuna
!pip install smogn
!pip install pytorch-tabnet
!pip install wandb

In [None]:
import argparse
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder

from pycaret.regression import *

import smogn

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.metrics import Metric

import wandb
# Start a Weights & Biases run under the "DACON_235877" project; the config and
# validation score are attached to this run further down.
wandb.init(project="DACON_235877")

# Notebook-wide configuration, exposed as an argparse namespace (`args`).
# Each entry is (flag, default, type). The trailing notes are the original
# author's hints (presumably a reference default and a search range -- confirm).
parser = argparse.ArgumentParser(description='TabNet')
_ARG_SPECS = (
    ('--validation_split', 0.1, float),
    ('--n_d', 8, int),                   # 8, 8~64
    ('--n_a', 8, int),                   # 8, n_d=n_a
    ('--n_steps', 3, int),               # 3, 3~10
    ('--gamma', 1.6, float),             # 1.3, 1.0~2.0
    ('--batch_size', 64, int),           # 1024
    ('--virtual_batch_size', 8, int),    # 128
    ('--mask_type', "entmax", str),      # sparsemax or entmax
    ('--seed', 1011, int),
    ('--cv', 5, int),
    ('--top_n_model', 5, int),
    ('--tune_iter', 10, int),
)
for _flag, _default, _type in _ARG_SPECS:
    parser.add_argument(_flag, default=_default, type=_type)

# Parse an empty argument list so every option takes its default
# (the notebook has no real command line to read).
args = parser.parse_args([])

# Attach the resolved configuration to the active wandb run.
wandb.config.update(args)

random_seed = args.seed

# Feed the same seed to every random number generator used in this notebook.
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,  # if use multi-GPU
):
    _seed_fn(random_seed)

# Turn cuDNN auto-tuning off and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Locations of the competition CSV files on the mounted Google Drive.
train_csv = "/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/train.csv"
test_csv = "/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/test.csv"

# Read both splits into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the raw training frame (columns, dtypes, non-null counts) before any cleaning.
train.info()

In [None]:
## Outlier

# Keep only rows whose weights are mutually consistent:
#   * whole weight is at least shucked weight + shell weight
#   * shucked weight is at least viscera weight
consistent_rows = (
    (train["Whole Weight"] >= (train["Shucked Weight"] + train["Shell Weight"]))
    & (train["Shucked Weight"] >= train["Viscra Weight"])
)
# .copy() gives an independent frame, so the columns added below are not written
# into a slice of the original DataFrame (avoids SettingWithCopyWarning).
train = train[consistent_rows].copy()

## Feature Engineering
def add_engineered_features(df):
    """Add the derived columns ``Shell Water``, ``ratio`` and ``density`` to ``df``.

    Every feature is computed from ``df``'s own columns. The previous code built
    ``test["density"]`` from the *train* dimensions, which (through pandas index
    alignment) paired each test row with an unrelated train row and produced NaN
    wherever the indices did not match.

    Args:
        df: DataFrame with the columns "Whole Weight", "Shucked Weight",
            "Shell Weight", "Lenght", "Diameter" and "Height".

    Returns:
        The same DataFrame object, modified in place, with the three new columns
        (each rounded to 3 decimals).
    """
    # Weight not accounted for by the shucked body and the shell.
    df["Shell Water"] = round(df["Whole Weight"] - (df["Shucked Weight"] + df["Shell Weight"]), 3)
    # Share of the whole weight that is shucked weight.
    df["ratio"] = round(df["Shucked Weight"] / df["Whole Weight"], 3)
    # Shucked weight per unit of bounding-box volume.
    # NOTE(review): a zero Height/Lenght/Diameter yields inf here -- confirm the data has none.
    df["density"] = round(df["Shucked Weight"] / (df["Lenght"] * df["Diameter"] * df["Height"]), 3)
    return df

train = add_engineered_features(train)
test = add_engineered_features(test)

train.info()

In [None]:
# ## SMOTE

# train=smogn.smoter(data=train, y='Target')
# train.shape

### AutoML

In [None]:
# Initialise the pycaret regression experiment on the cleaned training frame.
# `id` is a row identifier, not a predictor: exclude it explicitly so the AutoML
# models see the same feature set as the TabNet model (which also drops `id`).
s = setup(train, target='Target',
          ignore_features=['id'],
          fold_strategy='kfold', fold=args.cv,
          session_id=args.seed, log_experiment=True)

In [None]:
def NMAE(true, pred):
    """Normalized mean absolute error: ``mean(|true - pred|) / mean(|true|)``.

    Uses the same definition as the TabNet ``NMAE`` metric class defined later in
    this notebook, so AutoML and TabNet scores are comparable. The previous
    implementation averaged the per-sample relative error ``|true - pred| / true``,
    which is a different (MAPE-style) quantity.

    Args:
        true: Ground-truth targets (array-like).
        pred: Predicted targets (array-like with the same number of elements).

    Returns:
        The NMAE as a float; lower is better.
    """
    # Flatten to 1-D arrays so that (n, 1) vs (n,) inputs are compared element by
    # element instead of being broadcast to an (n, n) matrix, and so that pandas
    # index alignment cannot reorder or drop values.
    true = np.asarray(true, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    mae = np.mean(np.abs(true - pred))
    score = mae / np.mean(np.abs(true))
    return score

# Register the NMAE function above as a custom pycaret metric (id and display
# name both 'NMAE'); greater_is_better=False marks it as an error to minimise.
add_metric('NMAE', 'NMAE', NMAE, greater_is_better=False)

In [None]:
%%time
blender = blend_models(compare_models(sort='NMAE', n_select=args.top_n_model), choose_better=True)
tuned_blender = tune_model(blender, n_iter=args.tune_iter, optimize='NMAE', search_library='optuna', choose_better=True)
final_model=finalize_model(tuned_blender)
final_model

In [None]:
# Run pycaret's evaluate_model on the finalized blender to inspect its diagnostics.
evaluate_model(final_model)

In [None]:
# Score the test frame with the finalized AutoML model; the 'Label' column of the
# result is read later when the two models are averaged.
predictions = predict_model(final_model, data=test)
# Show the shape of the prediction frame as a quick sanity check.
predictions.shape

### TabNet

In [None]:
# Integer-encode every object-typed column and record, per column, how many
# distinct categories it has (TabNet needs this for its embedding tables).
categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = l_enc.fit_transform(train[col].values)
    # Reuse the mapping learned on train. Refitting on test (fit_transform) could
    # assign different integer codes to the same category whenever the two splits
    # do not contain exactly the same set of values. transform() raises ValueError
    # on a category unseen in train instead of silently mis-encoding it.
    test[col] = l_enc.transform(test[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

target = 'Target'
unused_feat = ['id']
# Model inputs: every column except the identifier and the target.
features = [col for col in train.columns if col not in unused_feat+[target]] 

# Positions (within `features`) and cardinalities of the categorical columns.
cat_idxs = [i for i, f in enumerate(features) if f in categorical_columns]
cat_dims = [categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

train.info()

In [None]:
# Explicit column layout for the training frame: identifier first, raw
# measurements, engineered features, and the target last.
column_order = [
    'id', 'Gender', 'Lenght', 'Diameter', 'Height',
    'Whole Weight', 'Shucked Weight', 'Viscra Weight', 'Shell Weight',
    'Shell Water', 'ratio', 'density',
    'Target',
]
train = train[column_order]
train.columns

In [None]:
## Data Preprocessing

# Select the inputs through `features` for BOTH frames. `cat_idxs` was computed as
# positions inside `features`, so this guarantees that the categorical indices
# point at the right columns and that train and test share one column order,
# instead of relying on the two frames happening to be laid out identically.
X = train[features]
y = train.Target

# Hold out `validation_split` of the training rows for early stopping / scoring.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=args.validation_split, random_state=random_seed)
X_test = test[features]

# TabNet is fed plain numpy arrays.
X_train = X_train.values
X_valid = X_valid.values
X_test = X_test.values

# Targets are reshaped to 2-D (n_samples, 1) column vectors.
y_train = y_train.values.reshape(-1, 1)
y_valid = y_valid.values.reshape(-1, 1)

X_train.shape, X_valid.shape, X_test.shape, y_train.shape, y_valid.shape

In [None]:
class NMAE(Metric):
    """Normalized MAE for pytorch-tabnet: mean absolute error divided by mean |true|.

    Passed to ``clf.fit(eval_metric=[NMAE])`` and also called directly on the
    validation predictions.
    """

    def __init__(self):
        # Lower scores are better, so the metric is flagged as not-to-maximise.
        self._maximize = False
        self._name = "nmae"

    def __call__(self, true, pred):
        """Return ``mean(|true - pred|) / mean(|true|)`` for the given arrays."""
        abs_error = np.abs(true - pred)
        target_scale = np.mean(np.abs(true))
        return np.mean(abs_error) / target_scale

In [None]:
# Architecture / optimiser settings for the TabNet regressor, taken from `args`.
tabnet_params = dict(
    n_d=args.n_d,
    n_a=args.n_a,
    n_steps=args.n_steps,
    gamma=args.gamma,
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    seed=random_seed,
    optimizer_fn=torch.optim.Adam,
    mask_type=args.mask_type,
)
clf = TabNetRegressor(**tabnet_params)

# Training settings: L1 loss, NMAE evaluated on the held-out validation split,
# at most 200 epochs with a patience of 15.
fit_params = dict(
    eval_set=[(X_valid, y_valid)],
    eval_metric=[NMAE],
    max_epochs=200,
    patience=15,
    loss_fn=torch.nn.L1Loss(),
    batch_size=args.batch_size,
    virtual_batch_size=args.virtual_batch_size,
)
clf.fit(X_train, y_train, **fit_params)

In [None]:
# Predict on the held-out validation split with the trained TabNet model.
preds = clf.predict(X_valid)
# Score those predictions with the NMAE metric class
# (despite its name, `valid_name` holds the validation NMAE value).
valid_name = NMAE()(y_valid, preds)
print(valid_name)

# Record the validation score on the active wandb run.
wandb.log({"valid_nmae": valid_name})

## Test

In [None]:
# Test-set predictions of the two models: AutoML blender and TabNet.
submission1 = predictions['Label']
submission2 = clf.predict(X_test)

# Average the two predictions row by row, by position. Adding two Series would
# instead align them on index labels and produce NaN wherever the prediction
# frame's index is not exactly 0..n-1; adding plain arrays cannot misalign rows.
submission = pd.Series((submission1.to_numpy() + submission2.flatten()) / 2)
submission

In [None]:
# Round the averaged predictions before writing them out.
submission = submission.round()

# Input template and output location on the mounted Google Drive.
sample_submission_csv = "/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/data/sample_submission.csv"
submission_csv = "/content/drive/MyDrive/DACON-Basic/235877_전복 나이 예측 경진대회/submission.csv"

# Fill the template's Target column with the predictions and save without the index.
sample_submission = pd.read_csv(sample_submission_csv)
sample_submission.Target = submission
sample_submission.to_csv(submission_csv, index=False)