In [1]:
# TODO: figure out how to load a custom dataset into the required format

In [2]:
from dataclasses import dataclass, field
import json
# tracking events that happen when the code is running
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, Trainer, EvalPrediction, set_seed
from transformers.training_args import TrainingArguments

import sys

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

In [3]:
@dataclass
class ModelArguments:
    """
    Identifies the pretrained checkpoint to fine-tune from, together with
    optional overrides for its config, its tokenizer and the download cache.

    Attributes:
        model_name_or_path: required; local path or huggingface.co/models
            identifier of the pretrained model.
        config_name: config name or path, only needed when it differs from
            the model name. Defaults to None.
        tokenizer_name: tokenizer name or path, only needed when it differs
            from the model name. Defaults to None.
        cache_dir: directory in which downloaded pretrained models are
            stored. Defaults to None.
    """

    # The only field without a default, so it must always be supplied.
    model_name_or_path: str = field(
        metadata=dict(
            help="Path to pretrained model or model identifier from huggingface.co/models"
        )
    )

    # Optional overrides; each one stays None unless explicitly provided.
    config_name: Optional[str] = field(
        metadata=dict(
            help="Pretrained config name or path if not the same as model_name"
        ),
        default=None,
    )
    tokenizer_name: Optional[str] = field(
        metadata=dict(
            help="Pretrained tokenizer name or path if not the same as model_name"
        ),
        default=None,
    )
    cache_dir: Optional[str] = field(
        metadata=dict(
            help="Where do you want to store the pretrained models downloaded from s3"
        ),
        default=None,
    )


@dataclass
class MultimodalDataTrainingArguments:
    """
    Arguments pertaining to the dataset and to how we preprocess and combine
    tabular (categorical and numerical) features.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    The column description must be supplied either directly via `column_info`
    or indirectly via `column_info_path`, a JSON file that is read in
    `__post_init__`. When `column_info` is given, it is used as-is and the
    file is not read.

    Raises:
        ValueError: if `column_info` is None and `column_info_path` is
            None or empty.
    """

    data_path: str = field(
        metadata={"help": "the path to the csv file containing the dataset"}
    )
    # Optional because it defaults to None; only read when `column_info` is absent.
    column_info_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "the path to the json file detailing which columns are text, categorical, numerical, and the label"
        },
    )

    # Optional because it defaults to None; filled from `column_info_path`
    # in __post_init__ when not passed explicitly.
    column_info: Optional[dict] = field(
        default=None,
        metadata={
            # The separator keeps the two concatenated literals from running together.
            "help": "a dict referencing the text, categorical, numerical, and label columns; "
            "its keys are text_cols, num_cols, cat_cols, and label_col"
        },
    )

    categorical_encode_type: str = field(
        # ohe: one-hot encoding
        default="ohe",
        metadata={
            "help": "sklearn encoder to use for categorical data",
            "choices": ["ohe", "binary", "label", "none"],
        },
    )
    numerical_transformer_method: str = field(
        # more details: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html
        default="yeo_johnson",
        metadata={
            "help": "sklearn numerical transformer to preprocess numerical data",
            "choices": ["yeo_johnson", "box_cox", "quantile_normal", "none"],
        },
    )
    task: str = field(
        default="classification",
        metadata={
            "help": "The downstream training task",
            "choices": ["classification", "regression"],
        },
    )

    mlp_division: int = field(
        default=4,
        metadata={
            "help": "the ratio of the number of "
            "hidden dims in a current layer to the next MLP layer"
        },
    )
    ## NOTE: you will implement different combination methods
    combine_feat_method: str = field(
        default="individual_mlps_on_cat_and_numerical_feats_then_concat",
        metadata={
            "help": "method to combine categorical and numerical features, "
            "see README for all the methods"
        },
    )
    mlp_dropout: float = field(
        default=0.1, metadata={"help": "dropout ratio used for MLP layers"}
    )
    numerical_bn: bool = field(
        default=True,
        metadata={"help": "whether to use batchnorm on numerical features"},
    )
    categorical_bn: bool = field(
        default=True,
        metadata={"help": "whether to use batchnorm on categorical features"},
    )
    # Annotated as bool to match its default and the sibling on/off switches
    # above; with a `str` annotation, an annotation-driven parser would hand
    # back the text "False", which is truthy.
    use_simple_classifier: bool = field(
        default=True,
        metadata={"help": "whether to use single layer or MLP as final classifier"},
    )
    mlp_act: str = field(
        default="relu",
        metadata={
            "help": "the activation function to use for finetuning layers",
            "choices": ["relu", "prelu", "sigmoid", "tanh", "linear"],
        },
    )

    # https://docs.python.org/3/library/dataclasses.html#dataclasses.__post_init__
    # https://stackoverflow.com/questions/73672582/when-the-method-post-init-doesnt-called
    def __post_init__(self):
        """
        Ensure a column description is available, loading it from
        `column_info_path` when `column_info` was not passed explicitly.

        Raises:
            ValueError: if neither a `column_info` dict nor a non-empty
                `column_info_path` was provided.
        """
        # An explicitly supplied dict takes precedence; the file is not read.
        if self.column_info is not None:
            return

        # Explicit check instead of `assert`: it survives `python -O` and
        # tells the user what is missing.
        if not self.column_info_path:
            raise ValueError(
                "Either `column_info` or `column_info_path` must be provided."
            )

        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(self.column_info_path, "r", encoding="utf-8") as f:
            self.column_info = json.load(f)

# Training: