In [None]:
# usage: models.py [-h] [-d1 DATA_FILE1] [-t TEST_FILE] [-tn TEST_NAME] [--easy] [-d3 DATA_FILE3] [-d4 DATA_FILE4] [--semeval] [--max_len] [-d2 DATA_FILE2] [-m MODE] [--gab_only]
#                  [--task_b] [-tf TRANSFORMER] [-lr LEARN_RATE] [-bs BATCH_SIZE] [-sl SEQUENCE_LENGTH] [-epoch EPOCHS] [--no_weight_restore] [--save_model SAVE_MODEL]

# options:
#   -h, --help            show this help message and exit
#   -d1 DATA_FILE1, --data_file1 DATA_FILE1
#                         Dataset to train the model with, default is the SemEval 2022 sexism dataset
#   -t TEST_FILE, --test_file TEST_FILE
#                         Test file, which will be used to evaluate the model
#   -tn TEST_NAME, --test_name TEST_NAME
#                         Test file name, which will be used to store multiple test files.
# 
#   --easy                Run the model using easy-to-learn data.
#   -d3 DATA_FILE3, --data_file3 DATA_FILE3
#                         Easy-to-learn train data from data cartography
#   -d4 DATA_FILE4, --data_file4 DATA_FILE4
#                         Easy-to-learn dev data from data cartography
#   --semeval             Run the model for task A, using the semeval dataset.
#   --max_len             Run the model using max padding length based on input.
# 
#   -d2 DATA_FILE2, --data_file2 DATA_FILE2
#                         Extra dataset to train the model with, has to have compatible labels with the first dataset
#   -m MODE, --mode MODE  This argument sets the data merge option, you can choose between concatenating (concat) or shuffling (shuffle), default is concat
#   --gab_only            Run the model using only gab added data.
# 
#   --task_b              Run the model for task B, using the extra EXIST dataset
# 
#   -tf TRANSFORMER, --transformer TRANSFORMER
#                         this argument takes the pretrained language model link from HuggingFace, default is HateBERT
#   -lr LEARN_RATE, --learn_rate LEARN_RATE
#                         Set a custom learn rate for the pretrained language model, default is 5e-5
#   -bs BATCH_SIZE, --batch_size BATCH_SIZE
#                         Set a custom batch size for the pretrained language model, default is 8
#   -sl SEQUENCE_LENGTH, --sequence_length SEQUENCE_LENGTH
#                         Set a custom maximum sequence length for the pretrained language model, default is 100
#   -epoch EPOCHS, --epochs EPOCHS
#                         This argument selects the amount of epochs to run the model with, default is 1 epoch
#   --no_weight_restore   Run the model without weight restore
# 
#   --save_model SAVE_MODEL
#                         Save the current model to a file for later use on a test file, requires a name to be specified

In [None]:
# For data cartography: the '--easy' flag makes the script import the .tsv outputs from the cartography folders.
# To use them, point -d3 (train) and -d4 (dev) at the right files. The dev file is not actually needed;
# the -d4 input is a leftover from before this was realized.
# Make sure to replace all filenames and directories so that they correspond to your local folder structure!

!python models.py \
	-tf GroNLP/hateBERT \
	--mode shuffle \
	--learn_rate 1e-5 \
	--batch_size 16 \
	--sequence_length 325 \
	--epochs 10 \
	-d1 data/train_all_tasks.csv \
	-d2 data/EXIST2021_merged.csv \
	--test_file dev_task_a_entries.csv \
	--test_name job_hate_taska_16_1e5_325_shuffle \
	-d3 data/easy-to-learn/hatebert/shuffle/train.tsv \
	-d4 data/easy-to-learn/hatebert/shuffle/dev.tsv \
	--easy