<a href="https://colab.research.google.com/github/BigTMiami/AdaptOrDie/blob/main/Amazon_Domain_Training_Loss_Condensed_vs_Non_Condensed_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
This experiment compares the evaluation loss on a small non-condensed dataset with the loss on a condensed version of the same data.  

# Details
* Datasets contain the same data, only one is condensed and one isn't
* Models aren't pushed to hub, only getting loss values for comparison
* No training is done, only evaluation, so train dataset is only used for evaluation

# Results
Not as expected: the two losses were essentially the same.

Non Condensed Loss: 10.95
Non Condensed Rows: 5017

Condensed Loss: 10.94
Condensed Rows: 873

Expected loss ratio: 5.75
Actual loss Ratio: 1.00


# Setup

In [1]:
!pip install datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m399.4/510.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-

In [16]:
from datasets import load_dataset

# Hub id of the non condensed dataset used as the baseline for the comparison.
non_condensed_dataset_id = "BigTMiami/amazon_25M_simple_5_000"

dataset_non_condensed = load_dataset(non_condensed_dataset_id)
# Display the DatasetDict summary (splits, features, row counts).
dataset_non_condensed

Downloading readme:   0%|          | 0.00/324 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/935k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5017 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5017
    })
})

In [3]:
from transformers import AutoTokenizer

# Load the roberta-base tokenizer; it is later handed to the MLM data collator.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# The tokenizer's maximum sequence length is reported as the block size.
block_size = tokenizer.model_max_length
print(f"block_size:{block_size}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

block_size:512


# Evaluate (no training)

In [4]:
from transformers import AutoConfig, AutoModelForMaskedLM

# Build a masked-LM model from the roberta-base configuration only.
# NOTE: `from_config` creates the architecture without loading the pretrained
# roberta-base weights, so this model is evaluated with freshly initialised
# parameters.
config = AutoConfig.from_pretrained("roberta-base")
model = AutoModelForMaskedLM.from_config(config)

In [7]:
from transformers import Trainer, TrainingArguments

# Arguments shared by the evaluation runs. Nothing is trained in this notebook,
# so no train batch size is set; the optimiser/schedule values are kept as
# noted from the paper's DAPT settings.
training_args = TrainingArguments(
    output_dir="amazon_25M_simple_5_000",
    # --- evaluation batch size ---
    # Trying to show a difference in losses: the condensed run only needs to
    # use 1/6 of this value.
    per_device_eval_batch_size=60,
    # --- schedule ---
    num_train_epochs=1,  # 1 pass, 12k steps, 25 million reviews
    learning_rate=0.0005,  # Paper value for DAPT training
    lr_scheduler_type="linear",
    warmup_ratio=0.06,  # Paper: warmup proportion of 0.06
    # --- optimiser ---
    weight_decay=0.01,
    adam_beta1=0.9,  # Paper: Adam weights 0.9
    adam_beta2=0.98,  # Paper: Adam weights 0.98 (huggingface default 0.999)
    adam_epsilon=1e-6,  # Paper: 1e-6 (huggingface default 1e-08)
    # --- evaluation / checkpoint cadence ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # load_best_model_at_end and push_to_hub are intentionally left unset.
)

In [17]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator: 15% masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Evaluation-only trainer for the non condensed data; the dataset's "train"
# split is used as the evaluation set because no training is performed.
non_condensed_eval_split = dataset_non_condensed["train"]
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    eval_dataset=non_condensed_eval_split,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
# Evaluate on the non condensed dataset and keep its loss for the comparison.
eval_results = trainer.evaluate()
non_condensed_loss = eval_results["eval_loss"]

# First the rounded loss, then the full metrics dict on its own line.
print(f"Non Condensed Loss: {non_condensed_loss:.2f}", eval_results, sep="\n")


Non Condensed Loss: 10.95
{'eval_loss': 10.946366310119629, 'eval_runtime': 42.9062, 'eval_samples_per_second': 116.929, 'eval_steps_per_second': 11.7}


In [10]:
from datasets import load_dataset

# Hub id of the condensed counterpart of the non condensed dataset.
condensed_dataset_id = "BigTMiami/amazon_25M_simple_5_000_condensed"

dataset_condensed = load_dataset(condensed_dataset_id)
# Display the DatasetDict summary (splits, features, row counts).
dataset_condensed

Downloading readme:   0%|          | 0.00/361 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 873
    })
})

In [11]:
import copy

# Take an independent copy of the shared arguments. A plain assignment would
# only alias `training_args`, so lowering the batch size here would silently
# lower it for the non condensed run as well.
condensed_training_args = copy.deepcopy(training_args)

# TRYING TO SHOW DIFFERENCE IN LOSSES: the condensed run uses 1/6 of the
# non condensed eval batch size (60 -> 10).
condensed_training_args.per_device_eval_batch_size = 10

In [12]:
# Evaluation-only trainer for the condensed data: same model and collator as
# the non condensed run, with the condensed arguments and dataset.
condensed_eval_split = dataset_condensed["train"]
trainer_condensed = Trainer(
    model=model,
    args=condensed_training_args,
    data_collator=data_collator,
    eval_dataset=condensed_eval_split,
)

In [13]:
# Evaluate on the condensed dataset and keep its loss for the comparison.
eval_results = trainer_condensed.evaluate()
condensed_loss = eval_results["eval_loss"]

# First the rounded loss, then the full metrics dict on its own line.
print(f"Condensed Loss: {condensed_loss:.2f}", eval_results, sep="\n")

Condensed Loss: 10.94
{'eval_loss': 10.943507194519043, 'eval_runtime': 10.6807, 'eval_samples_per_second': 81.736, 'eval_steps_per_second': 8.239}


In [23]:
# Side-by-side summary of both evaluation runs.
non_condensed_rows = len(dataset_non_condensed["train"])
condensed_rows = len(dataset_condensed["train"])

print(f"Non Condensed Loss: {non_condensed_loss:.2f}")
print(f"Non Condensed Rows: {non_condensed_rows}")
print(f"Condensed Loss: {condensed_loss:.2f}")
# Label fixed: this row count belongs to the condensed dataset.
print(f"Condensed Rows: {condensed_rows}")
# Expected ratio is the row-count ratio (non condensed / condensed); the
# actual ratio compares condensed loss to non condensed loss.
print(f"Expected loss ratio: {non_condensed_rows / condensed_rows:.2f}")
print(f"Actual loss Ratio: {condensed_loss / non_condensed_loss:.2f}")

Non Condensed Loss: 10.95
Non Condensed Rows: 5017
Condensed Loss: 10.94
Non Condensed Rows: 873
Expected loss ration 5.75
Actual loss Ratio: 1.00


In [24]:
# Final cell: announce the disconnect, then call Colab's `runtime.unassign()`.
# Presumably this releases the Colab runtime so it stops consuming resources
# once the experiment is done; confirm against the Colab documentation.
# The import only resolves inside Google Colab.
print("Disconnecting Session")
from google.colab import runtime
runtime.unassign()

Disconnecting Session
