In [1]:
from utils import TLDRDataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

In [2]:
# GPT-2 tokenizer, with the end-of-sequence token reused as the padding token.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Wrap the filtered TL;DR file in the project dataset and serve it in batches of 16.
ds = TLDRDataset("data/tldr-filtered.json", tokenizer)
dl = torch.utils.data.DataLoader(ds, batch_size=16)

In [3]:
# Pull only the first batch for interactive inspection. Unlike a `for ... break`
# loop, `next` raises StopIteration on an empty loader instead of silently
# leaving `text` / `summary_length` undefined (or stale from an earlier run).
batch_idx, (text, summary_length) = next(enumerate(dl))

In [4]:
# Tokenize the batch into padded PyTorch tensors (truncated to at most 512
# tokens), then split the per-example token counts off from the model inputs.
inputs = tokenizer(
    text,
    max_length=512,
    truncation=True,
    padding=True,
    return_tensors="pt",
    return_length=True,
)
total_length = inputs.pop("length")


In [5]:
from datasets import load_dataset
import transformers
from transformers.testing_utils import CaptureLogger

In [14]:
# Load the TL;DR test file as the "test" split, reading records from the
# JSON document's "data" field.
datasets = load_dataset(
    "json",
    data_files={"test": "./data/tldr-filtered-test.json"},
    field="data",
)

Using custom data configuration default


Downloading and preparing dataset json/default-aacabb4e0f3c1bc0 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/kip/.cache/huggingface/datasets/json/default-aacabb4e0f3c1bc0/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /home/kip/.cache/huggingface/datasets/json/default-aacabb4e0f3c1bc0/0.0.0/70d89ed4db1394f028c651589fcab6d6b28dddcabbe39d3b21b4d41f9a708514. Subsequent calls will reuse this data.


In [63]:
# Columns of the raw test split; passed as `remove_columns` when mapping below.
column_names = datasets["test"].column_names
text_column_name = column_names[0] if "text" not in column_names else "text"

# Logger whose output is captured while tokenizing inside tokenize_function.
tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base")

# NOTE(review): this rebinds the notebook-level `tokenizer`; unlike the earlier
# GPT2Tokenizer cell, no pad_token is assigned on this instance.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")

def tokenize_function(examples):
    """Tokenize ``content + ' TLDR:' + summary`` and flag the summary tokens.

    Args:
        examples: Batch mapping with parallel ``'content'`` and ``'summary'``
            lists of strings.

    Returns:
        The tokenizer output for the joined texts (with its ``length`` entry
        removed) plus a ``'mask'`` entry: one list per example, as long as
        that example's token sequence, holding 1 on the trailing summary
        positions and 0 everywhere else.
    """
    with CaptureLogger(tok_logger) as cl:
        text = [
            content + ' TLDR:' + summary
            for content, summary in zip(examples['content'], examples['summary'])
        ]
        output = tokenizer(text, return_length=True)

        # NOTE(review): the summary is tokenized on its own here; this assumes
        # it yields the same number of tokens as it does at the end of the
        # joined text -- confirm for summaries that start with punctuation.
        summary_lengths = tokenizer(examples['summary'], return_length=True)['length']
        total_lengths = output.pop("length")

        # Build each mask directly: zeros over the prompt, ones over the last
        # `n_summary` tokens. The `min` keeps the mask exactly `n_total` long
        # even if the summary count were ever to exceed the total.
        output['mask'] = [
            [0] * (n_total - n_summary) + [1] * min(n_summary, n_total)
            for n_total, n_summary in zip(total_lengths, summary_lengths)
        ]
    # clm input could be much much longer than block_size
    if "Token indices sequence length is longer than the" in cl.out:
        tok_logger.warning(
            "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model."
        )
    return output

# Tokenize every split batch-wise across 4 worker processes; the raw columns
# are removed so only the outputs of tokenize_function remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=column_names,
    batched=True,
    num_proc=4,
)







In [65]:
# Grab the first tokenized test example for inspection. `next` raises
# StopIteration on an empty split instead of silently leaving `data`
# undefined (or stale from an earlier run of this cell).
data = next(iter(tokenized_datasets['test']))

In [49]:
# NOTE(review): `lengths` is not defined in any cell visible in this notebook;
# this cell depends on leftover kernel state -- confirm where it came from
# before relying on a fresh "Restart & Run All".
lengths.min()

115

In [55]:
block_size = 1024


def group_texts(examples):
    """Concatenate every column of a batch and cut it into ``block_size`` chunks.

    Args:
        examples: Batch mapping from column name to a list of per-example
            sequences (e.g. ``input_ids``, ``attention_mask``, ``mask``).
            Must contain an ``"input_ids"`` column.

    Returns:
        A mapping with the same columns, each holding consecutive chunks of
        exactly ``block_size`` items, plus a ``"labels"`` column that is a
        shallow copy of the chunked ``"input_ids"``. Items past the last full
        chunk are dropped.
    """
    # Flatten each column in a single linear pass. The previous
    # ``sum(lists, [])`` rebuilt the growing list on every addition, which is
    # quadratic in the number of tokens in the batch.
    concatenated_examples = {
        k: [item for seq in examples[k] for item in seq] for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; padding could be added instead if the model
    # supported it. Customize this part as needed.
    total_length = (total_length // block_size) * block_size
    # Split every column into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# Regroup the tokenized examples into fixed-size blocks, batch-wise, using
# 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, num_proc=4, batched=True)

# Shell command: run midtune.py on distilgpt2 with training and evaluation
# enabled, using the TL;DR train/test JSON files, writing to the given output dir.
python midtune.py \
    --model_name_or_path distilgpt2 \
    --train_file ../../data/tldr/tldr-filtered-train.json \
    --validation_file ../../data/tldr/tldr-filtered-test.json \
    --do_train \
    --do_eval \
    --output_dir /media/external_usb/kip/models/distilgpt2 







In [36]:
# Shell command: launch midtune.py through DeepSpeed on 2 GPUs (config in
# ds_config.json) for gpt2-xl: fp16, 1 epoch, per-device batch size 1 with
# 2 gradient-accumulation steps, evaluation every 200 steps, output in ./finetuned.
deepspeed --num_gpus=2 midtune.py \
--deepspeed ds_config.json \
--model_name_or_path gpt2-xl \
--train_file ../../data/tldr/tldr-filtered-train.json \
--validation_file ../../data/tldr/tldr-filtered-test.json \
--do_train \
--do_eval \
--fp16 \
--overwrite_cache \
--evaluation_strategy="steps" \
--output_dir finetuned \
--eval_steps 200 \
--num_train_epochs 1 \
--gradient_accumulation_steps 2 \
--per_device_train_batch_size 1

# Data storage location:
# /home/aleph/.cache/huggingface/transformers/

In [38]:
# Length of the `attention_mask` entry of `data` (the example captured by the
# `for data in tokenized_datasets['test']` cell) -- presumably one item per token.
len(data['attention_mask'])

187

In [103]:
acc_steps = 3

# Print every 0-based step index that completes a group of `acc_steps` steps
# (presumably the points where accumulated gradients would be applied).
for i in range(30):
    if (i + 1) % acc_steps:
        continue
    print(i)

2
5
8
11
14
17
20
23
26
29


In [94]:
# Boolean mask over `shift_labels` selecting each example's summary targets.
# NOTE(review): `shift_labels` is not defined in a visible cell; presumably it
# is the labels shifted left by one, so summary tokens at [t - s, t) land on
# positions [t - s - 1, t - 1) -- confirm.
mask = torch.zeros_like(shift_labels, dtype=torch.bool)
for i, (s, t) in enumerate(zip(summary_length, total_length)):
    start, stop = t - s - 1, t - 1
    mask[i][start:stop] = True

In [95]:
# Entries of `shift_labels` for example 0 at the positions selected by its mask row.
shift_labels[0, mask[0]]

tensor([34094,  3454,  2194,  1869,    13])

In [99]:
# Sanity check: decode the masked targets of example 3 and print the raw text
# of the same example underneath for comparison.
print(tokenizer.decode(shift_labels[3, mask[3]]))
print(text[3])

series of unrelated fucked up things reveals that I'm a very selfish person.
Long one, 
 I was in Walmart, being chased by a man whom I knew was going to rape me/cause me serious bodily harm. Said Walmart became a maze and was increasingly hard to navigate, then I come upon my ex boyfriend in the yarn section (which I'm not sure exists in Walmart stores) and he refused to help me. He said, "You deserve what is happening to you". So I'm running out of breath, feeling terrified, knowing I'm going to be caught when I see the door and run outside. 
 It's raining, and very dark out. Two of my professors are out in the parking lot, one is crying and staring off into the distance. I follow her gaze to see this terrible scene: A bridge over a gaping ravine has collapsed and cars are still driving off the edge and crashing into the river beneath. People are dying right before my eyes, screaming, crying. My ears are filled with a terrible screeching sound louder than any other sobbing and I'm su

16