# Training

### Load Dataset

In [11]:
import os
from datasets import load_from_disk, load_dataset, concatenate_datasets
#dataset_train = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='train')
#dataset_test = load_dataset('ASSERT-KTH/DISL', 'invariants-infillings', cache_dir=os.environ.get('TMPDIR'), streaming=True, split='test')

dataset = load_dataset('GGmorello/FLAMES', 'abstract', num_proc=10)['train']

In [63]:
dataset['label']

['_impl!=address(0)',
 '_implementation!=implementation',
 'msg.sender==proxyOwner()',
 'newOwner!=address(0)',
 'address(this).delegatecall(data)',
 'initialImplementation.delegatecall(calldata)',
 '_masterCopy!=address(0),"Invalid master copy address provided"',
 "success&&(data.length==0||abi.decode(data,(bool))),'TransferHelper::safeApprove: approve failed'",
 "success&&(data.length==0||abi.decode(data,(bool))),'TransferHelper::safeTransfer: transfer failed'",
 "success&&(data.length==0||abi.decode(data,(bool))),'TransferHelper::transferFrom: transferFrom failed'",
 "success,'TransferHelper::safeTransferETH: ETH transfer failed'",
 "success,'Flush failed'",
 "msg.sender==parentAddress,'Only Parent'",
 "parentAddress==address(0x0),'Already initialized'",
 "success,'Flush failed'",
 'isAuthorized(msg.sender,msg.sig)',
 'setCache(_cacheAddr)',
 '_target!=0x0',
 '_cacheAddr!=0x0',
 'address(this).balance>=amount,"Address: insufficient balance"',
 'success,"Address: unable to send value

In [5]:
import os

base_model = "codellama/CodeLlama-7b-hf"



In [6]:
from transformers import AutoTokenizer
MAX_SEQ_LEN = 16384/4

tokenizer = AutoTokenizer.from_pretrained(base_model,
                                         truncation=False,
                                         model_max_length=MAX_SEQ_LEN,
                                         padding_side="left",
                                         use_fast=True,
                                         )

## Tokenization


In [7]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0

def tokenize(text, tokenizer, max_seq_len=MAX_SEQ_LEN):
    result = tokenizer(
        text,
        truncation=False,
        max_length=max_seq_len,
        padding=False,
        return_tensors=None,
    )
    if len(result['input_ids']) > max_seq_len:
        return None
        
    return result
    
def generate_and_tokenize_prompt(sample):
    input_text = sample['input']
    target_text = sample['predicate']
    
    tokenized_input_text = tokenize(input_text, tokenizer, max_seq_len=MAX_SEQ_LEN)
    tokenized_target_text = tokenize(target_text, tokenizer, max_seq_len=MAX_SEQ_LEN)

    if tokenized_input_text is None or tokenized_target_text is None:
        return  {"input_ids": None, "attention_mask": None, "labels": None}
    else:
        return {'input_ids': tokenized_input_text['input_ids'][:-1] + tokenized_target_text['input_ids'][1:], 
                  'attention_mask': [1] * len(tokenized_input_text['input_ids'][:-1] + tokenized_target_text['input_ids'][1:]),
                  'labels': [-100] * len(tokenized_input_text['input_ids'][:-1]) + tokenized_target_text['input_ids'][1:],
                }

In [8]:
tokenized = dataset['train'].map(generate_and_tokenize_prompt, 
                                remove_columns=['comment', 'input', 'label', 'predicate'],
                                num_proc=10,
                      )


Map (num_proc=10): 100%|██████████| 5282156/5282156 [1:39:11<00:00, 887.46 examples/s]  


In [9]:
train_tokenized = tokenized.filter(lambda sample: sample["input_ids"] is not None)


Filter: 100%|██████████| 5282156/5282156 [1:15:15<00:00, 1169.89 examples/s]


In [64]:
train_tokenized.push_to_hub("GGmorello/FLAMES_only_predicates", data_dir='data/tokenized_with_idx')

Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.36ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.40ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.43ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.48ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.14ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.33ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:04<00:00,  6.39ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.23ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:03<00:00,  7.32ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:04<00:00,  6.03ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:05<00:00,  5.38ba/s]
Creating parquet from Arrow format: 100%|██████████| 28/28 [00:04<00:00,  6.

CommitInfo(commit_url='https://huggingface.co/datasets/GGmorello/FLAMES_only_predicates/commit/957217d07024c1af79c69d163c6e13366ebd7d3b', commit_message='Upload dataset (part 00002-of-00003)', commit_description='', oid='957217d07024c1af79c69d163c6e13366ebd7d3b', pr_url=None, pr_revision=None, pr_num=None)