In [1]:
## Step 1.1: Load new tokenizer from pretrained
from transformers import AutoTokenizer, TrainingArguments, Trainer
import pandas as pd
import re
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Load the tokenizer stored in the local save_model/tokenizer directory.
# NOTE(review): absolute, machine-specific path — consider a configurable base directory.
new_tokenizer = AutoTokenizer.from_pretrained("/home/bombbom/Documents/NLP_in_Detection_System/save_model/tokenizer/")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Step 2: Load and process dataset

In [3]:
# Read the pickled dataset and keep only the 'source_code' column (result is still a DataFrame).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
data = pd.read_pickle("/home/bombbom/Documents/NLP_in_Detection_System/dataset_example/labeled_SBW_datasets.pkl")
data = data[['source_code']]

In [4]:
# Compiled once at definition time instead of on every call.
# Group 1: a quoted literal. It is escape-aware, so \" or \' inside the literal
#          does not terminate it.
# Group 2: a /* block */ comment, or a // comment running up to (not including)
#          the line terminator. No `$` anchor is used: `$` only matches before
#          "\n", so with "\r\n" line endings the anchored form never matched.
_STRING_OR_COMMENT_RE = re.compile(
    r'("(?:\\.|[^"\\])*"'
    r"|'(?:\\.|[^'\\])*')"
    r"|(/\*.*?\*/|//[^\r\n]*)",
    re.DOTALL,
)


def remove_comments(string):
    """Strip C-style comments from ``string`` while leaving quoted literals intact.

    Quoted literals are matched first so that comment markers inside them
    (for example ``"http://x"``) are not mistaken for comments.

    Args:
        string: Source text to clean.

    Returns:
        The text with every ``/* ... */`` and ``// ...`` comment replaced by the
        empty string. Line terminators after ``//`` comments are kept.
    """
    def _replacer(match):
        # Group 2 set -> a real comment was captured: drop it.
        if match.group(2) is not None:
            return ""
        # Otherwise a quoted literal was captured: emit it unchanged.
        return match.group(1)

    return _STRING_OR_COMMENT_RE.sub(_replacer, string)

# Operator tables, grouped by width so the longest operator is matched first.
operators3 = {'<<=', '>>='}
operators2 = {
    '->', '++', '--',
    '!~', '<<', '>>', '<=', '>=',
    '==', '!=', '&&', '||', '+=',
    '-=', '*=', '/=', '%=', '&=', '^=', '|='
}
operators1 = {
    '(', ')', '[', ']', '.',
    '+', '-', '*', '&', '/',
    '%', '<', '>', '^', '|',
    '=', ',', '?', ':', ';',
    '{', '}'
}


def tokenize(line):
    """Split source text into operator and word tokens joined by single spaces.

    The text is scanned left to right. Whitespace separates tokens and is
    dropped. Operators are recognised longest-first (3, then 2, then 1
    characters) and emitted as their own tokens. Any other run of characters
    is collected into a word.

    Args:
        line: Source text; may span several lines.

    Returns:
        A single string with the tokens separated by one space each. Empty or
        whitespace-only input yields ``""``.
    """
    tokens, word = [], []

    def _flush():
        # Emit the word collected so far, if any.
        if word:
            tokens.append(''.join(word))
            word.clear()

    i, n = 0, len(line)
    while i < n:
        # Any whitespace (space, tab, newline, ...) ends the current word.
        if line[i].isspace():
            _flush()
            i += 1
            continue
        # Try the widest operators first so '<<=' is not split into '<<' and '='.
        for width, table in ((3, operators3), (2, operators2), (1, operators1)):
            piece = line[i:i + width]
            if piece in table:
                _flush()
                tokens.append(piece)
                i += width
                break
        else:
            # Not whitespace and not an operator: part of a word.
            word.append(line[i])
            i += 1
    # A word that runs to the end of the input must be emitted as well.
    _flush()
    return ' '.join(tokens)

In [5]:
# Strip comments first, then re-space every snippet into space-separated tokens.
data["source_code"] = data["source_code"].apply(remove_comments).apply(tokenize)

In [6]:
# Hold out 30% of the rows as the test split; random_state fixes the shuffle.
train, test = train_test_split(data,test_size=0.3, random_state=20)

In [7]:
# Take 15% of the remaining training rows as the validation split (train is rebound to the rest).
# NOTE: re-running this cell alone shrinks `train` again, since it reads and rebinds the same name.
train, val = train_test_split(train,test_size=0.15, random_state=20)

In [8]:
# Drop the reference to the full frame; only the train/test/val splits are used below.
del data

In [9]:
# Wrap each pandas split in a Hugging Face Dataset (same order: train, test, val).
data_train, data_test, data_val = (
    Dataset.from_pandas(frame) for frame in (train, test, val)
)



In [10]:
def preprocess_function(examples):
    """Tokenize a batch of source-code strings with ``new_tokenizer``.

    Args:
        examples: Batch mapping with a ``"source_code"`` entry holding one
            string per example (each already a space-separated token string
            produced by ``tokenize``).

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated.
    """
    # Each entry is a str, not a list of strings: " ".join(entry) would insert a
    # space between every character and destroy the token boundaries, so the
    # texts are passed to the tokenizer unchanged.
    # NOTE(review): padding="max_length" here means group_texts later chunks
    # padded sequences, and truncation drops text beyond the tokenizer limit —
    # confirm both are intended for the MLM corpus.
    return new_tokenizer(list(examples["source_code"]), padding="max_length", truncation=True)

# Length of each chunk produced by group_texts.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-cut it into fixed-size chunks.

    Args:
        examples: Batch mapping where every value is a list of lists (one inner
            list per example). Must contain an ``"input_ids"`` entry.

    Returns:
        A dict with the same keys, each holding a list of chunks of
        ``block_size`` items, plus a ``"labels"`` entry that is a shallow copy
        of the ``"input_ids"`` chunk list. When the concatenated length is at
        least ``block_size`` the trailing remainder is dropped; when it is
        shorter, a single short chunk is returned.
    """
    # Local import keeps this block self-contained.
    from itertools import chain

    # Flatten each column in linear time (sum(lists, []) copies the growing
    # list on every addition, which is quadratic).
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    # All columns are assumed to have the same total length; the first one is measured.
    total_length = len(concatenated[list(examples.keys())[0]])
    # Drop the small remainder so every chunk has exactly block_size items.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        key: [tokens[i : i + block_size] for i in range(0, total_length, block_size)]
        for key, tokens in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


## Step 3: Prepare dataset for MLM task 

In [11]:
# Tokenize the train and validation splits in batches across 4 worker processes.
# The original columns are removed so only the tokenizer outputs remain.
tokenized_data_train = data_train.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=data_train.column_names,
)
tokenized_data_val = data_val.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=data_val.column_names,
)


                                                                               

In [12]:
# Re-cut the tokenized splits into block_size chunks and attach the labels column.
lm_dataset_train = tokenized_data_train.map(
    group_texts,
    batched=True,
    num_proc=4,
)
lm_dataset_val = tokenized_data_val.map(
    group_texts,
    batched=True,
    num_proc=4,
)

                                                                                

In [12]:
# Show the grouped training set (feature names and row count).
lm_dataset_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 112644
})

## Step 4: Load pretrained model distilbert-base-uncased


In [13]:
from transformers import AutoModelForMaskedLM, DataCollatorForLanguageModeling

# Collator for masked-language-model batches, built on the custom tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=new_tokenizer, mlm_probability=0.15)

# Start from the public DistilBERT checkpoint and show its architecture before resizing.
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")
print(model)

## Resize
# Make the embedding matrix match the custom tokenizer's vocabulary size.
# NOTE(review): the checkpoint's embedding rows were learned for the original
# distilbert vocabulary; confirm that reusing them for the new tokenizer's ids
# is intended.
model.resize_token_embeddings(len(new_tokenizer))
print(model)

DistilBertForMaskedLM(
  (activation): GELUActivation()
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.

In [15]:

## Step 5: Set up training_args
# (Both names are also imported in the first cell.)
from transformers import Trainer, TrainingArguments
# Hyper-parameters and bookkeeping options for the MLM run.
training_args = TrainingArguments(
    # Directory (relative to the working directory) for checkpoints and logs.
    output_dir="pre-train-mlm",
    # Run evaluation on eval_dataset once per epoch.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    # Checkpoint interval, in optimisation steps.
    save_steps=10000,
    # Keep the model local; nothing is uploaded to the Hugging Face Hub.
    push_to_hub=False,
)

# Wire the resized model, the grouped datasets and the MLM collator together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset_train,
    eval_dataset=lm_dataset_val,
    data_collator=data_collator,
)


## Step 6: Training the model



In [None]:
# Run the training loop with the arguments and datasets given to the Trainer.
trainer.train()


## Step 7: Saving model



In [None]:
# Write the trained model to disk.
# NOTE(review): absolute, machine-specific path, and a different project directory than the
# one the tokenizer and dataset are loaded from — confirm the destination is intended.
trainer.save_model("/home/bombbom/Documents/Project_CS_Vul_Detect/py/save_trained_models/pre_train/pre_train_6/")