In [None]:
import validation_library as veri   
from peft import PeftConfig, PeftModel
from transformers import LlamaForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os  
import importlib
import gc
from dotenv import load_dotenv
from collections import defaultdict
# PyTorch CUDA allocator setting; set here, before any model is loaded in this notebook.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Re-import the local helper module so edits to it are picked up without a kernel restart.
importlib.reload(veri)

# --- Configuration -----------------------------------------------------------
# Dataset directory. Set FLAMES_DATASET_DIR to run on another machine; the
# default is the original location so existing runs behave exactly as before.
DATASET_DIR = os.environ.get(
    "FLAMES_DATASET_DIR",
    "/home/matteo/FLAMES/validation-results/sb-heists/smartbugs-curated/0.4.x/contracts/dataset",
)
# Comment marker searched for in every contract file.
REPORT_MARKER = "// <yes> <report>"
# Contracts containing more newline characters than this are skipped.
MAX_CONTRACT_NEWLINES = 200
# Destination of the JSON dump of the retained contracts.
NO_COMMENT_REPORT_PATH = "reports/aggregated/contract_no_comment.json"

# Number of contracts skipped because they exceed MAX_CONTRACT_NEWLINES.
counter_greater_than_200 = 0

# Retained contracts as (contract_name, contract_text, marker_lines) tuples.
contract_lines = []

contracts = veri.get_files(DATASET_DIR)

for contract_path, contract_name in contracts:
    # NOTE(review): find_occurrences presumably returns the file text and the
    # line numbers where the marker occurs; confirm against validation_library.
    contract, marker_lines = veri.find_occurrences(contract_path, REPORT_MARKER)
    print(contract_name, marker_lines)

    if contract.count('\n') > MAX_CONTRACT_NEWLINES:
        counter_greater_than_200 += 1
        continue

    # Replace the marker lines with an empty string before storing the contract.
    contract = veri.replace_lines_with_string(contract, marker_lines, '')
    contract_lines.append((contract_name, contract, marker_lines))

veri.print_json_report(NO_COMMENT_REPORT_PATH, contract_lines)



In [None]:
# Read HF_TOKEN via python-dotenv / the process environment.
load_dotenv()
token = os.getenv("HF_TOKEN")
# Never echo the token itself: cell outputs are stored inside the notebook file
# and would leak the credential to anyone the notebook is shared with.
print("HF_TOKEN loaded" if token else "HF_TOKEN is not set")

# Prompts to send to the model, and for each prompt the
# (contract_name, contract_text, line) it was built from (same order).
all_contracts = []
mapping = []

# Stage "VL": one prompt per stored line of every retained contract, with that
# line replaced by a fill-in-the-middle require statement.
# TODO: decide whether the statement should be generated again or not.
for contract_name, contract, lines in contract_lines:
    for line in lines:
        prompt_with_fill = veri.replace_lines_with_string(contract, [line], 'require(<FILL_ME>);')
        all_contracts.append(prompt_with_fill)
        mapping.append((contract_name, contract, line))

In [None]:
# Hub repository holding the fine-tuned adapter; its config names the base model.
ADAPTER_REPO = "GGmorello/FLAMES-100k-multi-gpu"

config = PeftConfig.from_pretrained(ADAPTER_REPO, token=token)

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute.
nf4_quantization = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Base model, loaded quantised; weights are cached under $TMPDIR when it is set.
ft_model = LlamaForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    token=token,
    quantization_config=nf4_quantization,
    cache_dir=os.environ.get("TMPDIR"),
)

# Adapter-wrapped model built on top of the base model, then moved to the GPU.
ft_model_20 = PeftModel.from_pretrained(ft_model, ADAPTER_REPO, token=token)
ft_model_20 = ft_model_20.to('cuda')

# Tokenizer of the base model (an earlier revision used "huggyllama/llama-7b").
llama_tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path, token=token)

In [None]:
# Smoke test: generate a completion for a single VL prompt before the full run.
PROMPT = all_contracts[2]

encoded_prompt = llama_tokenizer(PROMPT, return_tensors="pt")
input_ids = encoded_prompt["input_ids"].to('cuda')
prompt_length = input_ids.shape[1]

# Generation goes through `ft_model`; `ft_model_20` is the PeftModel wrapper
# created around this same object.
generated_ids = ft_model.generate(input_ids, max_new_tokens=128)

# Keep only the tokens produced after the prompt and decode the first sequence.
new_token_ids = generated_ids[:, prompt_length:]
filling = llama_tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

In [None]:
from tqdm import tqdm

# Generated fillings for the VL prompts, in the same order as `all_contracts`.
results_20 = []

chunk_size = 8
for chunk_start in tqdm(range(0, len(all_contracts), chunk_size)):
    # Prompts are still generated one at a time; the chunk only sets how many
    # prompts are processed per progress-bar step.
    for prompt in all_contracts[chunk_start:chunk_start + chunk_size]:
        tok = {
            key: tensor.to('cuda')
            for key, tensor in llama_tokenizer(
                prompt, return_tensors='pt', truncation=True, max_length=2048
            ).items()
        }
        prompt_length = tok['input_ids'].shape[1]

        with torch.no_grad():
            generated_ids = ft_model.generate(
                **tok,
                max_new_tokens=256,
                pad_token_id=llama_tokenizer.eos_token_id,
            )

        # Decode only the continuation, i.e. the tokens after the prompt.
        decoded = llama_tokenizer.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
        )
        results_20.append(decoded[0])

        # Drop the per-prompt tensors and release cached GPU memory before the next prompt.
        del tok
        del generated_ids
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Pad with empty strings so every prompt has a (possibly empty) result.
missing_results = len(all_contracts) - len(results_20)
results_20 += [''] * missing_results
print(len(all_contracts))
replaced_contracts_VL = []

# contract_name -> list of (contract_text, annotations); annotations maps
# "VL" / "pre" / "post" to a (line, replacement_text) pair.
contracts_with_results = defaultdict(list)

for (contract_name, contract, line), generated in zip(mapping, results_20):
    pre, post = veri.find_function_bounds(contract, line)
    entry_annotations = {
        "VL": (line, f'require({generated});'),
        # "pre" and "post" start with empty text; later stages fill them in.
        "pre": (pre + 1, ""),
        "post": (post, ""),
    }
    contracts_with_results[contract_name].append((contract, entry_annotations))

In [None]:
# Start a fresh prompt list / mapping for the "pre" stage.
all_contracts = []
mapping = []

# Stage "pre": for every stored entry, apply the generated VL statement, open
# an empty line at the "pre" position and ask the model to fill a require there.
for contract_name, entries in contracts_with_results.items():
    for contract, annotations in entries:
        vl_line, vl_text = annotations["VL"]
        line, _ = annotations["pre"]  # line where the "pre" require(...) goes

        # Build from this entry's own contract text. The previous code read
        # `contract_with_VL` before it was ever assigned, which raises
        # NameError on a fresh kernel and reuses a stale contract otherwise.
        # The replacement text is passed as a plain string, as in every other
        # replace_lines_with_string call in this notebook.
        contract_with_VL = veri.replace_lines_with_string(contract, [vl_line], vl_text)
        contract_with_VL = veri.insert_empty_line(contract_with_VL, line)
        prompt_with_fill = veri.replace_lines_with_string(
            contract_with_VL,
            [line],
            'require(<FILL_ME>);',
        )
        all_contracts.append(prompt_with_fill)
        mapping.append((contract_name, contract, line))

In [None]:
# Smoke test: generate a completion for a single "pre" prompt before the full run.
PROMPT = all_contracts[2]

encoded_prompt = llama_tokenizer(PROMPT, return_tensors="pt")
input_ids = encoded_prompt["input_ids"].to('cuda')
prompt_length = input_ids.shape[1]

# Generation goes through `ft_model`; `ft_model_20` is the PeftModel wrapper
# created around this same object.
generated_ids = ft_model.generate(input_ids, max_new_tokens=128)

# Keep only the tokens produced after the prompt and decode the first sequence.
new_token_ids = generated_ids[:, prompt_length:]
filling = llama_tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

In [None]:
from tqdm import tqdm

# Generated fillings for the "pre" prompts, in the same order as `all_contracts`.
results_20 = []

chunk_size = 8
for chunk_start in tqdm(range(0, len(all_contracts), chunk_size)):
    # Prompts are still generated one at a time; the chunk only sets how many
    # prompts are processed per progress-bar step.
    for prompt in all_contracts[chunk_start:chunk_start + chunk_size]:
        tok = {
            key: tensor.to('cuda')
            for key, tensor in llama_tokenizer(
                prompt, return_tensors='pt', truncation=True, max_length=2048
            ).items()
        }
        prompt_length = tok['input_ids'].shape[1]

        with torch.no_grad():
            generated_ids = ft_model.generate(
                **tok,
                max_new_tokens=256,
                pad_token_id=llama_tokenizer.eos_token_id,
            )

        # Decode only the continuation, i.e. the tokens after the prompt.
        decoded = llama_tokenizer.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
        )
        results_20.append(decoded[0])

        # Drop the per-prompt tensors and release cached GPU memory before the next prompt.
        del tok
        del generated_ids
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Pad with empty strings so every "pre" prompt has a (possibly empty) result.
results_20.extend([''] * (len(all_contracts) - len(results_20)))

# Flatten contracts_with_results into (contract_name, entry) pairs, in the same
# iteration order that was used to build the prompts.
flat_entries = [
    (contract_name, entry)
    for contract_name, entries in contracts_with_results.items()
    for entry in entries
]

# Store each generated condition in the "pre" slot of its entry, keeping the line number.
for generated, _, (_, (_, annotations)) in zip(results_20, mapping, flat_entries):
    pre_line = annotations["pre"][0]
    annotations["pre"] = (pre_line, f'require({generated});')

    

In [None]:
# Start a fresh prompt list / mapping for the "post" stage.
all_contracts = []
mapping = []

# Stage "post": for every stored entry, apply the generated VL and "pre"
# statements, then ask the model to fill a require at the "post" position.
for contract_name, entries in contracts_with_results.items():
    for contract, annotations in entries:
        vl_line, vl_text = annotations["VL"]
        pre_line, pre_text = annotations["pre"]
        line, _ = annotations["post"]

        # Build from this entry's own contract text. The previous code read
        # `contract_with_VL` before it was ever assigned, which raises
        # NameError on a fresh kernel and reuses a stale contract otherwise.
        # Replacement texts are passed as plain strings, as in every other
        # replace_lines_with_string call in this notebook.
        contract_with_VL = veri.replace_lines_with_string(contract, [vl_line], vl_text)
        contract_PV = veri.insert_empty_line(contract_with_VL, pre_line)
        contract_PV = veri.replace_lines_with_string(contract_PV, [pre_line], pre_text)

        # The "post" slot is addressed at line + 1 because of the empty line
        # added above for "pre".
        # NOTE(review): assumes insert_empty_line shifts later lines down by one; confirm.
        contract_PV = veri.insert_empty_line(contract_PV, line + 1)
        prompt_with_fill = veri.replace_lines_with_string(contract_PV, [line + 1], 'require(<FILL_ME>);')
        all_contracts.append(prompt_with_fill)
        mapping.append((contract_name, contract, line))

In [None]:
# Smoke test: generate a completion for a single "post" prompt before the full run.
PROMPT = all_contracts[2]

encoded_prompt = llama_tokenizer(PROMPT, return_tensors="pt")
input_ids = encoded_prompt["input_ids"].to('cuda')
prompt_length = input_ids.shape[1]

# Generation goes through `ft_model`; `ft_model_20` is the PeftModel wrapper
# created around this same object.
generated_ids = ft_model.generate(input_ids, max_new_tokens=128)

# Keep only the tokens produced after the prompt and decode the first sequence.
new_token_ids = generated_ids[:, prompt_length:]
filling = llama_tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]

In [None]:
from tqdm import tqdm

# Generated fillings for the "post" prompts, in the same order as `all_contracts`.
results_20 = []

chunk_size = 8
for chunk_start in tqdm(range(0, len(all_contracts), chunk_size)):
    # Prompts are still generated one at a time; the chunk only sets how many
    # prompts are processed per progress-bar step.
    for prompt in all_contracts[chunk_start:chunk_start + chunk_size]:
        tok = {
            key: tensor.to('cuda')
            for key, tensor in llama_tokenizer(
                prompt, return_tensors='pt', truncation=True, max_length=2048
            ).items()
        }
        prompt_length = tok['input_ids'].shape[1]

        with torch.no_grad():
            generated_ids = ft_model.generate(
                **tok,
                max_new_tokens=256,
                pad_token_id=llama_tokenizer.eos_token_id,
            )

        # Decode only the continuation, i.e. the tokens after the prompt.
        decoded = llama_tokenizer.batch_decode(
            generated_ids[:, prompt_length:],
            skip_special_tokens=True,
        )
        results_20.append(decoded[0])

        # Drop the per-prompt tensors and release cached GPU memory before the next prompt.
        del tok
        del generated_ids
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Pad with empty strings so every "post" prompt has a (possibly empty) result.
results_20.extend([''] * (len(all_contracts) - len(results_20)))

# Flatten contracts_with_results into (contract_name, entry) pairs, in the same
# iteration order that was used to build the prompts.
flat_entries = [
    (contract_name, entry)
    for contract_name, entries in contracts_with_results.items()
    for entry in entries
]

# Store each generated condition in the "post" slot, using the line recorded in `mapping`.
for generated, (_, _, line), (_, (_, annotations)) in zip(results_20, mapping, flat_entries):
    annotations["post"] = (line, f'require({generated});')

# Write the aggregated VL / pre / post annotations to the JSON report.
veri.print_json_report(
    "reports/aggregated/contracts_aggregated_results.json",
    contracts_with_results,
)
    