In [None]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import io
import sys
import time
import json
import pandas as pd
import datasets
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import torch
from collections import defaultdict
import gc

from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
from Evaluator import Evaluator

import cloudpickle as pickle

def save_to_pickle(obj, filepath):
    """Serialize ``obj`` and write it to ``filepath``, replacing any existing file.

    Args:
        obj: The object to serialize.
        filepath: Destination path; opened in binary write mode.
    """
    with open(filepath, mode='wb') as sink:
        pickle.dump(obj, sink)

def load_from_pickle(filepath):
    """Read ``filepath`` in binary mode and return the object unpickled from it.

    Unpickling can execute arbitrary code, so only use this on files you trust.

    Args:
        filepath: Path of the pickle file to read.

    Returns:
        The deserialized object.
    """
    with open(filepath, mode='rb') as source:
        restored = pickle.load(source)
    return restored

def _make_r_io_base(f, mode: str):
    """Return ``f`` unchanged if it is already a file object, otherwise open it as a path.

    Args:
        f: An ``io.IOBase`` instance or a path accepted by ``open``.
        mode: Mode passed to ``open`` when ``f`` is a path.
    """
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f

def jload(f, mode="r"):
    """Load a .json file into a dictionary.

    Args:
        f: Path to a JSON file, or an already-open file object.
        mode: Mode used to open ``f`` when it is a path.

    Returns:
        The parsed JSON content.

    Raises:
        json.JSONDecodeError: If the content is not valid JSON.

    The file object is always closed before returning, including when parsing
    fails, so a malformed file does not leak an open handle. As before, a file
    object supplied by the caller is closed as well.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        f.close()

def Average(lst):
    """Return the arithmetic mean of ``lst``.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    total = sum(lst)
    count = len(lst)
    return total / count

# Tokenizer is fetched by the base Mistral model id; padding is applied on the left.
token_model_dir = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(token_model_dir,padding_side='left')
# Model weights are read from a local output directory
# (presumably the fine-tuned run — TODO confirm).
model_dir = "/work/ree398/LLM-Workshop/mistral_7b_output_dir"
model = AutoModelForCausalLM.from_pretrained(model_dir, device_map="auto")

# NOTE(review): `generator` is not referenced by any later code visible in this
# notebook; generation below calls model.generate directly.
generator = TextGenerationPipeline(
    model=model, tokenizer=tokenizer)

# Prompt template with three positional slots, filled in order:
# instruction, input, response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Reuse the EOS token as the padding token, and keep the EOS string for prompt building.
tokenizer.pad_token = tokenizer.eos_token
EOS_TOKEN = tokenizer.eos_token
def formatting_prompts_func(examples):
    """Render a batch of examples into full prompt strings.

    Args:
        examples: Mapping with parallel sequences under the keys
            "instruction", "input" and "output".

    Returns:
        A dict with a single key "text" holding one rendered string per example.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Every sample must end with EOS_TOKEN, otherwise generation will go on forever.
    rendered = [
        alpaca_prompt.format(instr, context, response) + EOS_TOKEN
        for instr, context, response in rows
    ]
    return {"text": rendered}


# Location of the JSON dataset used for evaluation.
dataset_path = "/work/ree398/LLM-Workshop/alpaca_data.json"
# NOTE(review): `data` is not used by any later code visible in this notebook.
data = jload(dataset_path)
dataset = load_dataset("json", data_files=dataset_path)
# Adds a "text" column holding the rendered prompt + response + EOS for every row.
dataset = dataset.map(formatting_prompts_func, batched=True)

# 80/20 split with a fixed seed so the held-out rows are the same on every run.
# The two halves returned by sklearn are rebuilt as datasets.Dataset objects via pandas.
train_dataset, test_dataset = train_test_split(dataset["train"], test_size=0.2, random_state=42)
train_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=train_dataset))
test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(data=test_dataset))

evaluator = Evaluator()

# Decoding hyperparameter grid to sweep.
top_k = [2, 20, 50, 100]
num_beams = [1, 2, 10, 20]
temps = [0.1, 0.5, 1, 1.5]

# Optional cap on how many test examples are scored per combination.
# None scores the whole test split; set an int for a quick run, since the full
# sweep runs generation len(top_k) * len(num_beams) * len(temps) times per example.
max_eval_samples = None
# Set to True to print the averaged scores after each combination.
verbose = False

# results[top_k][num_beams][temperature] -> {'BLEU': ..., 'ROUGE': ..., 'BERT': ...}
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))

# Iterate over the ROWS of the test split. len(test_dataset[0]) would be the
# number of columns in a single row, which silently limits the evaluation to
# the first handful of examples.
num_eval = len(test_dataset)
if max_eval_samples is not None:
    num_eval = min(num_eval, max_eval_samples)

# Sampling is stochastic; seed once so the sweep is repeatable.
torch.manual_seed(42)

for k in top_k:
    for beam in num_beams:
        for temp in temps:
            # Reset metrics lists for the current combination
            bleu_list = []
            rouge_list = []
            bert_list = []
            for i in range(num_eval):
                example = test_dataset[i]
                str_0 = example['output']
                # Inference prompt: leave the response slot empty and do NOT
                # append EOS, otherwise the model is told the sequence has
                # already ended before it starts answering.
                text = alpaca_prompt.format(example['instruction'], example['input'], '')
                tokens = tokenizer(text, return_tensors="pt", padding="longest", pad_to_multiple_of=8)
                # Follow the device the model was placed on instead of hard-coding 'cuda'.
                attention_mask = tokens['attention_mask'].to(model.device)
                input_ids = tokens['input_ids'].to(model.device)

                output = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    pad_token_id=tokenizer.eos_token_id,
                    max_new_tokens=256,
                    # top_k and temperature are sampling parameters; without
                    # do_sample=True they are ignored and every (k, temp)
                    # combination would decode identically.
                    do_sample=True,
                    top_k=k,
                    num_beams=beam,
                    temperature=temp
                )
                # generate() returns prompt + continuation; score only the newly
                # generated tokens against the reference answer.
                prompt_len = input_ids.shape[1]
                str_1 = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
                str_list = [str_0, str_1]
                evaluator.set_strs_list(str_list)
                bleu, rouge, bert = evaluator.PerformEval(verbose=False)
                bleu_list.append(bleu)
                rouge_list.append(rouge['f'])
                bert_list.append(bert[2])  # F1

                # Release cached GPU memory between examples.
                torch.cuda.empty_cache()
                gc.collect()

            # Store the results
            results[k][beam][temp] = {'BLEU': Average(bleu_list), 'ROUGE': Average(rouge_list), 'BERT': Average(bert_list)}
            if verbose:
                print(f"Bleu F1 Score: {Average(bleu_list)}")
                print(f"Rouge F1 Score: {Average(rouge_list)}")
                print(f"Bert Score: {Average(bert_list)}")

# Persist the full sweep so it can be inspected without re-running generation.
save_to_pickle(results, './results.pkl')

# Loop variables are named differently from the `top_k` / `num_beams` lists that
# define the sweep: reusing those names here would overwrite the lists with a
# single number and break any re-run of the sweep in the same kernel.
for k_value, beams_dict in results.items():
    for beam_value, temps_dict in beams_dict.items():
        for temp_value, metrics_dict in temps_dict.items():
            print(f"top_k={k_value}, num_beams={beam_value}, temperature={temp_value}")
            print(f"Metrics: {metrics_dict}")
            print("----")


In [1]:
import torch
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig
import json
from PIL import Image 

# Base LLaVA model id (used for the tokenizer and processor) and the local
# directory the fine-tuned weights are loaded from.
model_id = "llava-hf/llava-1.5-7b-hf"
safetensor_path = "/work/ree398/visual_research/out_dir"
# NOTE(review): this config is built but never passed to from_pretrained below.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)
# NOTE(review): the recorded output of this cell reports checkpoint keys such as
# `...base_layer.weight`, `...lora_A.default.weight` and `...lora_B.default.weight`
# as "not used when initializing" the model, which suggests the fine-tuned
# weights are not actually being loaded here — verify how the checkpoint was saved.
model = LlavaForConditionalGeneration.from_pretrained(safetensor_path)
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
# Make the processor use the tokenizer instance loaded above.
processor.tokenizer = tokenizer


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at /work/ree398/visual_research/out_dir were not used when initializing LlavaForConditionalGeneration: ['language_model.lm_head.base_layer.weight', 'language_model.lm_head.lora_A.default.weight', 'language_model.lm_head.lora_B.default.weight', 'language_model.model.layers.0.mlp.down_proj.base_layer.weight', 'language_model.model.layers.0.mlp.down_proj.base_layer.weight.absmax', 'language_model.model.layers.0.mlp.down_proj.base_layer.weight.quant_map', 'language_model.model.layers.0.mlp.down_proj.base_layer.weight.quant_state.bitsandbytes__fp4', 'language_model.model.layers.0.mlp.down_proj.lora_A.default.weight', 'language_model.model.layers.0.mlp.down_proj.lora_B.default.weight', 'language_model.model.layers.0.mlp.gate_proj.base_layer.weight', 'language_model.model.layers.0.mlp.gate_proj.base_layer.weight.absmax', 'language_model.model.layers.0.mlp.gate_proj.base_layer.weight.quant_map', 'language_model.model.layers.0.mlp.gate_proj.base_layer.weight

In [2]:
# Read the evaluation samples. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the life of the kernel.
with open('test_data.json', encoding='utf-8') as test_file:
    test_data = json.load(test_file)
print(len(test_data))

29


In [27]:
def extract_answer(text):
    """Return whatever follows the first "Answer:" marker in ``text``.

    Args:
        text: String that may contain the marker "Answer:".

    Returns:
        The text after the first marker with surrounding whitespace removed,
        or the message "Answer keyword not found" when the marker is absent.
    """
    marker = "Answer:"
    _, found, tail = text.partition(marker)
    if not found:
        # Marker missing: hand back the predefined error message.
        return "Answer keyword not found"
    return tail.strip()

In [3]:
# Smoke test: run `model` on the first test sample only.
# The first conversation turn supplies the prompt text.
texts = test_data[0]['conversations'][0]['value']
# [1:] drops the first character of the stored image path
# (presumably a leading "/" or "." — TODO confirm against test_data.json).
images = Image.open(test_data[0]['image'][1:])
inputs = processor(texts, images, return_tensors="pt", padding=True, truncation=True)
# Generate at most 3 new tokens.
out = model.generate(**inputs, max_new_tokens=3)
# The decoded string contains the prompt followed by the generated tokens
# (see the printed output of this cell).
decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)
print(decoded_output)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



Are intimate areas being touching on a character? Only answer yes or no. Answer: inclus inclus inclus


In [29]:
from Evaluator import Evaluator

e = Evaluator()
# Reference: prompt + second conversation turn, reduced to the text after "Answer:".
str_0 = extract_answer(texts + test_data[0]['conversations'][1]['value'])
# Prediction: the decoded model output, reduced the same way.
str_1 = extract_answer(decoded_output)
print([str_0,str_1])
e.set_strs_list([str_0,str_1])
# NOTE(review): the recorded output shows ROUGE and BERT scores of 1.0 for the
# pair ['No', 'inclus inclus inclus'], which looks wrong — check how Evaluator
# pairs the two strings.
e.PerformEval(False)

['No', 'inclus inclus inclus']


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(1.646211035903463e-231,
 {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
 (tensor([1.]), tensor([1.]), tensor([1.])))

In [26]:
# NOTE(review): this cell's execution count (26) is lower than that of the cell
# assigning str_1 via extract_answer (29), so the recorded output reflects an
# earlier value of str_1 — re-run in order.
extract_answer(str_1)

'inclus inclus inclus'

In [22]:
# NOTE(review): execution count 22 predates the cell that currently assigns
# str_0, so the recorded output (the full prompt text) is stale.
str_0

'<image>\nAre intimate areas being touching on a character? Only answer yes or no. Answer:No'

In [30]:
# Load the model directly from the base model id (rather than the local
# fine-tuned directory) to serve as the comparison baseline.
# `model_id` is re-assigned to the same value used when loading the tokenizer and processor.
model_id = "llava-hf/llava-1.5-7b-hf"
model_og = LlavaForConditionalGeneration.from_pretrained(model_id)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
# Same single-sample check as for the fine-tuned model, but generating with `model_og`.
texts = test_data[0]['conversations'][0]['value']
# [1:] drops the first character of the stored image path
# (presumably a leading "/" or "." — TODO confirm against test_data.json).
images = Image.open(test_data[0]['image'][1:])
inputs = processor(texts, images, return_tensors="pt", padding=True, truncation=True)
# Generate at most 3 new tokens.
out = model_og.generate(**inputs, max_new_tokens=3)
decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)
print(decoded_output)

from Evaluator import Evaluator

e = Evaluator()
# Reference and prediction are both reduced to the text after "Answer:".
str_0 = extract_answer(texts + test_data[0]['conversations'][1]['value'])
str_1 = extract_answer(decoded_output)
print([str_0,str_1])
e.set_strs_list([str_0,str_1])
# NOTE(review): the recorded output shows ROUGE and BERT scores of 1.0 for the
# mismatching pair ['No', 'Yes'] — check how Evaluator pairs the two strings.
e.PerformEval(False)


Are intimate areas being touching on a character? Only answer yes or no. Answer: Yes
['No', 'Yes']


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(1.646211035903463e-231,
 {'r': 1.0, 'p': 1.0, 'f': 0.999999995},
 (tensor([1.]), tensor([1.]), tensor([1.])))

In [41]:

# Per-sample scores for the fine-tuned model.
bleu_list_ft = []
rouge_list_ft = []
bert_list_ft = []
for obj in test_data:
    #Fine Tuned model inference
    texts = obj['conversations'][0]['value']
    # [1:] drops the first character of the stored image path. The context
    # manager closes the image file once the processor has consumed it.
    with Image.open(obj['image'][1:]) as images:
        inputs = processor(texts, images, return_tensors="pt", padding=True, truncation=True)
    out = model.generate(**inputs, max_new_tokens=3)
    decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)
    # Reference answer must come from the CURRENT sample (`obj`); indexing
    # test_data[0] here would score every prediction against the first
    # sample's answer.
    str_0 = extract_answer(texts + obj['conversations'][1]['value'])
    str_1 = extract_answer(decoded_output)

    #Fine Tuned model evaluation
    str_list = [str_0, str_1]
    e.set_strs_list(str_list)
    bleu, rouge, bert = e.PerformEval(verbose=False)
    bleu_list_ft.append(bleu)
    rouge_list_ft.append(rouge['f'])
    bert_list_ft.append(bert[2])  # third element of the BERT result, as used elsewhere

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [39]:
# Display the per-sample BERT scores collected for the fine-tuned model.
# NOTE(review): this cell's execution count (39) is lower than that of the loop
# cell that fills the list (41), so the recorded output may be stale.
bert_list_ft

[tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.])]

In [42]:
# Per-sample scores for the original (baseline) model.
bleu_list_og = []
rouge_list_og = []
bert_list_og = []
for obj in test_data:
    #Original model inference
    texts = obj['conversations'][0]['value']
    # [1:] drops the first character of the stored image path. The context
    # manager closes the image file once the processor has consumed it.
    with Image.open(obj['image'][1:]) as images:
        inputs = processor(texts, images, return_tensors="pt", padding=True, truncation=True)
    out = model_og.generate(**inputs, max_new_tokens=3)
    decoded_output = tokenizer.decode(out[0], skip_special_tokens=True)
    # Reference answer must come from the CURRENT sample (`obj`); indexing
    # test_data[0] here would score every prediction against the first
    # sample's answer.
    str_0 = extract_answer(texts + obj['conversations'][1]['value'])
    str_1 = extract_answer(decoded_output)

    #Original model evaluation
    str_list = [str_0, str_1]
    e.set_strs_list(str_list)
    bleu, rouge, bert = e.PerformEval(verbose=False)
    bleu_list_og.append(bleu)
    rouge_list_og.append(rouge['f'])
    bert_list_og.append(bert[2])  # third element of the BERT result, as used elsewhere

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

In [43]:
# Display the per-sample BERT scores collected for the original model.
bert_list_og

[tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.]),
 tensor([1.])]