In [None]:
# Mount the user's Google Drive inside the Colab VM at /content/drive so the
# project files stored on Drive are reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on the mounted Drive the working directory so that
# relative paths used afterwards resolve against it.
import os
dir_path = '/content/drive/MyDrive/Uncertainty_Quantification'
os.chdir(dir_path)

In [None]:
# List the contents of the current working directory as a sanity check.
%ls

data_normal.csv             Sample_data_creation_stress_index.ipynb
data_with_stress_index.csv  Uncertainty_Quantification_llama.ipynb


In [None]:
%%capture
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install evaluate
!pip install -qqq trl==0.7.1
!pip install torch

In [None]:
import torch
import time
import evaluate
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [6]:
# Checkpoint to load from the Hugging Face Hub.
model_id = "NousResearch/Llama-2-7b-hf"
# Alternative checkpoint that was tried:
# model_id = "meta-llama/Llama-2-13b-chat-hf"

# 4-bit NF4 quantization with double quantization; matmuls are computed in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: reuse the EOS token for padding and pad on the right-hand side.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized causal LM; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [39]:
from transformers import StoppingCriteria
class EosListStoppingCriteria(StoppingCriteria):
    """Stop generation once a sequence ends with a given run of token ids.

    Args:
        eos_sequence: Token ids that must appear, in this exact order, as the
            last ``len(eos_sequence)`` tokens of a sequence for generation to
            stop. The default ``(319, 350, 360, 315)`` are the ids this
            notebook's tokenizer returns (after the leading id 1) for
            ``'A'``, ``'B'``, ``'D'`` and ``'C'`` respectively.

    NOTE(review): with the default value the criterion only fires when the
    four tokens A, B, D, C are generated back to back. If the intent is to
    stop as soon as *any one* answer letter is produced, this needs a
    membership test on the last token instead -- confirm the intended
    behaviour before relying on it.
    """

    def __init__(self, eos_sequence = (319, 350, 360, 315)):
        # Copy into a fresh list: avoids sharing one mutable default between
        # instances and keeps the list-vs-list comparison in __call__ valid
        # for any iterable of ints the caller passes.
        self.eos_sequence = list(eos_sequence)

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if any row of ``input_ids`` ends with ``eos_sequence``."""
        window = len(self.eos_sequence)
        if window == 0:
            # `-0` would slice the whole history instead of "the last 0 tokens";
            # an empty stop sequence can never be matched meaningfully.
            return False
        # One Python list of trailing token ids per row of the batch.
        last_ids = input_ids[:, -window:].tolist()
        return self.eos_sequence in last_ids

In [36]:
# Inspect how the tokenizer splits the literal string '<DONE>' into token ids
# (it is not a single token; the saved output shows five ids).
tokenizer.encode('<DONE>')

[1, 529, 29928, 12413, 29958]

In [42]:
import warnings
# NOTE: this silences *all* warnings for the rest of the session, including
# transformers' generation warnings; narrow the filter when debugging.
warnings.filterwarnings('ignore')

# Prompt with one worked example (question + answer) followed by the question
# the model should answer. Plain string: there are no placeholders to format.
prompt = """
###Instruction: Act as a science expert and help in answering the following multiple choice question. The answer should be one of the option choices.
Follow the pattern from the first example and answer the second question accordingly.

###Question:
This is a question from high school biology.A piece of potato is dropped into a beaker of pure water.
Which of the following describes the activity after the potato is immersed into the water?
(A) Water moves from the potato into the surrounding water.
(B) Water moves from the surrounding water into the potato.
(C) Potato cells plasmolyze.
(D) Solutes in the water move into the potato.

###Answer:
B

###Question:
You are the world’s best expert in high school biology. From the solubility rules, which of the following is true?
(A) All hydroxides are soluble
(B) All sulfates are soluble
(C) All chlorides, bromides, and iodides are soluble
(D) All ammonium-containing compounds are soluble

###Answer:
"""

# Tokenize and place the tensors on the same device as the model's inputs.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

# Pass the attention mask explicitly: the pad token is the EOS token, so
# generate() cannot reliably infer the mask from the ids alone.
generated_ids = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    temperature=0.2,
    do_sample=True,
    max_new_tokens=8,
    stopping_criteria = [EosListStoppingCriteria()],
    pad_token_id=tokenizer.pad_token_id,
)[0]

# Decode the full sequence (prompt + continuation), keeping special tokens visible.
output = tokenizer.decode(generated_ids, skip_special_tokens=False)

# 99 dashes, same separator as before.
dash_line = '-' * 99
print(dash_line)
# NOTE(review): the label says ZERO SHOT, but the prompt above contains one
# solved example -- confirm whether the label should read ONE SHOT.
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
<s> 

###Instruction: Act as a science expert and help in answering the following multiple choice question. The answer should be one of the option choices. 
Follow the pattern from the first example and answer the second question accordingly.

###Question:
This is a question from high school biology.A piece of potato is dropped into a beaker of pure water. 
Which of the following describes the activity after the potato is immersed into the water?
(A) Water moves from the potato into the surrounding water.
(B) Water moves from the surrounding water into the potato.
(C) Potato cells plasmolyze.
(D) Solutes in the water move into the potato.

###Answer:
B

###Question:
You are the world’s best expert in high school biology. From the solubility rules, which of the following is true?
(A) All hydroxides are soluble
(B) All sulfates are soluble
(C) All chlorides, b

In [31]:
# Token ids produced by encode() for each answer letter, in the order B, D, A, C.
tuple(tokenizer.encode(letter) for letter in ('B', 'D', 'A', 'C'))

([1, 350], [1, 360], [1, 319], [1, 315])

In [43]:
# Same one-example prompt as in the generation cell; plain string because
# there are no placeholders to format.
prompt = """
###Instruction: Act as a science expert and help in answering the following multiple choice question. The answer should be one of the option choices.
Follow the pattern from the first example and answer the second question accordingly.

###Question:
This is a question from high school biology.A piece of potato is dropped into a beaker of pure water.
Which of the following describes the activity after the potato is immersed into the water?
(A) Water moves from the potato into the surrounding water.
(B) Water moves from the surrounding water into the potato.
(C) Potato cells plasmolyze.
(D) Solutes in the water move into the potato.

###Answer:
B

###Question:
You are the world’s best expert in high school biology. From the solubility rules, which of the following is true?
(A) All hydroxides are soluble
(B) All sulfates are soluble
(C) All chlorides, bromides, and iodides are soluble
(D) All ammonium-containing compounds are soluble

###Answer:
"""

# Tokenize and place the tensors on the same device as the model's inputs.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

# Pure inference: disable autograd so no graph is built and kept alive for the
# whole forward pass (the logits previously carried a grad_fn).
with torch.no_grad():
    outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
    )

# Logits at the last prompt position, i.e. the scores for the next token;
# indexing [:, -1] keeps the batch dimension.
next_token_logits = outputs.logits[:, -1]

In [73]:
# Largest next-token logit for the first (and only) sequence in the batch.
# argmax is taken over that row only: argmax() on the whole 2-D tensor returns
# a flattened index, which is a valid column index only when batch size is 1.
next_token_logits[0, next_token_logits[0].argmax()]

tensor(21.8125, grad_fn=<SelectBackward0>)

In [66]:
# Answer letters whose next-token scores we want to compare.
tokens_of_interest = list('ABCD')
# Vocabulary ids of those letters looked up as literal token strings.
token_indices = tokenizer.convert_tokens_to_ids(tokens_of_interest)

In [70]:
# Map each answer letter to its next-token logit (as a Python float) for the
# first sequence in the batch. Despite the name, the values are logits.
indices_in_logits = dict(
    zip(tokens_of_interest, next_token_logits[0, token_indices].tolist())
)

In [71]:
# Display the answer-letter -> next-token-logit mapping.
indices_in_logits

{'A': 21.578125, 'B': 20.78125, 'C': 21.8125, 'D': 21.5}