In [1]:
from llama2 import *
from typing import List, Literal, Optional, Tuple, TypedDict
from pprint import pprint
import pandas as pd
import numpy as np
import datasets
import evaluate

from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Default run configuration; the "Parameters" cell that follows may override it.
model_size = 'int4'  # handed to LlamaModel as `model_resolution`
max_samples = 100  # cap on evaluated examples; non-positive means no cap

In [3]:
# Parameters
# These assignments replace the defaults set in the previous cell for this run.
model_size = "int8"  # value handed to LlamaModel as `model_resolution`
max_samples = -1  # non-positive, so eval_dataset applies no sample cap


## 1 - Load model

In [4]:
# Build the model wrapper for the Llama-2 7B chat checkpoint, using the
# resolution selected in the parameter cells above.
model_name = "meta-llama/Llama-2-7b-chat-hf"
model = LlamaModel(model_name=model_name, model_resolution=model_size)

Loading checkpoint shards:   0%|                                                        | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:  50%|████████████████████████                        | 1/2 [00:01<00:01,  1.06s/it]

Loading checkpoint shards: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.48it/s]

Loading checkpoint shards: 100%|████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.36it/s]




## 2 - Load data

In [5]:
# Fetch the validation split of HellaSwag through the `datasets` library.
dataset = datasets.load_dataset('hellaswag', split='validation')

In [6]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 10042
})

In [7]:
# Look at the first record to see which fields an example carries.
dataset[0]

{'ind': 24,
 'activity_label': 'Roof shingle removal',
 'ctx_a': 'A man is sitting on a roof.',
 'ctx_b': 'he',
 'ctx': 'A man is sitting on a roof. he',
 'endings': ['is using wrap to wrap a pair of skis.',
  'is ripping level tiles off.',
  "is holding a rubik's cube.",
  'starts pulling up roofing on a roof.'],
 'source_id': 'activitynet~v_-JhWjGDPHMY',
 'split': 'val',
 'split_type': 'indomain',
 'label': '3'}

In [8]:
# Sanity check: every example must offer the same number of candidate endings.
n_options = list(len(example['endings']) for example in dataset)
assert max(n_options) == min(n_options)
(max(n_options), min(n_options))  # every example comes with exactly four endings

(4, 4)

In [9]:
# Prototype the prompt on one example before wrapping the logic in a Dataset.
x = dataset[2]

# Label each candidate ending with a lowercase letter: "\ta) ...", "\tb) ...", ...
options = [
    f"\t{chr(ord('a') + idx)}) {ending}"
    for idx, ending in enumerate(x['endings'])
]
endings = '\n'.join(options)

# The prompt ends with an open parenthesis so that the model's next token is
# the letter of its chosen ending.
text = (
    f'{model.B_INST} You are solving an entailment task, given the situation respond with the most appropriate completion. \n'
    f'Think logically and step by step. {model.E_INST}\n'
    f'{x["activity_label"]}. {x["ctx"]}\n'
    '\n'
    f'{endings}\n'
    '\n'
    'Answer:\n'
    '\n'
    'The correct answer is ('
)

print(text)

[INST] You are solving an entailment task, given the situation respond with the most appropriate completion. 
Think logically and step by step. [/INST]
Canoeing. Two women in a child are shown in a canoe while a man pulls the canoe while standing in the water, with other individuals visible in the background. the child and a different man

	a) are then shown paddling down a river in a boat while a woman talks.
	b) are driving the canoe, they go down the river flowing side to side.
	c) sit in a canoe while the man paddles.
	d) walking go down the rapids, while the man in his helicopter almost falls and goes out of canoehood.

Answer:

The correct answer is (


In [10]:
# Let the model continue the prototype prompt and show the returned text.
output = model.generate(text)
print(output)

<s> [INST] You are solving an entailment task, given the situation respond with the most appropriate completion. 
Think logically and step by step. [/INST]
Canoeing. Two women in a child are shown in a canoe while a man pulls the canoe while standing in the water, with other individuals visible in the background. the child and a different man

	a) are then shown paddling down a river in a boat while a woman talks.
	b) are driving the canoe, they go down the river flowing side to side.
	c) sit in a canoe while the man paddles.
	d) walking go down the rapids, while the man in his helicopter almost falls and goes out of canoehood.

Answer:

The correct answer is (b) are driving the canoe, they go down the river flowing side to side.

Explanation:

Based on the given situation, the most appropriate completion is (b) are driving the canoe, they go down the river flowing side to side. This is because the man is shown pulling the canoe while standing in the water, indicating that he is the on

In [11]:
class HellaSwag(Dataset):
    """Map-style dataset that renders HellaSwag records as multiple-choice prompts.

    Each item is a ``(prompt, label)`` pair: the prompt lists the candidate
    endings as lettered options and stops right after ``"The correct answer is ("``,
    and the label is the record's ``'label'`` field converted to ``int``.
    The prompt carries no instruction preamble or ``[INST]`` markers.
    """

    def __init__(self, split: str):
        """Load one split of HellaSwag.

        Args:
            split: Split name forwarded to ``datasets.load_dataset``
                (for example ``'validation'``).
        """
        # Honour the requested split; it used to be ignored in favour of a
        # hard-coded 'validation'.
        self.dataset = datasets.load_dataset('hellaswag', split=split)

    def __len__(self):
        """Return the number of records in the loaded split."""
        return len(self.dataset)

    def __getitem__(self, idx: int):
        """Build the prompt for record ``idx``.

        Returns:
            A ``(text, label)`` tuple where ``text`` is the prompt string and
            ``label`` is the integer value of the record's ``'label'`` field.

        Raises:
            ValueError: If the record's label cannot be parsed as an integer.
        """
        x = self.dataset[idx]
        # Prefix every ending with a tab and a lowercase letter: a), b), c), ...
        options = [f"\t{chr(ord('a')+i)}) {option}" for i, option in enumerate(x['endings'])]
        endings = '\n'.join(options)
        # The trailing "(" invites the model to emit the option letter next.
        text = (f'{x["activity_label"]} {x["ctx"]}\n\n'
                f'{endings}\n\n'
                'Answer:\n\nThe correct answer is (')

        y = x['label']
        return text, int(y)

In [12]:
def eval(model: LlamaModel, x):
    """Choose the most probable answer letter for one prompt.

    The prompt is tokenized, run through ``model.model``, and the logits at the
    last sequence position are restricted to the token ids of 'a', 'b', 'c'
    and 'd' before a softmax is applied over those four entries.

    Args:
        model: Wrapper exposing ``tokenize``, ``tokenizer`` and ``model``.
        x: Prompt passed to ``model.tokenize`` (presumably a single string —
            confirm against the wrapper).

    Returns:
        ``(index, probability)``: the position of the winning letter in
        ``['a', 'b', 'c', 'd']`` and its softmax probability. The argmax is
        taken over the flattened tensor, so a batch of one is assumed.
    """
    answer_letters = ['a', 'b', 'c', 'd']
    with torch.no_grad():
        input_ids = model.tokenize(x).to(model.model.device)
        logits = model.model(input_ids)['logits']
        letter_ids = model.tokenizer.convert_tokens_to_ids(answer_letters)
        # Only the next-token scores of the four letters compete in the softmax.
        probs = logits[:, -1, letter_ids].softmax(dim=1)
    best_index = probs.argmax().item()
    best_prob = probs.max().item()
    return best_index, best_prob


def eval_dataset(model, data, max_samples):
    """Run ``eval`` over the leading items of ``data`` and tabulate the results.

    Args:
        model: Passed unchanged to ``eval`` for every item.
        data: Sized, indexable collection whose items unpack as ``(x, y)``.
        max_samples: When positive, at most this many items are evaluated;
            otherwise all of ``data`` is used.

    Returns:
        A DataFrame with one row per evaluated item and the columns
        ``y`` (ground truth), ``y_pred`` (first element of the ``eval`` result)
        and ``prob`` (second element of the ``eval`` result).
    """
    n_items = len(data)
    if max_samples > 0:
        n_items = min(n_items, max_samples)

    rows = []
    for idx in range(n_items):
        sample, target = data[idx]
        outcome = eval(model, sample)
        rows.append((target, outcome[0], outcome[1]))

    return pd.DataFrame(rows, columns=['y', 'y_pred', 'prob'])


def get_metrics(df: pd.DataFrame) -> dict:
    """Compute accuracy and a macro-averaged F1 score for four-way predictions.

    The F1 reported here is the harmonic mean of the macro-averaged precision
    and the macro-averaged recall (not the mean of per-class F1 scores).

    Args:
        df: Frame with integer-valued columns ``y`` (ground truth) and
            ``y_pred`` (prediction), both in ``0..3``.

    Returns:
        ``{'accuracy': str, 'f1': str}`` with both values formatted to four
        decimals. For an empty frame both values are ``'nan'``; when no
        prediction is correct, ``f1`` is ``'0.0000'``.
    """
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to score: report undefined metrics without dividing by zero.
        return {'accuracy': f'{float("nan"):.4f}', 'f1': f'{float("nan"):.4f}'}

    accuracy = (df.y_pred == df.y).sum() / n_rows

    # Confusion matrix indexed as [predicted class, true class].
    mat = torch.zeros(4, 4, dtype=int)
    for y_pred, y_true in zip(df.y_pred, df.y):
        mat[int(y_pred), int(y_true)] += 1

    # Row i holds everything predicted as i (precision denominator); column i
    # holds everything labelled i (recall denominator). The epsilon keeps
    # classes that never occur from dividing by zero.
    precision = [mat[i, i] / (mat[i, :].sum() + 1e-6) for i in range(4)]
    recall = [mat[i, i] / (mat[:, i].sum() + 1e-6) for i in range(4)]

    mp = sum(precision) / 4
    mr = sum(recall) / 4

    # With no true positives both means are zero; define F1 as 0 instead of
    # letting 0/0 produce NaN.
    if (mp + mr).item() > 0:
        f1 = (2 * (mp * mr) / (mp + mr)).item()
    else:
        f1 = 0.0

    return {'accuracy': f'{accuracy:.4f}', 'f1': f'{f1:.4f}'}

## 3 - Run inference

In [13]:
# Score the validation split (capped by max_samples when it is positive), then
# summarise the distribution of the winning-choice probabilities.
df = eval_dataset(model, HellaSwag('validation'), max_samples)
df.prob.describe()

count    10042.000000
mean         0.482413
std          0.174496
min          0.253921
25%          0.356611
50%          0.427546
75%          0.552486
max          0.999993
Name: prob, dtype: float64

In [14]:
# Report the metrics on progressively more confident subsets: keep only the
# predictions whose winning probability exceeds each cutoff 0.0, 0.1, ..., 0.9.
for threshold in np.linspace(0, 1, 10, endpoint=False):
    confident = df[df.prob > threshold]
    print(f'Class prob > {threshold:.1f}: {get_metrics(confident)}')

Class prob > 0.0: {'accuracy': '0.4094', 'f1': '0.4148'}


Class prob > 0.1: {'accuracy': '0.4094', 'f1': '0.4148'}


Class prob > 0.2: {'accuracy': '0.4094', 'f1': '0.4148'}


Class prob > 0.3: {'accuracy': '0.4174', 'f1': '0.4236'}


Class prob > 0.4: {'accuracy': '0.4837', 'f1': '0.4945'}
Class prob > 0.5: {'accuracy': '0.5693', 'f1': '0.5819'}


Class prob > 0.6: {'accuracy': '0.6503', 'f1': '0.6592'}
Class prob > 0.7: {'accuracy': '0.7219', 'f1': '0.7281'}
Class prob > 0.8: {'accuracy': '0.7959', 'f1': '0.7990'}
Class prob > 0.9: {'accuracy': '0.8515', 'f1': '0.8490'}
