In [1]:
from llama2 import *
from typing import List, Literal, Optional, Tuple, TypedDict
from pprint import pprint
import pandas as pd
import datasets
import evaluate

from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

model = LlamaModel(
    model_name=model_name,
    model_resolution='int4'
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.32it/s]


In [3]:
dataset = datasets.load_dataset('hellaswag', split='validation')

In [4]:
dataset

Dataset({
    features: ['ind', 'activity_label', 'ctx_a', 'ctx_b', 'ctx', 'endings', 'source_id', 'split', 'split_type', 'label'],
    num_rows: 10042
})

In [5]:
dataset[0]

{'ind': 24,
 'activity_label': 'Roof shingle removal',
 'ctx_a': 'A man is sitting on a roof.',
 'ctx_b': 'he',
 'ctx': 'A man is sitting on a roof. he',
 'endings': ['is using wrap to wrap a pair of skis.',
  'is ripping level tiles off.',
  "is holding a rubik's cube.",
  'starts pulling up roofing on a roof.'],
 'source_id': 'activitynet~v_-JhWjGDPHMY',
 'split': 'val',
 'split_type': 'indomain',
 'label': '3'}

In [6]:
lengths = [len(x['endings']) for x in dataset]
max(lengths), min(lengths) # All endings in the dataset have exactly four options

(4, 4)

In [7]:
x = dataset[2]
options = [f"\t{chr(ord('a')+i)}) {option}" for i, option in enumerate(x['endings'])]
endings = '\n'.join(options)
text = f'''\
{x["activity_label"]}. {x['ctx']}

{endings}

Answer:

The correct answer is ('''

print(text)

Canoeing. Two women in a child are shown in a canoe while a man pulls the canoe while standing in the water, with other individuals visible in the background. the child and a different man

	a) are then shown paddling down a river in a boat while a woman talks.
	b) are driving the canoe, they go down the river flowing side to side.
	c) sit in a canoe while the man paddles.
	d) walking go down the rapids, while the man in his helicopter almost falls and goes out of canoehood.

Answer:

The correct answer is (


In [8]:
output = model.generate(text)
print(output)

<s> Canoeing. Two women in a child are shown in a canoe while a man pulls the canoe while standing in the water, with other individuals visible in the background. the child and a different man

	a) are then shown paddling down a river in a boat while a woman talks.
	b) are driving the canoe, they go down the river flowing side to side.
	c) sit in a canoe while the man paddles.
	d) walking go down the rapids, while the man in his helicopter almost falls and goes out of canoehood.

Answer:

The correct answer is (b) are driving the canoe, they go down the river flowing side to side.

Explanation:

In option (b), the two women are shown paddling a canoe down a river, flowing side to side. This is the correct answer as it matches the given image. In the image, the two women are shown paddling a canoe, with the river flowing side to side behind them.

Option (a) is incorrect as the man is not shown paddling a canoe. In the image, the man is shown standing in the water while pulling the


In [9]:
tok_ids = model.tokenize(text)
tok_ids
output = model.model(tok_ids.cuda())

In [10]:
choices = model.tokenizer.convert_tokens_to_ids(['a','b','c','d'])
torch.softmax(output['logits'][:,-1,choices], dim=1)


tensor([[0.1877, 0.5782, 0.1508, 0.0833]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)

In [4]:
class HellaSwag(Dataset):
    def __init__(self, split: str):
        self.dataset = datasets.load_dataset('hellaswag', split='validation')

    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx: int):
        x = self.dataset[idx]
        options = [f"\t{chr(ord('a')+i)}) {option}" for i, option in enumerate(x['endings'])]
        endings = '\n'.join(options)
        text = (f'{x["activity_label"]} {x["ctx"]}\n\n'
                f'{endings}\n\n'
                'Answer:\n\nThe correct answer is (')
        
        y = x['label']
        return text, int(y)
    

In [5]:
data = HellaSwag('validation')

In [6]:
def eval(model: LlamaModel, x):
    with torch.no_grad():
        tok_ids = model.tokenize(x)
        output = model.model(tok_ids.to(model.model.device))
        choices = model.tokenizer.convert_tokens_to_ids(['a','b','c','d'])
        probs = torch.softmax(output['logits'][:,-1,choices], dim=1)
    return probs.argmax().item(), probs.max().item()

eval(model, data[0][0])

(3, 0.9994550347328186)

In [7]:
def eval_dataset(model, data):
    gt, pred, prob = [], [], []
    for i in range(len(data)):
        x, y = data[i]
        res = eval(model, x)

        gt.append(y)
        pred.append(res[0])
        prob.append(res[1])
        # print(x)
        # print(f'Y: {y}, pred: {res}')


    df = pd.DataFrame({'y': gt, 'y_pred': pred, 'prop': prob})
    return df


df = eval_dataset(model, data)

In [16]:
df.describe()

Unnamed: 0,y,y_pred,prop
count,10042.0,10042.0,10042.0
mean,1.496415,1.2144,0.496284
std,1.1138,0.952272,0.153962
min,0.0,0.0,0.260565
25%,0.0,1.0,0.387699
50%,2.0,1.0,0.456338
75%,2.0,2.0,0.560352
max,3.0,3.0,0.999991


In [22]:

df_filt = df[df.prop > 0.6]
accuracy = (df_filt.y_pred == df_filt.y).sum() / len(df_filt)
print(f'Accuracy: {accuracy}')
print(df.dtypes)

mat = torch.zeros(4,4, dtype=int)
for i in range(len(df)):
    row = df.iloc[i]
    mat[int(row.y_pred), int(row.y)] += 1
mat

Accuracy: 0.6351909184726522
y           int64
y_pred      int64
prop      float64
dtype: object


tensor([[ 870,  382,  378,  439],
        [1231, 1714, 1278, 1166],
        [ 133,   95,  559,  159],
        [ 281,  294,  369,  694]])

In [15]:
recall = [mat[i, i] / mat[i, :].sum() for i in range(4)]
precision = [mat[i, i] / mat[:, i].sum() for i in range(4)]

mp = sum(precision) / 4
mr = sum(recall) / 4

f1 = 2 * (mp * mr) / (mp + mr)

precision, recall, f1

([tensor(0.3459), tensor(0.6897), tensor(0.2163), tensor(0.2823)],
 [tensor(0.4205), tensor(0.3181), tensor(0.5909), tensor(0.4237)],
 tensor(0.4091))

In [34]:
help(evaluate)

Help on package evaluate:

NAME
    evaluate

DESCRIPTION
    # flake8: noqa
    # Copyright 2020 The HuggingFace Evaluate Authors and the TensorFlow Datasets Authors.
    #
    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at
    #
    #     http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.

PACKAGE CONTENTS
    commands (package)
    config
    evaluation_suite (package)
    evaluator (package)
    hub
    info
    inspect
    loading
    module
    naming
    saving
    utils (package)
    visualization

SUBMODULES
    gra

In [22]:
print(model.generate(data[9][0]))

<s> Sharpening knives Then he takes a small stone from the flowing river and smashes it on another stone. He starts to crush the small stone to smaller pieces. he

	a) cuts the center stone in half and blow it on to make it bigger.
	b) grind it hard to make the pieces smaller.
	c) eventually brings it back into view and adds it to the smaller ones to make a small triangular shaped piece.
	d) starts to party with them and throw the pieces by hand while they celebrate.

Answer:

The correct answer is (b) grind it hard to make the pieces smaller.

Explanation:

The passage describes the man's process of sharpening knives using a river stone. He starts by taking a small stone from the river and smashing it on another stone. He then crushes the small stone into smaller pieces using a hard grinding motion. This process continues until the small stone is reduced to a small triangular-shaped piece.
There is no mention in the passage of the man cutting the center stone in half and blowing it on

In [25]:
eval(model, data[9][0])

(1, 0.7448536157608032)

In [24]:
dataset[9]

{'ind': 182,
 'activity_label': 'Sharpening knives',
 'ctx_a': 'Then he takes a small stone from the flowing river and smashes it on another stone. He starts to crush the small stone to smaller pieces.',
 'ctx_b': 'he',
 'ctx': 'Then he takes a small stone from the flowing river and smashes it on another stone. He starts to crush the small stone to smaller pieces. he',
 'endings': ['cuts the center stone in half and blow it on to make it bigger.',
  'grind it hard to make the pieces smaller.',
  'eventually brings it back into view and adds it to the smaller ones to make a small triangular shaped piece.',
  'starts to party with them and throw the pieces by hand while they celebrate.'],
 'source_id': 'activitynet~v_0bosp4-pyTM',
 'split': 'val',
 'split_type': 'zeroshot',
 'label': '1'}

In [104]:
labels = dataset[:]['label']

In [108]:
min(labels), max(labels)

('0', '3')

In [16]:
toks = model.tokenize(data[0][0])

x = torch.stack([toks, toks]).squeeze(1)
model.model(x)

CausalLMOutputWithPast(loss={'logits': tensor([[[ 0.1254, -0.7974,  0.4358,  ...,  1.3164,  1.6396,  0.3994],
         [-6.1523, -7.9961,  0.9722,  ..., -3.7051, -2.4199, -2.8086],
         [-7.8516, -9.1875,  1.2598,  ..., -6.2109, -4.0312, -2.7305],
         ...,
         [-2.1016, -4.6836,  7.5664,  ..., -0.8599, -0.2402,  0.0791],
         [ 2.1094, -0.0539,  8.1719,  ..., -2.3828, -0.4795,  0.5381],
         [-2.7832, -8.6406,  5.1133,  ..., -2.0625, -2.5273, -0.3167]],

        [[ 0.1254, -0.7974,  0.4358,  ...,  1.3164,  1.6396,  0.3994],
         [-6.1523, -7.9961,  0.9722,  ..., -3.7051, -2.4199, -2.8086],
         [-7.8516, -9.1875,  1.2598,  ..., -6.2109, -4.0312, -2.7305],
         ...,
         [-2.1016, -4.6836,  7.5664,  ..., -0.8599, -0.2402,  0.0791],
         [ 2.1094, -0.0539,  8.1719,  ..., -2.3828, -0.4795,  0.5381],
         [-2.7832, -8.6406,  5.1133,  ..., -2.0625, -2.5273, -0.3167]]],
       grad_fn=<ToCopyBackward0>), 'past_key_values': ((tensor([[[[-0.4629,  