# Llama 2 on the ToxiGen Benchmark
This notebook reproduces the ToxiGen benchmark results for Llama-2-7b-chat.

In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import accelerate
import bitsandbytes  # Works with CUDA
import numpy as np
from tqdm import tqdm

# Pick the compute device: the first CUDA GPU when one is visible, else the CPU.
# (On Apple silicon, torch.device("mps") could be used instead when
# torch.backends.mps.is_built() is true.)
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the chat model and its tokenizer from the Hugging Face Hub.
# The Hub access token is read from the HF_API_KEY environment variable.
model_name = "meta-llama/Llama-2-7b-chat-hf"
access_token = os.environ["HF_API_KEY"]

# 4-bit NF4 quantization with bfloat16 compute and double quantization enabled.
# Reference: https://huggingface.co/docs/transformers/v4.33.2/en/main_classes/quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# In Colab, cache_dir can additionally be set to a folder in GDrive.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
    token=access_token,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    token=access_token,
)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

## Toxigen score

In [None]:
from transformers import pipeline

# Text-classification pipeline used to score generated text for toxicity:
# the tomh/toxigen_hatebert checkpoint paired with the bert-base-cased tokenizer.
toxigen_hatebert = pipeline(
    "text-classification",
    model="tomh/toxigen_hatebert",
    tokenizer="bert-base-cased",
)

In [None]:
# Decode only the generated continuation: drop the first `prompt_length` tokens
# (the number of input ids) from the first returned sequence.
prompt_length = model_inputs.input_ids.shape[1]
input_text = tokenizer.decode(output.sequences[0][prompt_length:], skip_special_tokens=True)
print("Text: ", input_text)

# The pipeline reports the top label together with that label's score, so the
# score is only the toxicity probability when the top label is the toxic class;
# otherwise the toxicity probability is the complement.
# NOTE(review): assumes "LABEL_1" is the toxic class of tomh/toxigen_hatebert
# and that the model has exactly two labels — confirm against the model card.
prediction = toxigen_hatebert(input_text)[0]
if prediction["label"] == "LABEL_1":
    toxic_score = prediction["score"]
else:
    toxic_score = 1.0 - prediction["score"]
prob = toxic_score * 100

print("\n------------------------------------------------------------------------------")
# Two decimal places ("{:.2f}"); the previous "{:2f}" only set a minimum width.
print("Probability the input text was toxic according to ToxiGen HateBERT: {:.2f}".format(prob))

## Toxigen

In [None]:
from datasets import load_dataset

# Load the human-study ("annotated") configuration of the ToxiGen dataset.
# The "train" configuration (about 250k training examples) is the alternative.
TG_annotations = load_dataset(
    "skg/toxigen-data",
    name="annotated",
    token=access_token,
)