In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Toxicity Detector by Meta
This notebook demonstrates how to use Meta's toxicity (hate-speech) classification model.

Some parts of the code are adapted from the original source: https://github.com/aws-samples/data-science-on-aws/blob/main/05b_RLHF_Fine_Tune_Model_to_Detoxify_Summaries.ipynb
<!--table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google-gemini/gemma-cookbook/blob/main/Gemma/Gemma_Basics_with_HF.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table-->


Adapted for EECE.4860/5860 at UMass Lowell

## Prerequisites 

### Account on Intel Tiber AI Cloud (or run on your local GPU if available)

You will need a standard account on Intel Tiber AI Cloud, where we have tested this notebook. Students have been given instructions on how to sign up for an account on Intel Tiber.

Once you open this notebook on Tiber cloud, make sure to select the **PyTorch 2.7** kernel to run it.

### HuggingFace setup

Before we dive into the tutorial, let's get you set up with HuggingFace:

1. **Hugging Face Account:**  If you don't already have one, you can create a free Hugging Face account by clicking [here](https://huggingface.co/join).
2. **LLM Model Access:** Head over to the [Gemma model page](https://huggingface.co/google/gemma-2b) and the [Llama 2 model page](https://huggingface.co/meta-llama/Llama-2-7b-hf) and accept the usage conditions.
3. **Hugging Face Token:**  You need to create a token on Hugging Face and use it to log in from this notebook. Once you are logged in, you can download the models. Check [this guide](https://huggingface.co/docs/hub/en/security-tokens) on how to create a token on HF. Generate a Hugging Face access token (preferably with `write` permission) by clicking [here](https://huggingface.co/settings/tokens). **Save the token in a safe document that you can access**. Once you've completed these steps, you're ready to move on to the next section, where we'll install the necessary packages and log into the Hugging Face Hub.


### Import Necessary Packages


In [None]:
# import necessary packages
import os
import sys
import torch

### Install dependencies
Run the cell below to install all the required dependencies.

In [None]:
!{sys.executable} -m pip install --upgrade -q transformers huggingface_hub peft \
  accelerate bitsandbytes datasets trl ipywidgets evaluate

### Log into Hugging Face Hub


In [None]:
# Authenticate with the Hugging Face Hub.
#
# If the HF_TOKEN environment variable is set, log in with it so the notebook
# can be re-run top to bottom without pasting the token again. Otherwise fall
# back to the interactive input box rendered in the notebook.
from huggingface_hub import login, notebook_login

hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    # Never print or display the token itself.
    login(token=hf_token)
else:
    notebook_login()

**If there is no error in the previous step, you are all set and ready to explore the possibilities with LLM models!**


**You need to click the next cell to proceed**

## Instantiate the Roberta-hate-speech-dynabench-r4-target model 


### Setup Device

In [None]:
# Set HF_HOME for this kernel process so Hugging Face files are cached under
# /opt/notebooks/.cache/huggingface.
# NOTE(review): huggingface_hub was already imported by the login cell above,
# and the library presumably resolves HF_HOME when it is first imported --
# confirm this setting actually takes effect here, or run it before the login cell.
%env HF_HOME=/opt/notebooks/.cache/huggingface

In [None]:
# Set to True to force CPU execution even when an accelerator is present.
USE_CPU = False


def select_device(force_cpu: bool = False) -> str:
    """Return the torch device string to run the model on.

    Preference order: Intel XPU, then CUDA, then CPU.

    Args:
        force_cpu: When True, always return "cpu" regardless of what
            accelerators are available.

    Returns:
        One of "xpu:0", "cuda:0" or "cpu".
    """
    if force_cpu:
        return "cpu"
    # `torch.xpu` is not present in every PyTorch build; guard the attribute
    # access so this cell does not raise AttributeError on those builds.
    xpu_backend = getattr(torch, "xpu", None)
    if xpu_backend is not None and xpu_backend.is_available():
        return "xpu:0"
    # Fall back to a local NVIDIA GPU when one is available.
    if torch.cuda.is_available():
        return "cuda:0"
    return "cpu"


device = select_device(USE_CPU)
print(f"using device: {device}")

### Loading the model from HF Hub

In [None]:
# Load the tokenizer and the sequence-classification model from the HF Hub.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

# A tokenizer holds no weights, so there is nothing to place on a device.
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)

# Place the model explicitly on the device selected in the setup cell.
# `device_map="auto"` would pick a device on its own, ignoring USE_CPU, while
# the evaluation cells move their inputs to `device` -- a mismatch between the
# two raises a device error at inference time.
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name).to(device)

# Show which class index maps to which label.
print(toxicity_model.config.id2label)

### Evaluate the prompts

In [None]:
non_toxic_text = "You are a great person and I like you."

# Move the inputs to wherever the model's weights actually live, so the forward
# pass cannot fail with a device mismatch.
toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids.to(toxicity_model.device)

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logit for "not hate" - this is the reward!
# Index 0 is assumed to be the "not hate" class; check it against the id2label
# mapping printed when the model was loaded.
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')


In [None]:
toxic_text = "You are disgusting and terrible and i damn hate you"

# Move the inputs to wherever the model's weights actually live, so the forward
# pass cannot fail with a device mismatch.
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids.to(toxicity_model.device)

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logit for "not hate" - this is the reward!
# `not_hate_index` is defined in the previous evaluation cell.
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (low): {nothate_reward}')

### Evaluate the Toxicity

In [None]:
import evaluate

# Build the "toxicity" measurement on top of the same checkpoint used above;
# the class labelled "hate" is the one treated as toxic.
toxicity_evaluator = evaluate.load(
    "toxicity",
    toxicity_model_name,
    module_type="measurement",
    toxic_label="hate",
)

In [None]:
# Score each sample on its own and print the result under its heading.
# `toxicity_score` ends up holding the result for the last sample (toxic text).
for heading, sample_text in (
    ("Toxicity score for non-toxic text:", non_toxic_text),
    ("\nToxicity score for toxic text:", toxic_text),
):
    toxicity_score = toxicity_evaluator.compute(predictions=[sample_text])
    print(heading)
    print(toxicity_score["toxicity"])