# Lesson 3: Data leakage and toxicity

## Setup

In [None]:
import pandas as pd

In [None]:
# Do not truncate long cell text when pandas displays a DataFrame.
pd.set_option('display.max_colwidth', None)

In [None]:
import whylogs as why

In [None]:
import helpers

In [None]:
# Load the chat records from the CSV file in the working directory.
chats = pd.read_csv("./chats.csv")

In [None]:
# Preview a single row of the dataset (the slice 10:11).
chats[10:11]

## Data leakage 

### 1. Detect patterns

In [None]:
from langkit import regexes

**Note**: To view the next visuals, you may have to either hide the left-side menu bar or widen the notebook towards the right.

In [None]:
# Visualize the "prompt.has_patterns" metric for the chats dataset.
# NOTE(review): the metric is presumably made available by the
# `langkit.regexes` import earlier in the notebook — confirm in helpers.
helpers.visualize_langkit_metric(
    chats, 
    "prompt.has_patterns"
)

In [None]:
# Same visualization as for the prompts, this time for the
# "response.has_patterns" metric.
helpers.visualize_langkit_metric(
    chats, 
    "response.has_patterns"
)

In [None]:
# Display what the helper reports as critical queries for the
# "response.has_patterns" metric.
# NOTE(review): the selection criteria live in the helpers module, which is
# not visible here — confirm there.
helpers.show_langkit_critical_queries(
    chats, 
    "response.has_patterns"
)

In [None]:
from whylogs.experimental.core.udf_schema import udf_schema

In [None]:
# Apply the UDFs of the current UDF schema to the chats. Only the first
# element of the returned pair is kept; the second is discarded.
annotated_chats, _ = udf_schema().apply_udfs(chats)

In [None]:
# Preview the first five annotated rows.
annotated_chats.head(5)

In [None]:
# Keep the rows whose prompt or response has a non-null "has_patterns" value.
annotated_chats[
    annotated_chats[["prompt.has_patterns", "response.has_patterns"]]
    .notnull()
    .any(axis=1)
]

**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right.

In [None]:
# Evaluate, under the "leakage" scope, only the rows whose prompt or response
# has a non-null "has_patterns" value.
helpers.evaluate_examples(
    annotated_chats[
        annotated_chats["prompt.has_patterns"].notna()
        | annotated_chats["response.has_patterns"].notna()
    ],
    scope="leakage",
)

### 2. Entity recognition

In [None]:
from span_marker import SpanMarkerModel

In [None]:
# Load the pretrained SpanMarker model used for entity recognition in the
# cells below, identified by its model name.
entity_model = SpanMarkerModel.from_pretrained(
    "tomaarsen/span-marker-bert-tiny-fewnerd-coarse-super"
)

In [None]:
# Try the entity model on a sample sentence. The sentence is built from
# adjacent string literals: a backslash continuation inside a single literal
# would pull the next line's indentation into the text as extra spaces.
entity_model.predict(
    "Write an funny email subject to Bill Gates that "
    "describes a confidential product called Modelizer 900."
)

In [None]:
# Entity labels that the entity_leakage UDFs treat as potential leakage.
leakage_entities = [
    "person",
    "product",
    "organization",
]

In [None]:
from whylogs.experimental.core.udf_schema import register_dataset_udf

In [None]:
@register_dataset_udf(["prompt"],"prompt.entity_leakage")
def entity_leakage(text):
    """Label each prompt with the first sensitive entity type found in it.

    Every row of ``text`` has its "prompt" value run through
    ``entity_model``. The label of the first predicted entity whose label is
    listed in ``leakage_entities`` and whose score is above 0.25 is recorded
    for that row; rows without such an entity get ``None``.

    Returns:
        A list with one entry (a label or ``None``) per row, in row order.
    """
    first_labels = []
    for _, record in text.iterrows():
        found = None
        for candidate in entity_model.predict(record["prompt"]):
            label = candidate["label"]
            if label in leakage_entities and candidate["score"] > 0.25:
                # Only the first qualifying entity matters for this row.
                found = label
                break
        first_labels.append(found)
    return first_labels

In [None]:
# Call the UDF directly on the first five chats to inspect its output.
entity_leakage(chats.head(5))

In [None]:
@register_dataset_udf(["response"],"response.entity_leakage")
def entity_leakage(text):
    """Label each response with the first sensitive entity type found in it.

    Every row of ``text`` has its "response" value run through
    ``entity_model``. The label of the first predicted entity whose label is
    listed in ``leakage_entities`` and whose score is above 0.25 is recorded
    for that row; rows without such an entity get ``None``.

    NOTE(review): this reuses the function name of the prompt UDF, so the
    name ``entity_leakage`` refers to this response version from here on.
    The prompt UDF presumably stays registered under its own output name —
    confirm against the whylogs registry behaviour.

    Returns:
        A list with one entry (a label or ``None``) per row, in row order.
    """
    first_labels = []
    for _, record in text.iterrows():
        found = None
        for candidate in entity_model.predict(record["response"]):
            label = candidate["label"]
            if label in leakage_entities and candidate["score"] > 0.25:
                # Only the first qualifying entity matters for this row.
                found = label
                break
        first_labels.append(found)
    return first_labels

In [None]:
# Re-apply the UDF schema so the annotated data includes the entity_leakage
# columns used in the cells below; the second element of the returned pair
# is discarded.
annotated_chats, _ = udf_schema().apply_udfs(chats)

In [None]:
# Display what the helper reports as critical queries for the
# "prompt.entity_leakage" metric.
helpers.show_langkit_critical_queries(
    chats, 
    "prompt.entity_leakage")

In [None]:
# Keep the rows with a non-null value in any of the four leakage columns.
annotated_chats[
    annotated_chats[
        [
            "prompt.has_patterns",
            "response.has_patterns",
            "prompt.entity_leakage",
            "response.entity_leakage",
        ]
    ]
    .notnull()
    .any(axis=1)
]

**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right.

In [None]:
# Evaluate, under the "leakage" scope, only the rows with a non-null value in
# any of the pattern or entity leakage columns.
helpers.evaluate_examples(
    annotated_chats[
        annotated_chats["prompt.has_patterns"].notna()
        | annotated_chats["response.has_patterns"].notna()
        | annotated_chats["prompt.entity_leakage"].notna()
        | annotated_chats["response.entity_leakage"].notna()
    ],
    scope="leakage",
)

## Toxicity

In [None]:
from transformers import pipeline

In [None]:
# Build a text-classification pipeline from the "tomh/toxigen_hatebert" model,
# paired with the "bert-base-cased" tokenizer.
toxigen_hatebert = pipeline("text-classification", 
                            model="tomh/toxigen_hatebert", 
                            tokenizer="bert-base-cased")

In [None]:
# Try the classifier on two example sentences.
toxigen_hatebert(["Something non-toxic",
                  "A benign sentence, despite mentioning women."])

In [None]:
@register_dataset_udf(["prompt"],"prompt.implicit_toxicity")
def implicit_toxicity(text):
    """Score every prompt with the ``toxigen_hatebert`` classifier.

    All "prompt" values are sent to the classifier in one call. For each
    result, the last character of its "label" string is converted to an int.

    Returns:
        A list of ints, one per prompt, in input order.
    """
    prompts = text["prompt"].to_list()
    predictions = toxigen_hatebert(prompts)
    # Only the final character of the label carries the class digit.
    return [int(prediction["label"][-1]) for prediction in predictions]

In [None]:
# Display what the helper reports as critical queries for the
# "prompt.implicit_toxicity" metric.
# NOTE(review): annotated_chats was computed before the implicit_toxicity UDF
# was registered; presumably the helper applies the registered UDFs itself —
# confirm in the helpers module.
helpers.show_langkit_critical_queries(
    annotated_chats, 
    "prompt.implicit_toxicity")