# Step 1 - Install the required dependencies and make sure the Python version is 3.10 or above

In [None]:
!pip install zenoml

In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install torch

In [1]:
# Report the version of the interpreter that backs this kernel.
# `!python --version` queries whichever `python` is first on PATH, which is
# not necessarily the interpreter the notebook is running on.
import platform

print(f"Python {platform.python_version()}")

Python 3.11.10


# Step 2 - Load a dataset from Hugging Face

In [2]:
from datasets import load_dataset
import pandas as pd

# Number of rows taken from the start of the test split.
N_EXAMPLES = 500

ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
# head() returns a slice of the full frame; copy it so the columns added in
# later cells are written to an independent DataFrame.
df = pd.DataFrame(ds['test']).head(N_EXAMPLES).copy()
df.head(5)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 45615/45615 [00:00<00:00, 1392403.31 examples/s]
Generating test split: 100%|██████████| 12284/12284 [00:00<00:00, 3039874.35 examples/s]
Generating validation split: 100%|██████████| 2000/2000 [00:00<00:00, 826138.27 examples/s]


Unnamed: 0,text,label
0,@user @user what do these '1/2 naked pics' hav...,1
1,OH: “I had a blue penis while I was this” [pla...,1
2,"@user @user That's coming, but I think the vic...",1
3,I think I may be finally in with the in crowd ...,2
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",0


In [4]:
def label_map(x):
    """Translate an integer sentiment class id into its name.

    0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'. Any other value is
    returned unchanged.
    """
    for class_id, name in ((0, 'negative'), (1, 'neutral'), (2, 'positive')):
        if x == class_id:
            return name
    return x
# Swap the integer ids in the label column for their sentiment names.
sentiment_names = df['label'].map(label_map)
df['label'] = sentiment_names
df

Unnamed: 0,text,label
0,@user @user what do these '1/2 naked pics' hav...,neutral
1,OH: “I had a blue penis while I was this” [pla...,neutral
2,"@user @user That's coming, but I think the vic...",neutral
3,I think I may be finally in with the in crowd ...,positive
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",negative
...,...,...
495,Donnie is gonna deport the Menendez Brothers r...,negative
496,Still reading #SettleForMore @user #fridayreads,neutral
497,#Chocolate cupcake #candle melting with its sw...,positive
498,Is this leaf ?Can I eat ?Open the leaf!Oh! Thi...,neutral


# Step 3 - Run model inference

Warning: This step is going to download two models of ~500MB each. 

**If you don't want to download the models, you can jump to step 4 and use the provided data in the repo instead.**

### Run inference with roberta

In [5]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline backed by the Cardiff NLP Twitter RoBERTa checkpoint.
pipe = pipeline(
    task="text-classification",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


In [6]:
import tqdm

# Classify the tweets one at a time, keeping each raw pipeline result.
# Depending on your machine, this should take around 1 minute.
texts = df['text'].to_list()
results = [pipe(text) for text in tqdm.tqdm(texts)]

100%|██████████| 500/500 [00:47<00:00, 10.49it/s]


In [8]:
# Each result is indexable; only its first entry is kept for every tweet.
top_predictions = [result[0] for result in results]
df['roberta'] = [pred['label'] for pred in top_predictions]
df['roberta_score'] = [pred['score'] for pred in top_predictions]
df

Unnamed: 0,text,label,roberta,roberta_score
0,@user @user what do these '1/2 naked pics' hav...,neutral,negative,0.804726
1,OH: “I had a blue penis while I was this” [pla...,neutral,neutral,0.866949
2,"@user @user That's coming, but I think the vic...",neutral,neutral,0.763725
3,I think I may be finally in with the in crowd ...,positive,positive,0.774047
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",negative,neutral,0.416397
...,...,...,...,...
495,Donnie is gonna deport the Menendez Brothers r...,negative,negative,0.915344
496,Still reading #SettleForMore @user #fridayreads,neutral,neutral,0.797208
497,#Chocolate cupcake #candle melting with its sw...,positive,positive,0.951857
498,Is this leaf ?Can I eat ?Open the leaf!Oh! Thi...,neutral,positive,0.664861


### Run inference with gpt2

In [10]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Rebind `pipe` to the GPT-2 checkpoint fine-tuned for tweet sentiment.
pipe = pipeline(
    task="text-classification",
    model="LYTinn/finetuning-sentiment-model-tweet-gpt2",
)

Device set to use mps:0


In [11]:
import tqdm

# Classify the tweets one at a time, keeping each raw pipeline result.
# Depending on your machine, this should take around 1 minute.
texts = df['text'].to_list()
results = [pipe(text) for text in tqdm.tqdm(texts)]

100%|██████████| 500/500 [00:26<00:00, 18.88it/s]


In [12]:
# Each result is indexable; only its first entry is kept for every tweet.
top_predictions = [result[0] for result in results]
df['gpt2'] = [pred['label'] for pred in top_predictions]
df['gpt2_score'] = [pred['score'] for pred in top_predictions]

## map labels back
def label_map(x):
    """Translate a generic 'LABEL_<n>' tag into a sentiment name.

    'LABEL_0' -> 'negative', 'LABEL_1' -> 'neutral', 'LABEL_2' -> 'positive'.
    Any other value is returned unchanged.

    Note: this definition replaces the earlier integer-id `label_map` in the
    notebook namespace.
    """
    tag_names = (
        ('LABEL_0', 'negative'),
        ('LABEL_1', 'neutral'),
        ('LABEL_2', 'positive'),
    )
    for tag, name in tag_names:
        if x == tag:
            return name
    return x
# Swap the generic LABEL_<n> tags in the gpt2 column for sentiment names.
gpt2_names = df['gpt2'].map(label_map)
df['gpt2'] = gpt2_names

# Step 4 - Pre-processing data and add additional columns

In [14]:
## If you skip the model inference, uncomment the code below and load the provided data

# df = pd.read_csv('tweets.csv')

In [15]:
# Length of each tweet's text, stored as an extra column of the frame.
text_lengths = df["text"].str.len()
df["input_length"] = text_lengths

# Step 5 - Start Zeno for interactive slicing

In this step, you need to create 5 slices in the Zeno interface and derive meaningful insights.

As a starting point, try to create the two slices we provide:

1. Tweets with hashtags
2. Tweets with strong positive words (e.g., love) -- you can determine the exact words

Creating slices in Zeno is straightforward: just click the '+' button for 'create a new slice', and you can define the slice using existing column attributes, with simple value matching or even regular expressions.

![image.png](images/image.png)

There are more fun features in Zeno, including interactive metadata & model comparison -- feel free to check out the teaser video in the [README](https://github.com/zeno-ml/zeno) of the Zeno repository.

In [None]:
## Execute the code here to start a local Zeno server

from zeno import zeno

from zeno.api import model, distill, metric
from zeno.api import ModelReturn, MetricReturn, DistillReturn, ZenoOptions

@model
def load_model(model_name):
    """Build a prediction function for the model called `model_name`.

    No inference is run here: the returned function reads the values stored
    in the DataFrame column named `model_name`.
    """

    def pred(df, ops: ZenoOptions):
        return ModelReturn(model_output=df[model_name])

    return pred

@distill
def label_match(df, ops: ZenoOptions):
    """Mark, row by row, whether the output column equals the label column."""
    is_match = df[ops.label_column] == df[ops.output_column]
    return DistillReturn(distill_output=is_match.to_list())

@metric
def accuracy(df, ops: ZenoOptions):
    """Return the mean of the `label_match` column, or 0.0 when that mean is NaN."""
    match_rate = df[ops.distill_columns["label_match"]].mean()
    return MetricReturn(metric=0.0 if pd.isna(match_rate) else match_rate)

# Settings handed to Zeno in a single dict.
zeno_config = {
    # Pandas DataFrame with a row for each instance
    "metadata": df,
    # The type of view for this data/task
    "view": "text-classification",
    "data_column": "text",
    "label_column": "label",
    "functions": [load_model, label_match, accuracy],
    "models": ["roberta", "gpt2"],
    "port": 8231,
}
zeno(zeno_config)




python(48028) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.



[1mZeno[0m running on http://localhost:8231
Running predistill functions

Running inference
Running postdistill functions
Done processing


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(f

After running the code above, you should be able to access Zeno at http://localhost:8231


After successfully creating the two slices, come up with three *additional* slices you want to check and **create** the slices in the Zeno interface.

There are two directions to identify useful slices:
- Top-down: Think about what kinds of things the model can struggle with, and come up with some slices.
- Bottom-up: Look at model (mis-)predictions, come up with hypotheses, and translate them into data slices.

3. Tweets without mentions
4. Tweets with URLs
5. Tweets with questions

In [22]:
## Write down descriptions of additional slices you created

# Each description is preceded by the Zeno filter that was used to build the slice.
custom_slice_descriptions = [
    # text not match (regex) @\w+
    "Tweets without mentions",
    # text match (case) (regex) http[s]?://\S+
    "Tweets containing URLs",
    # text match (regex) \?
    "Tweets with questions",
]

# Step 6 - Write down three additional data slices you want to create but do not have the metadata for slicing

In the previous step, you might have already come up with some slices you wanted to create but found it hard to do with existing metadata. Write down three such slices in this step.

Example: 
- I want to create a slice on tweets using slang
- I want to create a slice on non-English tweets (if any)

In [None]:
## Write down three additional data slices here:

# Slices that cannot be built from the existing metadata columns.
additional_slice_descriptions = [
    "I want to create a slice on tweets using Nepali devanagari script",
    "I want to create a slice on tweets using sarcasm",
    "I want to create a slice on tweets with political or social commentary",
]

# Step 7 - Generate more test cases with Large Language Models

Select one slice from the three you wrote down and generate **10 test cases** using LLMs, which can include average cases, boundary cases, or difficult cases.

Your input can be in the following format:

> Examples:
> - OH: “I had a blue penis while I was this” [playing with Google Earth VR]
> - @user @user That’s coming, but I think the victims are going to be Medicaid recipients.
> - I think I may be finally in with the in crowd #mannequinchallenge  #grads2014 @user
> 
> Generate more tweets using slang.

The first part, **Examples**, conditions the LLM on the style, length, and content of the examples. The second part, **Instructions**, tells the LLM what kind of examples you want it to generate.

Use our provided GPTs to start the task: [llm-based-test-case-generator](https://chatgpt.com/g/g-982cylVn2-llm-based-test-case-generator). If you do not have access to GPTs, use the plain ChatGPT or other LLM providers you have access to instead.

In [None]:
## Write down the slice you select

# The slice chosen for LLM-based test case generation.
slice_description = (
    "I want to create a slice on tweets with political or social commentary"
)

## Write down all generated test cases here

# slice_description = "I want to create a slice on tweets using sarcasm"
# generated_test_cases = [
# 'Wow, what a surprise! Another celebrity has a “totally unexpected” PR scandal. Didn’t see that coming at all.',
# 'Absolutely love when websites ask me to accept cookies. Like, of course! I’d love to trade my privacy for slightly more relevant ads.',
# 'Oh, you’re a morning person? Must be nice to wake up and immediately feel like a functioning human being.',
# 'Love how my WiFi works perfectly—except when I actually need it for something important.',
# 'Nothing beats the thrill of getting paid just to watch it disappear into bills 30 seconds later!',
# '"Drink more water!" Oh sure, let me just chug a gallon and suddenly become a superhuman. Thanks for the life-changing advice!',
# 'Wow, my boss just sent an email at 11 PM. Such a great reminder that work-life balance is alive and well!',
# 'Love how customer service puts me on hold just so I can vibe to the same 10-second music loop for half an hour.',
# 'Ah, another influencer teaching me how to be rich by selling a course on how to be rich. Truly groundbreaking stuff.',
# 'Nothing makes me feel more confident than a self-checkout machine telling me I “need assistance” for the 5th time in a row.'
# ]

# Ten LLM-generated tweets for the selected slice.
generated_test_cases = [
    "Amazing how there’s always “not enough budget” for healthcare and education, but there’s unlimited money for war.",
    "@user So we can ban books, but not assault rifles? Makes total sense.",
    "Nothing says “democracy” like billionaires funding election campaigns and writing laws in their free time.",
    "Raise wages? No way! But sure, let’s give corporations another tax cut and hope for the best.",
    "Love how politicians suddenly remember the working class exists—right before an election.",
    "Ah yes, “thoughts and prayers.” The most effective policy for every crisis since forever.",
    "Crazy how some people are more upset about property damage than actual human lives.",
    "If healthcare is a human right everywhere else, why is it a luxury in the U.S.?",
    "Breaking news: Another company that underpays workers just posted record profits. Shocking!",
    "Weird how people against “cancel culture” are always the first to boycott things they don’t like.",
]