# Step 1 - Install the required dependencies and make sure the Python version is 3.10 or above

In [None]:
!pip install zenoml

In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install torch

In [None]:
# Report the interpreter the kernel itself runs on; `!python` would report
# whichever `python` happens to be first on the shell PATH.
import sys

print(sys.version)

# Step 2 - Load a dataset from Hugging Face

In [1]:
from datasets import load_dataset
import pandas as pd

# Fetch the TweetEval sentiment configuration and keep the first 500 rows
# of its test split as a DataFrame.
ds = load_dataset("cardiffnlp/tweet_eval", "sentiment")
test_split = ds['test']
df = pd.DataFrame(test_split).head(500)
df.head(5)

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,text,label
0,@user @user what do these '1/2 naked pics' hav...,1
1,OH: “I had a blue penis while I was this” [pla...,1
2,"@user @user That's coming, but I think the vic...",1
3,I think I may be finally in with the in crowd ...,2
4,"@user Wow,first Hugo Chavez and now Fidel Cast...",0


In [2]:
def label_map(x):
    """Translate a numeric sentiment class id into its name.

    0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'. Any other value is
    returned unchanged.
    """
    for class_id, name in enumerate(('negative', 'neutral', 'positive')):
        if x == class_id:
            return name
    return x
# Swap the numeric class ids in the label column for readable names.
label_names = df['label'].map(label_map)
df['label'] = label_names

# Step 3 - Run model inference

Warning: This step is going to download two models of ~500MB each. 

**If you don't want to download the models, you can jump to step 4 and use the provided data in the repo instead.**

### Run inference with roberta

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline backed by the RoBERTa sentiment checkpoint.
roberta_checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
pipe = pipeline("text-classification", model=roberta_checkpoint)

2025-03-18 20:58:59.972799: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-18 20:58:59.979400: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-18 20:58:59.987668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-18 20:58:59.990180: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-18 20:58:59.996612: I tensorflow/core/platform/cpu_feature_guar

In [4]:
import tqdm

texts = df['text'].to_list()

## Depending on your machine, this should take around 1 minute
# One pipeline call per tweet, with a progress bar over the whole list.
results = [pipe(tweet) for tweet in tqdm.tqdm(texts)]

100%|██████████| 500/500 [00:07<00:00, 69.53it/s]


In [5]:
# Keep the first prediction of every pipeline result and store its
# label and score as two new columns.
first_predictions = [result[0] for result in results]
df['roberta'] = [prediction['label'] for prediction in first_predictions]
df['roberta_score'] = [prediction['score'] for prediction in first_predictions]

### Run inference with gpt2

In [6]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline backed by the GPT-2 sentiment checkpoint.
gpt2_checkpoint = "LYTinn/finetuning-sentiment-model-tweet-gpt2"
pipe = pipeline("text-classification", model=gpt2_checkpoint)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [7]:
import tqdm

texts = df['text'].to_list()

## Depending on your machine, this should take around 1 minute
# One pipeline call per tweet, with a progress bar over the whole list.
results = [pipe(tweet) for tweet in tqdm.tqdm(texts)]

100%|██████████| 500/500 [00:07<00:00, 70.67it/s]


In [8]:
# Keep the first prediction of every pipeline result and store its
# label and score as two new columns.
first_predictions = [result[0] for result in results]
df['gpt2'] = [prediction['label'] for prediction in first_predictions]
df['gpt2_score'] = [prediction['score'] for prediction in first_predictions]

## map labels back
def label_map(x):
    """Translate a generic 'LABEL_<n>' tag into a sentiment name.

    'LABEL_0' -> 'negative', 'LABEL_1' -> 'neutral', 'LABEL_2' -> 'positive'.
    Any other value is returned unchanged.
    """
    sentiment_by_tag = (
        ('LABEL_0', 'negative'),
        ('LABEL_1', 'neutral'),
        ('LABEL_2', 'positive'),
    )
    return next((name for tag, name in sentiment_by_tag if x == tag), x)
# Rewrite the gpt2 column so it uses the same sentiment names as the labels.
gpt2_names = df['gpt2'].map(label_map)
df['gpt2'] = gpt2_names

# Step 4 - Pre-processing data and add additional columns

In [9]:
## If you skip the model inference, uncomment the code below and load the provided data

# df = pd.read_csv('tweets.csv')

In [9]:
# Length of each tweet's text, in characters.
tweet_lengths = df["text"].str.len()
df["input_length"] = tweet_lengths

In [13]:
import pandas as pd
from collections import Counter
import re

# Combine all text data
all_text = " ".join(df['text'].dropna())

# Tokenize: lowercase, then extract runs of word characters.
# Note: apostrophes act as separators, so contractions are split
# (e.g. "don't" -> "don", "t").
words = re.findall(r'\b\w+\b', all_text.lower())

# Count word occurrences
word_counts = Counter(words)

# Number of most frequent words to report
TOP_N_WORDS = 50

# Get the most common words as (word, count) pairs, most frequent first
top_words = word_counts.most_common(TOP_N_WORDS)

# Display results
print(top_words)


[('user', 342), ('the', 222), ('to', 173), ('of', 113), ('a', 109), ('and', 104), ('i', 101), ('s', 92), ('in', 88), ('is', 77), ('for', 73), ('t', 68), ('on', 51), ('it', 51), ('trump', 50), ('you', 50), ('that', 45), ('with', 44), ('this', 42), ('are', 38), ('not', 31), ('all', 29), ('they', 28), ('be', 26), ('my', 26), ('can', 26), ('have', 24), ('but', 24), ('as', 23), ('from', 22), ('right', 21), ('so', 21), ('like', 20), ('if', 20), ('he', 20), ('by', 19), ('we', 19), ('there', 19), ('what', 18), ('about', 18), ('co', 18), ('her', 18), ('was', 17), ('m', 17), ('me', 17), ('just', 17), ('get', 17), ('do', 16), ('will', 16), ('at', 16)]


# Step 5 - Start Zeno for interactive slicing

In this step, you need to create 5 slices in the Zeno interface and derive meaningful insights.

As a starting point, try to create the two slices we provide:

1. Tweets with hashtags
2. Tweets with strong positive words (e.g., love) -- you can determine the exact words

Creating slices in Zeno is straightforward: just click the '+' button for 'create a new slice', and you can define the slice using existing column attributes, with simple value matching or even regular expressions.

![image.png](images/image.png)

There are more fun features in Zeno, including interactive metadata & model comparison -- feel free to check out the teaser video in the [README](https://github.com/zeno-ml/zeno) of the Zeno repository.

In [10]:
## Execute the code here to start a local Zeno server

from zeno import zeno

from zeno.api import model, distill, metric
from zeno.api import ModelReturn, MetricReturn, DistillReturn, ZenoOptions

@model
def load_model(model_name):
    """Return a prediction function for the model called `model_name`.

    The returned function runs no inference: it reads the outputs from the
    DataFrame column whose name equals `model_name`.
    """

    def pred(df, ops: ZenoOptions):
        # The column named after the model is passed through as the model output.
        return ModelReturn(model_output=df[model_name])

    return pred

@distill
def label_match(df, ops: ZenoOptions):
    """Flag, row by row, whether the model output equals the label."""
    labels = df[ops.label_column]
    outputs = df[ops.output_column]
    is_match = labels == outputs
    return DistillReturn(distill_output=is_match.to_list())

@metric
def accuracy(df, ops: ZenoOptions):
    """Fraction of rows in `df` whose `label_match` distill value is true.

    Returns a metric of 0.0 for an empty `df`: the mean of an empty column
    is NaN, which is not a usable metric value.
    """
    if len(df) == 0:
        return MetricReturn(metric=0.0)
    matches = df[ops.distill_columns["label_match"]]
    # Cast to a built-in float so the metric is a plain Python number.
    return MetricReturn(metric=float(matches.mean()))

# Settings for the local Zeno server.
zeno_config = {
    "metadata": df,  # Pandas DataFrame with a row for each instance
    "view": "text-classification",  # The type of view for this data/task
    "data_column": "text",
    "label_column": "label",
    "functions": [load_model, label_match, accuracy],
    "models": ["roberta", "gpt2"],
    "port": 8231,
}

zeno(zeno_config)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1mZeno[0m running on http://localhost:8231
Running predistill functions

Running inference


Inference on roberta: 100%|██████████| 500/500 [00:00<00:00, 2843.74it/s]
Inference on gpt2: 100%|██████████| 500/500 [00:00<00:00, 2812.18it/s]


Running postdistill functions


postprocessing label_match on roberta: 100%|██████████| 500/500 [00:00<00:00, 2207.94it/s]
postprocessing label_match on gpt2: 100%|██████████| 500/500 [00:00<00:00, 2185.51it/s]


Done processing


  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/env_ai_deploy/lib/python3.12/site-packages/uvicorn/protocols/http/h11_impl.py", line 408, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/anaconda3/envs/env_ai_deploy/lib/python3.12/site-packages/uvicorn/middleware/proxy_headers.py", line 84, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ubuntu/anaconda3/envs/env_ai_deploy/lib/python3.12/site-packages/fastapi/applications.py", line 289, in __call__
    await super().__call__(sco

After running the code above, you should be able to access Zeno at http://localhost:8231


After successfully creating the two slices, come up with three *additional* slices you want to check and **create** the slices in the Zeno interface.

There are two directions to identify useful slices:
- Top-down: Think about what kinds of things the model can struggle with, and come up with some slices.
- Bottom-up: Look at model (mis-)predictions, come up with hypotheses, and translate them into data slices.

3. Has the word "user"
4. Has emojis: `[\U0001F600-\U0001F64F]`
5. Has the word "Trump"

In [16]:
## Write down descriptions of additional slices you created

# One description per additional slice created in the Zeno interface.
custom_slice_descriptions = ["Has the user word", "Has emojis", "Has Trump word"]

# Step 6 - Write down three additional data slices you want to create but do not have the metadata for slicing

In the previous step, you might have already come up with some slices you wanted to create but found it hard to do with existing metadata. Write down three such slices in this step.

Example: 
- I want to create a slice on tweets using slangs
- I want to create a slice on non-English tweets (if any)

In [17]:
## Write down three additional data slices here:

# Slices that cannot be built from the metadata currently available.
additional_slice_descriptions = [
    "I want to create a slice on tweets using urls",
    # Fixed wording: the original read "tweets using with high engagement".
    "I want to create a slice on tweets with high engagement",
    "I want to create a slice on tweets using specific hashtags",
]

# Step 7 - Generate more test cases with Large Language Models

Select one slice from the three you wrote down and generate **10 test cases** using LLMs, which can include average case, boundary case, or difficult case.

Your input can be in the following format:

> Examples:
> - OH: “I had a blue penis while I was this” [playing with Google Earth VR]
> - @user @user That’s coming, but I think the victims are going to be Medicaid recipients.
> - I think I may be finally in with the in crowd #mannequinchallenge  #grads2014 @user
> 
> Generate more tweets using slangs.

The first part, **Examples**, conditions the LLM on the style, length, and content of the examples. The second part, the **instruction**, tells the LLM what kind of examples you want it to generate.

Use our provided GPTs to start the task: [llm-based-test-case-generator](https://chatgpt.com/g/g-982cylVn2-llm-based-test-case-generator). If you do not have access to GPTs, use the plain ChatGPT or other LLM providers you have access to instead.

> Examples:
> - Check out this article on the latest in technology at nytimes.com
> - "Fascinating paper on AI published here: arxiv.org/abs/1234"
> - "Get the best deals on books at amazon.com!"
> 
> Generate more tweets using urls.

In [None]:
## The slice selected from the three written down in Step 6
slice_description = "I want to create a slice on tweets using urls"

## The ten test cases generated for that slice
generated_test_cases = [
    "Breaking news on climate change: bbc.com/news/science-environment-123456",
    "New study on black holes just dropped! Read here: nature.com/articles/abc123",
    "Flash sale on electronics! Don't miss out: bestbuy.com/deals",
    "10 tips for better productivity – must read! lifehacker.com/article-789",
    "Exciting NASA discovery about Mars: nasa.gov/mars-exploration",
    "Streaming now: the latest blockbuster movie! Watch at netflix.com/title/98765",
    "Learn how to invest like a pro: forbes.com/investing-guide",
    "Concert tickets selling fast! Grab yours at ticketmaster.com/event-456",
    "Delicious new recipes to try this weekend: foodnetwork.com/recipes",
    "Join the discussion on AI ethics: medium.com/ai-thoughts/ethics-in-ai",
]

