In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from collections import Counter
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
import meerkat as mk
from domino import DominoSlicer
from classifer import *
from bootstrap_utils import *
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login



This file contains example code from the LLM experiments performed in this study.

In [None]:
# Error slices produced by the previous iteration of the auditing pipeline.
# The CSV location can be overridden with the RESULTS_CSV environment variable;
# otherwise the original path is used.
RESULTS_CSV = os.environ.get(
    "RESULTS_CSV",
    "/vol/bitbucket/yl28218/thesis/auditing_pipeline/results/results_corr.csv",
)
results = pd.read_csv(RESULTS_CSV)

# Separate the error slices based on the `id` column (ids 1..7).
# NOTE(review): the list below presumably names the slice type behind each id,
# in order -- confirm against the code that wrote results_corr.csv.
#types = ['slice_image_only', 'slice_image_text','slice_image_text_meta','slice_report_text','slice_metadata','slice_report_metadata','slice_image_metadata']
SLICE_IDS = range(1, 8)
slices = {slice_id: results[results['id'] == slice_id] for slice_id in SLICE_IDS}

# Both values are passed straight through to analyze_error_slice_tokens.
# NOTE(review): presumably k is the number of top tokens and max_features the
# vocabulary cap -- confirm against the helper's definition.
k = 5
max_features = 1000
tokens_by_slice = {
    slice_id: analyze_error_slice_tokens(slice_df, results, k, max_features)
    for slice_id, slice_df in slices.items()
}

# Keep the individual names that the later cells refer to.
slice_1, slice_2, slice_3, slice_4, slice_5, slice_6, slice_7 = (
    slices[slice_id] for slice_id in SLICE_IDS
)
(tokens_slice_1, tokens_slice_2, tokens_slice_3, tokens_slice_4,
 tokens_slice_5, tokens_slice_6, tokens_slice_7) = (
    tokens_by_slice[slice_id] for slice_id in SLICE_IDS
)
  

In [None]:
# Interpolated into the prompts below as "a {classifer_description} image classifier".
classifer_description = "Pneumothorax"
# NOTE(review): `data` is not referenced by any cell visible in this notebook --
# confirm whether it is still needed.
data = "Chest X-ray"

### Gemma

In [None]:
def init_model(model_id="google/gemma-2-2b-it", hf_token=None,
               hf_home="/vol/bitbucket/yl28218/hf_home",
               hf_cache="/vol/bitbucket/yl28218/hf_cache"):
    """Load a tokenizer and causal language model from the Hugging Face Hub.

    Args:
        model_id: Hub repository id of the model to load.
        hf_token: Access token forwarded to both ``from_pretrained`` calls;
            ``None`` forwards no explicit token.
        hf_home: Value written to the ``HF_HOME`` environment variable.
        hf_cache: Value written to ``TRANSFORMERS_CACHE`` and also passed as
            ``cache_dir`` to both ``from_pretrained`` calls.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Export the cache locations for anything that reads them later.
    # NOTE(review): transformers is already imported at the top of the notebook
    # and presumably resolves its default cache at import time, so the cache
    # directory is additionally passed explicitly via `cache_dir` below.
    os.environ["HF_HOME"] = hf_home
    os.environ["TRANSFORMERS_CACHE"] = hf_cache

    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        token=hf_token,
        cache_dir=hf_cache,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=hf_token,  # the original was missing the comma here (SyntaxError)
        cache_dir=hf_cache,
        device_map="auto",
        torch_dtype="auto",
    )
    return tokenizer, model


def generate_text(prompt, tokenizer, model,
                  max_new_tokens=200,
                  temperature=0.7,
                  top_p=0.9):
    """Sample one completion for ``prompt`` and return it as decoded text.

    Args:
        prompt: Text passed to ``tokenizer``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        model: Object exposing ``device`` and ``generate``.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        special tokens skipped.
        NOTE(review): for Hugging Face causal LMs this sequence usually also
        contains the prompt tokens -- confirm before post-processing.
    """
    # Tokenise the prompt and move the encoded inputs to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Sampling is always enabled; the caller only tunes its parameters.
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
    }
    # Only pass an end-of-sequence id when the tokenizer defines one.
    eos_id = tokenizer.eos_token_id
    if eos_id is not None:
        sampling_options["eos_token_id"] = eos_id

    generated = model.generate(**encoded, **sampling_options)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


In [None]:
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is stored in the notebook. If it is unset, None is
# passed on to init_model.
hf_token = os.environ.get("HF_TOKEN")
tokenizer, model = init_model(hf_token=hf_token)
# Example usage: ask for the most common features of slice 1 without any
# classifier context.
# NOTE(review): the prompt embeds the DataFrame's printed form, which pandas
# may truncate for long or wide frames -- confirm the model sees the full text.
print(generate_text(f"An  slice, {slice_1[['report_text', 'metadata_description']]}, was found using a slice discovery method.Please identify the 3 most common features  In the format of H1, H2, H3.Please be specific and concise in your response.", tokenizer, model))


In [None]:
# Gemma prompt with classifier context: the slice's report text and metadata
# description are embedded in their printed DataFrame form.
slice_1_context = slice_1[['report_text', 'metadata_description']]
input_text = f"""We are trying to audit a {classifer_description} image classifier.
An error slice, {slice_1_context}, was found using a slice discovery method.

Please identify the 3 most important features in this slice that may be causing the classifier to fail. In the format of H1, H2, H3. Please be specific and concise in your response."""
gemma_answer = generate_text(input_text, tokenizer, model)
print(gemma_answer)

In [None]:
# Gemma prompt with classifier context plus the token analysis for slice 1.
# The original cell closed the triple-quoted string with only two quotes,
# leaving it unterminated.
input_text_2 = f"""We are trying to audit a {classifer_description} image classifier.
An error slice, {slice_1[['report_text', 'metadata_description']]}, was found using a slice discovery method.
The tokens analysis also done to compare the token frequency in the error slice and the normal slice. They are ranked by the difference in frequency between the error slice and the normal slice.
The results are {tokens_slice_1}.
Please use the tokens found and the report text and metadata description to identify the 3 most important features in this slice that may be causing the classifier to fail.
Please identify the 3 most important features in this slice that may be causing the classifier to fail. In the format of H1, H2, H3.Please be specific and concise in your response."""
# Send the prompt, mirroring the preceding Gemma cell; the original cell built
# the prompt but never used it.
print(generate_text(input_text_2, tokenizer, model))

### DeepSeek

In [None]:
from openai import OpenAI

# The DeepSeek API key is read from the DEEPSEEK_API_KEY environment variable
# so that no credential is stored in the notebook.
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")
if not deepseek_api_key:
    raise RuntimeError(
        "Set the DEEPSEEK_API_KEY environment variable before running this cell."
    )
# DeepSeek is called through the OpenAI client pointed at DeepSeek's base URL.
client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")

In [None]:
# Baseline DeepSeek query: slice 1 contents only, no classifier context.
baseline_prompt = f"""

An  slice, {slice_1[['report_text', 'metadata_description']]}, was found using a slice discovery method.

Please identify the 3 most common features  In the format of H1, H2, H3.Please be specific and concise in your response.
"""
baseline_messages = [{"role": "user", "content": baseline_prompt}]

# Single-turn, non-streaming chat completion.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=baseline_messages,
    stream=False,
)

baseline_answer = response.choices[0].message.content
print("Response from DeepSeek:", baseline_answer)

In [None]:
# DeepSeek query with classifier context plus the first 10 rows of the token
# analysis for slice 1.
token_prompt = f"""
We are trying to audit a {classifer_description} image classifier.
An error slice, {slice_1[['report_text', 'metadata_description']]}, was found using a slice discovery method.
The tokens analysis also done to compare the token frequency in the error slice and the normal slice. They are ranked by the difference in frequency between the error slice and the normal slice.
The results are {tokens_slice_1.head(10)}.
Please use the tokens found and the report text and metadata description to identify the 3 most important features in this slice that may be causing the classifier to fail.

Please identify the 3 most important features in this slice that may be causing the classifier to fail. In the format of H1, H2, H3.Please be specific and concise in your response.
"""
token_messages = [{"role": "user", "content": token_prompt}]

# Single-turn, non-streaming chat completion.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=token_messages,
    stream=False,
)

token_answer = response.choices[0].message.content
print("Response from DeepSeek:", token_answer)