# Zero Shot Baseline
A basic setup script for running zero shot inference on three datasets

The dataset on blogs is fairness relevant, as it can discriminate and use harmful features in inference

The dataset on emails is also safety relevant, as it is about spam filtering

In [1]:
import torch
import pandas as pd
import numpy as np
import transformers
import torch
import re
import time
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from transformers import pipeline

from funcs import make_prompt, pre_process_df, extract_answer

In [2]:
access_token='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'
model_type = 'google/gemma-2-9b-it'
device = 'cuda:3'  'auto'
device = 'auto'

tokenizer = AutoTokenizer.from_pretrained(model_type,
                                          token=access_token,
                                            )
model = AutoModelForCausalLM.from_pretrained(model_type,
                                             device_map=device,
                                             token=access_token,
                                             torch_dtype=torch.float16,  # Use half precision if possible
                                            )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

In [2]:
dataset = 'imdb'  # 'emails' 'imdb'

In [4]:
df = pd.read_csv('datasets/'+dataset+'.csv')
df = pre_process_df(df, dataset)
df.shape

(50000, 2)

In [5]:
df.sample(5)

Unnamed: 0,text,label
1605,Another Indian legend you never heard of befor...,negative
22440,I felt Rancid Aluminium was a complete waste o...,negative
4952,"I love to see a female protagonist, in this mo...",positive
16375,"In a performance both volatile and graceful, A...",positive
8793,This movie really surprised me. I had my doubt...,positive


## Run Inference

In [6]:
# Set pad token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

df_result = pd.DataFrame(columns=['label', 'prediction', 'llm_response', 'text', 'index'])


start_time = time.time()

# Generate responses
with torch.no_grad():

    for idx, row in df.sample(5, random_state=42).iterrows():

        # Change this for different datasets
        text     = row.text
        label    = row.label
        
        messages = make_prompt(text, dataset)
        encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
        
        # Create attention mask
        attention_mask = encodeds.ne(tokenizer.pad_token_id).long() if tokenizer.pad_token_id is not None else torch.ones_like(encodeds).long()
        
        encodeds = encodeds.to(model.device)
        attention_mask = attention_mask.to(model.device)
        
        generated_ids = model.generate(
            encodeds,
            attention_mask=attention_mask,
            max_new_tokens=2000,
            do_sample=True,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )

        response = tokenizer.batch_decode(generated_ids)[0]
        prediction = extract_answer(response.split('<end_of_turn>')[-2])
            
        print(response.split('[/INST]')[-1])
        print('\n\n\n =====================')
    
        # Create a list with these values
        data = [label, prediction, response, text, idx]
        
        # Append the list as a new row to the DataFrame
        df_result.loc[len(df_result)] = data
    
        # df_result.to_csv('data/zero_shot_results_'+dataset+'.csv')

print("time taken:", time.time() - start_time)


<bos><start_of_turn>user
You are an AI assistant and you are very good at doing sentiment classification of movie reviews.
        Your task is to carefully read a movie review and then classify if the reviewer thinks negaitvely or positively about it.
        
        You are only allowed to choose one of the following two categories in your classification: 
        1. negative
        2. positive
    
        Now, consider the following: 

        **Paragraph starting now**
        
        I really liked this Summerslam due to the look of the arena, the curtains and just the look overall was interesting to me for some reason. Anyways, this could have been one of the best Summerslam's ever if the WWF didn't have Lex Luger in the main event against Yokozuna, now for it's time it was ok to have a huge fat man vs a strong man but I'm glad times have changed. It was a terrible main event just like every match Luger is in is terrible. Other matches on the card were Razor Ramon vs Ted Dibi

In [10]:
# 2b 83  sec
# 9b 148 sec

83.57818603515625

In [11]:
df_result

Unnamed: 0,label,prediction,llm_response,text,index
0,positive,negative,<bos><start_of_turn>user\nYou are an AI assist...,I really liked this Summerslam due to the look...,33553
1,positive,positive,<bos><start_of_turn>user\nYou are an AI assist...,Not many television shows appeal to quite as m...,9427
2,negative,negative,<bos><start_of_turn>user\nYou are an AI assist...,The film quickly gets to a major chase scene w...,199
3,positive,positive,<bos><start_of_turn>user\nYou are an AI assist...,Jane Austen would definitely approve of this o...,12447
4,negative,negative,<bos><start_of_turn>user\nYou are an AI assist...,Expectations were somewhat high for me when I ...,39489


In [13]:
response.split('<end_of_turn>')[-2]

'\n    The reviewer starts by expressing high expectations for the movie due to Steve Carell\'s past successes. However, they immediately state they were "wrong," setting a negative tone.\n\n    The reviewer acknowledges a few positive aspects: Carell\'s comedic ability shines in specific moments. However, they emphasize that even these moments can\'t salvage the overall "train-wreck" of a film.\n\n    The criticism intensifies as the reviewer dives into the plot. They berate the family dynamics, calling them unrealistic and obnoxious caricatures. They use harsh language like "awful," "sickening," and "horrifyingly wrong" to describe various elements of the film.\n\n    The review culminates in scathing remarks about an important character, comparing her portrayal to a "stroke victim" and "mildly retarded." The reviewer concludes by expressing bewilderment at the movie, calling it "not much."\n\n\n\n    <answer>negative</answer> \n'

In [9]:
# print(df_result.llm_response[0])

## Evaluate Inference

In [69]:
dataset = 'imdb'    # 0.9511677282377919
dataset = 'emails'  # 0.7936170212765957
dataset = 'blogs'   # 0.7245179063360881

In [70]:
result = pd.read_csv('data/zero_shot_results_'+dataset+'.csv')

In [71]:
result.prediction.value_counts()

prediction
male                                                        207
female                                                      156
Cannot be determined                                         12
cannot be determined                                          7
unknown                                                       4
inconclusive                                                  4
cannot determine                                              4
Unable to determine                                           3
Cannot Determine                                              3
unclear                                                       3
Cannot determine                                              2
Cannot Classify                                               2
uncertain                                                     2
I cannot answer                                               1
undecided                                                     1
Unable to classify           

In [72]:
print(result.shape)
result = result.dropna()
print(result.shape)

(500, 6)
(423, 6)


In [73]:
print(result.shape)
if dataset == 'imdb':
    result = result[result['prediction'].isin(['positive', 'negative'])]
if dataset == 'emails':
    result = result[result['prediction'].isin(['normal', 'spam'])]
if dataset == 'blogs':
    result = result[result['prediction'].isin(['male', 'female'])]

print(result.shape)

(423, 6)
(363, 6)


In [74]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [75]:
preds = result.prediction.values
labels = result.label.values

In [76]:
accuracy_score(preds, labels)

0.7245179063360881

In [42]:
result.prediction.value_counts()

prediction
male      207
female    156
Name: count, dtype: int64

In [43]:
confusion_matrix(preds, labels)

array([[116,  40],
       [ 60, 147]])

In [28]:
result.prediction.value_counts()

prediction
male                                                        207
female                                                      156
Cannot be determined                                         12
cannot be determined                                          7
unknown                                                       4
inconclusive                                                  4
cannot determine                                              4
Unable to determine                                           3
Cannot Determine                                              3
unclear                                                       3
Cannot determine                                              2
Cannot Classify                                               2
uncertain                                                     2
I cannot answer                                               1
undecided                                                     1
Unable to classify           