In [1]:
!unzip /content/anli_v1.0.zip

Archive:  /content/anli_v1.0.zip
   creating: anli_v1.0/
  inflating: anli_v1.0/README.txt    
   creating: anli_v1.0/R3/
  inflating: anli_v1.0/R3/dev.jsonl  
  inflating: anli_v1.0/R3/train.jsonl  
  inflating: anli_v1.0/R3/test.jsonl  
   creating: anli_v1.0/R2/
  inflating: anli_v1.0/R2/dev.jsonl  
  inflating: anli_v1.0/R2/train.jsonl  
  inflating: anli_v1.0/R2/test.jsonl  
   creating: anli_v1.0/R1/
  inflating: anli_v1.0/R1/dev.jsonl  
  inflating: anli_v1.0/R1/train.jsonl  
  inflating: anli_v1.0/R1/test.jsonl  


In [None]:
!pip install -q -U google-generativeai

In [2]:
import pandas as pd
import numpy as np
import json

from tqdm import tqdm

import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  """Wrap `text` in a Markdown block quote for rich display.

  Every '•' is replaced by '  *' and every line, blank ones included, is
  prefixed with '> ' before the result is handed to `Markdown`.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _: True)
  return Markdown(quoted)

from google.colab import userdata

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

from sklearn.metrics import accuracy_score

In [3]:
def pack_to_df(ds_path):
  """Load a JSON Lines file into a DataFrame with one row per record.

  Args:
    ds_path: path to a UTF-8 encoded file holding one JSON object per line.

  Returns:
    A pandas DataFrame built from the decoded objects, in file order.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
  """
  records = []
  with open(ds_path, encoding='utf-8') as f:
    # Iterate over the file instead of calling str.splitlines(): splitlines()
    # also breaks on characters such as U+2028/U+2029, which JSON allows
    # unescaped inside string values, so one record could be cut in two.
    for line in f:
      # Blank lines carry no record and would make json.loads raise.
      if line.strip():
        records.append(json.loads(line))
  return pd.DataFrame(records)

In [4]:
dataset_r1 = pack_to_df('anli_v1.0/R1/test.jsonl')

In [5]:
dataset_r2 = pack_to_df('anli_v1.0/R2/test.jsonl')

In [6]:
dataset_r3 = pack_to_df('anli_v1.0/R3/test.jsonl')

In [None]:
# NOTE(review): manual in-place override of the hypothesis stored at index 392
# of the R1 test frame. The notebook does not record why -- presumably the
# original text was malformed or made the request fail; confirm and document
# the reason, since this changes the evaluated data.
dataset_r1.loc[392, 'hypothesis'] = 'Kaspars Roga is a drummer.'

In [None]:
model = genai.GenerativeModel('gemini-1.0-pro-latest')

In [7]:
def encode(r):
    """Map one raw model response to a label code.

    Args:
      r: the response text, or a missing value (None, NaN, ...).

    Returns:
      'none' if `r` is not a string (no usable response),
      'e' if the text contains 'True',
      'c' if it contains 'False' (and not 'True'),
      'n' otherwise.
    """
    # Missing cells can surface as None or as float NaN depending on how the
    # responses were assembled; `'True' in nan` would raise TypeError.
    if not isinstance(r, str):
        return 'none'
    # 'True' is checked first, so a response mentioning both words is 'e'.
    if 'True' in r:
        return 'e'
    if 'False' in r:
        return 'c'
    return 'n'

In [8]:
def encode_and_count_accuracy(responses, ds):
  """Print overall and per-label accuracy of raw model responses.

  Args:
    responses: sequence of raw responses (strings or None), one per row of
      `ds`, in row order.
    ds: DataFrame with a `label` column holding 'e', 'c' or 'n'.

  Prints the total accuracy followed by one line per label.
  """
  # Reuse the shared response-to-code mapping instead of a second copy of it.
  responses_encoded = np.array([encode(r) for r in responses])
  labels = ds['label'].to_numpy()

  print('Total accuracy: ', accuracy_score(labels, responses_encoded))

  for v in ['e', 'c', 'n']:
    # Select rows by position with a boolean mask; using ds.index labels as
    # array positions is only correct for a default 0..n-1 index.
    mask = labels == v
    print(f"Label: {v} {accuracy_score(labels[mask], responses_encoded[mask])}")

In [9]:
class BaseTest:
  """Runs one prompting strategy over a dataset and caches the responses.

  Responses are appended to `fname` as they arrive, each followed by
  `SEPARATOR`, so an interrupted run can be resumed with `get_responses()`
  followed by `prompting()`. A row without a usable response is stored as an
  empty record and read back as None, which keeps record N aligned with
  dataset row N.

  NOTE: a response that itself contains `SEPARATOR` would corrupt the cache.
  """

  SEPARATOR = '₩'

  def __init__(self, fname, ds, pre_prompt = '', suf_prompt = ''):
    """
    Args:
      fname: path of the response cache file.
      ds: DataFrame with `context` and `hypothesis` columns.
      pre_prompt: text placed before each question (e.g. a worked example).
      suf_prompt: text placed after each question.
    """
    self.responses = []
    self.fname = fname
    self.pre_prompt = pre_prompt
    self.suf_prompt = suf_prompt
    self.ds = ds

  def get_responses(self):
    """Load cached responses from `fname` into `self.responses` and return them.

    Empty records are returned as None. Raises FileNotFoundError if the cache
    file does not exist.
    """
    with open(self.fname, encoding='utf-8') as f:
      # Text after the last separator is an incomplete record; drop it.
      records = f.read().split(self.SEPARATOR)[:-1]
    self.responses = [record if record else None for record in records]
    return self.responses

  def _build_prompt(self, row):
    """Return the full prompt text for one dataset row."""
    return (self.pre_prompt
            + f'{row.context}\nQuestion: {row.hypothesis} True, False, or Neither?'
            + self.suf_prompt)

  def _append_record(self, answer):
    """Append one record (empty when `answer` is None) to the cache file."""
    with open(self.fname, 'a', encoding='utf-8') as f:
      f.write((answer or '') + self.SEPARATOR)

  def prompting(self):
    """Query the model for every row that has no response yet.

    Rows already covered by `self.responses` are skipped. A failed request is
    reported and recorded as None; it is not retried on resume.

    Returns:
      The list of responses, one entry per processed row.
    """
    rows = tqdm(self.ds.iterrows(), total=len(self.ds))
    # Track position rather than the index label so the resume check also
    # holds for frames whose index is not 0..n-1.
    for position, (_, row) in enumerate(rows):
      if position < len(self.responses):
        continue
      try:
        answer = model.generate_content(self._build_prompt(row)).text or None
      except Exception as exc:
        # Exception (not a bare except) lets KeyboardInterrupt stop the run.
        answer = None
        print(row.context, row.hypothesis, repr(exc))
      # Exactly one in-memory entry and one cache record per row, success or
      # failure, so the file never drifts out of step with the dataset.
      self.responses.append(answer)
      self._append_record(answer)
    return self.responses

## ZeroShot

In [None]:
ZeroShotTest = BaseTest('zero_r1.txt', dataset_r1)

In [None]:
responses = ZeroShotTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r1)

Total accuracy:  0.668
Label: e 0.6856287425149701
Label: c 0.6186186186186187
Label: n 0.6996996996996997


In [None]:
ZeroShotTest = BaseTest('zero_r2.txt', dataset_r2)

In [None]:
responses = ZeroShotTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r2)

Total accuracy:  0.56
Label: e 0.5479041916167665
Label: c 0.4594594594594595
Label: n 0.6726726726726727


In [None]:
ZeroShotTest = BaseTest('zero_r3.txt', dataset_r3)

In [None]:
responses = ZeroShotTest.get_responses()

In [None]:
responses = ZeroShotTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r3)

Total accuracy:  0.5683333333333334
Label: e 0.5945273631840796
Label: c 0.42424242424242425
Label: n 0.6840796019900498


## ZeroShot CoT

In [None]:
ZShCoTTest = BaseTest('zero_cot_r1.txt', dataset_r1, '', "\n Answer: Let's think step by step.")

In [None]:
responses = ZShCoTTest.get_responses()

In [None]:
responses = ZShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r1)

Total accuracy:  0.673
Label: e 0.7395209580838323
Label: c 0.6606606606606606
Label: n 0.6186186186186187


In [None]:
ZShCoTTest = BaseTest('zero_cot_r2.txt', dataset_r2, '', "\n Answer: Let's think step by step.")

In [None]:
responses = ZShCoTTest.get_responses()

In [None]:
responses = ZShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r2)

Total accuracy:  0.428
Label: e 0.40718562874251496
Label: c 0.4624624624624625
Label: n 0.4144144144144144


In [None]:
ZShCoTTest = BaseTest('zero_cot_r3.txt', dataset_r3, '', "\n Answer: Let's think step by step.")

In [None]:
responses = ZShCoTTest.get_responses()

In [None]:
responses = ZShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r3)

Total accuracy:  0.5625
Label: e 0.664179104477612
Label: c 0.49747474747474746
Label: n 0.5248756218905473


## OneShot CoT

In [None]:
OneShCoTTest = BaseTest('one_cot_r1.txt', dataset_r1, "The Parma trolleybus system forms part of the public transport network of the city and \"comune\" of Parma, in the region of Emilia-Romagna, northern Italy. In operation since 1953, the system presently comprises four urban routes. Question: The trolleybus system has over 2 urban routes. True, False or Neither?\n Answer: Let's think step by step. The system presently comprises four urban routes, since four is more than 2 the correct answer is True.", "\n Answer: Let's think step by step.")

In [None]:
responses = OneShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r1)

Total accuracy:  0.661
Label: e 0.7574850299401198
Label: c 0.6576576576576577
Label: n 0.5675675675675675


In [None]:
OneShCoTTest = BaseTest('one_cot_r2.txt', dataset_r2, "The Parma trolleybus system forms part of the public transport network of the city and \"comune\" of Parma, in the region of Emilia-Romagna, northern Italy. In operation since 1953, the system presently comprises four urban routes. Question: The trolleybus system has over 2 urban routes. True, False or Neither?\n Answer: Let's think step by step. The system presently comprises four urban routes, since four is more than 2 the correct answer is True.", "\n Answer: Let's think step by step.")

In [None]:
responses = OneShCoTTest.get_responses()

In [None]:
responses = OneShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r2)

Total accuracy:  0.434
Label: e 0.49101796407185627
Label: c 0.4264264264264264
Label: n 0.3843843843843844


In [None]:
OneShCoTTest = BaseTest('one_cot_r3.txt', dataset_r3, "The Parma trolleybus system forms part of the public transport network of the city and \"comune\" of Parma, in the region of Emilia-Romagna, northern Italy. In operation since 1953, the system presently comprises four urban routes. Question: The trolleybus system has over 2 urban routes. True, False or Neither?\n Answer: Let's think step by step. The system presently comprises four urban routes, since four is more than 2 the correct answer is True.", "\n Answer: Let's think step by step.")

In [None]:
responses = OneShCoTTest.get_responses()

In [None]:
responses = OneShCoTTest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r3)

Total accuracy:  0.5391666666666667
Label: e 0.6268656716417911
Label: c 0.4823232323232323
Label: n 0.5074626865671642


## Reductio Ad Absurdum

In [None]:
ZeroRAATest = BaseTest('zero_raa_r1.txt', dataset_r1, "", "First try to negate the hypothesis and see whether it contradicts the context.")

In [None]:
responses = ZeroRAATest.get_responses()

In [None]:
responses = ZeroRAATest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r1)

Total accuracy:  0.509
Label: e 0.5149700598802395
Label: c 0.5225225225225225
Label: n 0.4894894894894895


In [None]:
ZeroRAATest = BaseTest('zero_raa_r2.txt', dataset_r2, "", "First try to negate the hypothesis and see whether it contradicts the context.")

In [None]:
responses = ZeroRAATest.get_responses()

In [None]:
responses = ZeroRAATest.prompting()

1000it [00:55, 18.18it/s]


In [None]:
encode_and_count_accuracy(responses, dataset_r2)

Total accuracy:  0.439
Label: e 0.4041916167664671
Label: c 0.45645645645645644
Label: n 0.45645645645645644


In [None]:
ZeroRAATest = BaseTest('zero_raa_r3.txt', dataset_r3, "", "First try to negate the hypothesis and see whether it contradicts the context.")

In [None]:
responses = ZeroRAATest.prompting()

In [None]:
encode_and_count_accuracy(responses, dataset_r3)

Total accuracy:  0.5608333333333333
Label: e 0.6094527363184079
Label: c 0.5151515151515151
Label: n 0.5572139303482587


# Ensemble

In [41]:
# Majority vote over the cached responses of the four prompting strategies (R1).
responses_per_method = []
for method in ['zero_r1.txt', 'zero_cot_r1.txt', 'one_cot_r1.txt', 'zero_raa_r1.txt']:
  test = BaseTest(method, dataset_r1)
  responses = test.get_responses()
  responses_per_method.append(responses)
# One column per strategy, one row per example.
df = pd.DataFrame(responses_per_method).T
# Element-wise encoding via Series.map; DataFrame.applymap is deprecated since
# pandas 2.1.
df = df.apply(lambda column: column.map(encode))
# Row-wise mode = most frequent code; when several codes tie, the first mode
# returned by pandas is used. accuracy_score expects (y_true, y_pred).
accuracy_score(dataset_r1.label, df.mode(axis=1).iloc[:, 0].values)

0.556

In [44]:
# For each strategy, keep only the examples it answered incorrectly and report
# how often that answer equals `model_label` (presumably the label predicted by
# the model used during ANLI collection -- confirm against the dataset README).
for column in range(4):
  predictions = df.iloc[:, column]
  wrong = predictions != dataset_r1.label
  print(accuracy_score(predictions[wrong], dataset_r1.model_label[wrong]))

0.6144578313253012
0.5034965034965035
0.5085536547433903
0.5223613595706619


In [46]:
# Majority vote over the cached responses of the four prompting strategies (R2).
responses_per_method = []
for method in ['zero_r2.txt', 'zero_cot_r2.txt', 'one_cot_r2.txt', 'zero_raa_r2.txt']:
  test = BaseTest(method, dataset_r2)
  responses = test.get_responses()
  responses_per_method.append(responses)
# One column per strategy, one row per example.
df = pd.DataFrame(responses_per_method).T
# Element-wise encoding via Series.map; DataFrame.applymap is deprecated since
# pandas 2.1.
df = df.apply(lambda column: column.map(encode))
# Row-wise mode = most frequent code; when several codes tie, the first mode
# returned by pandas is used. accuracy_score expects (y_true, y_pred).
accuracy_score(dataset_r2.label, df.mode(axis=1).iloc[:, 0].values)

0.44

In [47]:
# For each strategy, keep only the examples it answered incorrectly and report
# how often that answer equals `model_label` (presumably the label predicted by
# the model used during ANLI collection -- confirm against the dataset README).
for column in range(4):
  predictions = df.iloc[:, column]
  wrong = predictions != dataset_r2.label
  print(accuracy_score(predictions[wrong], dataset_r2.model_label[wrong]))

0.6641074856046065
0.5117056856187291
0.5177419354838709
0.5151515151515151


In [19]:
# Majority vote over the cached responses of the four prompting strategies (R3).
responses_per_method = []
for method in ['zero_r3.txt', 'zero_cot_r3.txt', 'one_cot_r3.txt', 'zero_raa_r3.txt']:
  test = BaseTest(method, dataset_r3)
  responses = test.get_responses()
  responses_per_method.append(responses)
# One column per strategy, one row per example.
df = pd.DataFrame(responses_per_method).T
# Element-wise encoding via Series.map; DataFrame.applymap is deprecated since
# pandas 2.1.
df = df.apply(lambda column: column.map(encode))
# Row-wise mode = most frequent code; when several codes tie, the first mode
# returned by pandas is used. accuracy_score expects (y_true, y_pred).
accuracy_score(dataset_r3.label, df.mode(axis=1).iloc[:, 0].values)

0.5436241610738255

In [1]:
# For each strategy, keep only the examples it answered incorrectly and report
# how often that answer equals `model_label` (presumably the label predicted by
# the model used during ANLI collection -- confirm against the dataset README).
for column in range(4):
  predictions = df.iloc[:, column]
  wrong = predictions != dataset_r3.label
  print(accuracy_score(predictions[wrong], dataset_r3.model_label[wrong]))

0.6323766233766234
0.600358422939068
0.5392156862745098
0.5330578512396694
