In [1]:
import pandas as pd
import sys
import os

# Put the parent of the current working directory on the import path.
# The membership guard keeps re-runs of this cell from appending duplicates.
# Presumably this is what lets the `ai_service` import in the next cell resolve.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
# Project-local imports: `snli_label_dataset` is called once per model below, and
# `AiClient` supplies the `*_call` methods that are passed to it as `llm_classifier`.
from ai_service import snli_label_dataset, AiClient
# NOTE(review): the instance is bound to the same name as the `ai_service` module.
# It works because the module itself is never referenced afterwards, but a distinct
# name (e.g. `client`) would be clearer — rename together with the cells below.
ai_service = AiClient()

In [3]:
# Load the sentence pairs that every model below is evaluated on.
# NOTE(review): the variable is called `train`, but the file is `snli_test_sample_50.csv`
# (the progress bars below show 50 rows) — consider a name that reflects a test sample.
# NOTE(review): the `Unnamed: 0` column in the output suggests the CSV was written with
# its index; `index_col=0` would drop it — confirm nothing downstream relies on it.
train = pd.read_csv('../data/stanford-natural-language-inference-corpus/snli_test_sample_50.csv')

In [4]:
# Peek at the first two rows to check the loaded columns
# (sentence1, sentence2, gold_label in the output below).
train.head(2)

Unnamed: 0,sentence1,sentence2,gold_label
0,A kid bored in a train with brown hair and his...,A child is riding the train from New York to B...,neutral
1,A brown dog and black and white dog run along ...,Two squirrels run after acorns in the grass.,contradiction


## System prompt

In [5]:
# Prompt template handed to `snli_label_dataset` as `prompt_template` in every model
# section below. It ends with the `{premise}` / `{hypothesis}` placeholders —
# presumably filled per row from `sentence1` / `sentence2` (TODO confirm in ai_service).
# NOTE(review): despite the name, this is a full per-example template rather than a
# pure system message, since it also carries the sentence pair.
system_prompt = """
You are a high-precision classifier for the Natural Language Inference (NLI) task. Your sole goal is to determine the **logical relationship** between a pair of sentences.

Input:
1. SENTENCE 1 (Premise): The original statement.
2. SENTENCE 2 (Hypothesis): The statement under test.

Logical Relationships (Answers):
* entailment: If SENTENCE 1 **logically guarantees** the truth of SENTENCE 2.
* contradiction: If SENTENCE 1 **logically excludes** the truth of SENTENCE 2.
* neutral: If SENTENCE 1 **does not provide enough information** to determine the truth or falsity of SENTENCE 2.

Output Format:
Your answer must be **strictly one word** from the allowed list.

SENTENCE 1: {premise}
SENTENCE 2: {hypothesis}
"""

## gpt-4o-mini

In [7]:
# Label the sample with gpt-4o-mini. The call returns a per-row results frame and an
# aggregate metrics dict (both shown in the next cell).
# NOTE(review): `res_df` / `metrics` are rebound by every model section, so this
# model's results are lost once the next section runs — keep per-model names or a
# dict if the models are to be compared.
# NOTE(review): `apenai_call` looks like a misspelling of `openai_call`; the name is
# defined in ai_service, so any rename has to happen there first.
res_df, metrics = snli_label_dataset(
    train,
    prompt_template=system_prompt,
    model_name="gpt-4o-mini",
    llm_classifier=ai_service.apenai_call
)

100%|██████████| 50/50 [00:28<00:00,  1.76it/s]


In [8]:
# Render the first rows of the per-example results, followed by the aggregate metrics.
display(res_df.head(), metrics)

Unnamed: 0,sentence1,sentence2,gold_label,pred_raw,pred_label,time_sec,prompt_tokens,completion_tokens,total_tokens
0,A kid bored in a train with brown hair and his...,A child is riding the train from New York to B...,neutral,neutral,neutral,0.792366,208,1,209
1,A brown dog and black and white dog run along ...,Two squirrels run after acorns in the grass.,contradiction,neutral,neutral,0.39615,205,1,206
2,A person in full astronaut suit and gear train...,A female astronaut adjusting to the feeling of...,neutral,neutral,neutral,0.623418,214,1,215
3,An old Indian man dressed in rags sleeps on th...,An old Indian man is dressed up,contradiction,contradiction,contradiction,0.465151,208,3,211
4,A bunch of people are standing all together in...,A group of people are planning something.,neutral,neutral,neutral,0.657472,207,1,208


{'count': 50,
 'avg_time': 0.5674446630477905,
 'total_cost_tokens': 10406,
 'sum_prompt_tokens': 10304,
 'sum_completion_tokens': 102,
 'errors': 0}

## gpt-4o

In [17]:
# Label the same sample with gpt-4o, using the same prompt template and client method
# as the gpt-4o-mini section. This overwrites the previous `res_df` / `metrics`.
# NOTE(review): the execution count jumps from 8 to 17 here, so the saved outputs were
# not produced by one top-to-bottom run — Restart & Run All before trusting them.
res_df, metrics = snli_label_dataset(
    train,
    prompt_template=system_prompt,
    model_name="gpt-4o",
    llm_classifier=ai_service.apenai_call
)

100%|██████████| 50/50 [00:35<00:00,  1.42it/s]


In [18]:
# Render the first rows of the gpt-4o results, followed by the aggregate metrics.
display(res_df.head(), metrics)

Unnamed: 0,sentence1,sentence2,gold_label,pred_raw,pred_label,time_sec,prompt_tokens,completion_tokens,total_tokens
0,A kid bored in a train with brown hair and his...,A child is riding the train from New York to B...,neutral,neutral,neutral,0.735654,208,1,209
1,A brown dog and black and white dog run along ...,Two squirrels run after acorns in the grass.,contradiction,neutral,neutral,0.555125,205,1,206
2,A person in full astronaut suit and gear train...,A female astronaut adjusting to the feeling of...,neutral,neutral,neutral,0.461536,214,1,215
3,An old Indian man dressed in rags sleeps on th...,An old Indian man is dressed up,contradiction,contradiction,contradiction,0.531079,208,3,211
4,A bunch of people are standing all together in...,A group of people are planning something.,neutral,neutral,neutral,0.551588,207,1,208


{'count': 50,
 'avg_time': 0.7039625024795533,
 'total_cost_tokens': 10406,
 'sum_prompt_tokens': 10304,
 'sum_completion_tokens': 102,
 'errors': 0}

## gemini-2.0-flash

In [6]:
# Label the same sample with gemini-2.0-flash through the client's `gemini_call`.
# This overwrites the previous `res_df` / `metrics`.
# NOTE(review): the execution count restarts at 6 here, and the saved output for this
# section has no `pred_raw` column, unlike the GPT sections. These cells appear to come
# from a different kernel session or ai_service version — re-run everything before
# comparing models.
res_df, metrics = snli_label_dataset(
    train,
    prompt_template=system_prompt,
    model_name="gemini-2.0-flash",
    llm_classifier=ai_service.gemini_call
)

100%|██████████| 50/50 [00:25<00:00,  1.93it/s]


In [7]:
# Render the first rows of the gemini-2.0-flash results, followed by the aggregate metrics.
display(res_df.head(), metrics)

Unnamed: 0,sentence1,sentence2,gold_label,pred_label,time_sec,prompt_tokens,completion_tokens,total_tokens
0,A kid bored in a train with brown hair and his...,A child is riding the train from New York to B...,neutral,neutral,0.653339,207,2,209
1,A brown dog and black and white dog run along ...,Two squirrels run after acorns in the grass.,contradiction,neutral,0.509239,203,2,205
2,A person in full astronaut suit and gear train...,A female astronaut adjusting to the feeling of...,neutral,neutral,0.442758,213,2,215
3,An old Indian man dressed in rags sleeps on th...,An old Indian man is dressed up,contradiction,contradiction,0.602235,205,3,208
4,A bunch of people are standing all together in...,A group of people are planning something.,neutral,neutral,0.499803,206,2,208


{'count': 50,
 'avg_time': 0.5152929592132568,
 'total_cost_tokens': 10368,
 'sum_prompt_tokens': 10233,
 'sum_completion_tokens': 135,
 'errors': 0}

## claude-3-5-haiku

In [8]:
# Label the same sample with claude-3-5-haiku-20241022 through the client's
# `anthropic_call`. This overwrites the previous `res_df` / `metrics`.
# NOTE(review): the saved metrics for this run report 7 errors and 4431 completion
# tokens (102-135 for the other models), and `pred_label` is blank on some displayed
# rows. The model appears to answer with more than the single word the prompt asks
# for — inspect the raw responses and the label parsing before using these numbers.
res_df, metrics = snli_label_dataset(
    train,
    prompt_template=system_prompt,
    model_name="claude-3-5-haiku-20241022",
    llm_classifier=ai_service.anthropic_call
)

100%|██████████| 50/50 [02:06<00:00,  2.53s/it]


In [9]:
# Render the first rows of the claude-3-5-haiku results, followed by the aggregate metrics.
display(res_df.head(), metrics)

Unnamed: 0,sentence1,sentence2,gold_label,pred_label,time_sec,prompt_tokens,completion_tokens,total_tokens
0,A kid bored in a train with brown hair and his...,A child is riding the train from New York to B...,neutral,neutral,4.415553,230,167,397
1,A brown dog and black and white dog run along ...,Two squirrels run after acorns in the grass.,contradiction,,2.041509,229,63,292
2,A person in full astronaut suit and gear train...,A female astronaut adjusting to the feeling of...,neutral,entailment,2.443652,240,75,315
3,An old Indian man dressed in rags sleeps on th...,An old Indian man is dressed up,contradiction,,2.426506,231,86,317
4,A bunch of people are standing all together in...,A group of people are planning something.,neutral,neutral,3.615615,228,119,347


{'count': 50,
 'avg_time': 2.5237209463119505,
 'total_cost_tokens': 15845,
 'sum_prompt_tokens': 11414,
 'sum_completion_tokens': 4431,
 'errors': 7}