In [2]:
import pandas as pd

# let's check the completeness of the model output
import os

folder_path = "resource/input/causal_discovery"

for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        length = len(df)
        print(f"{filename} 的长度为: {length}")

causal_discovery_experiment_prompt.csv 的长度为: 19180
exp_result_Llama-2-13b-chat-hf_20240506005530.csv 的长度为: 14114
exp_result_Meta-Llama-3-8B-Instruct_20240506005718.csv 的长度为: 8528
exp_result_Mistral-7B-Instruct-v0.2_20240506014939.csv 的长度为: 14551
exp_result_Meta-Llama-3-8B-Instruct_20240507183022.csv 的长度为: 10652
exp_result_Llama-2-13b-chat-hf_20240505194445.csv 的长度为: 13
exp_result_Llama-2-7b-chat-hf_20240505194019.csv 的长度为: 12
exp_result_Llama-2-7b-chat-hf_20240506004401.csv 的长度为: 19180


In [3]:
result_complete = pd.read_csv(
    "/Users/lujun.li/projects/causallm-github/causalllm/resource/input/causal_discovery/exp_result_Llama-2-7b-chat-hf_20240506004401.csv"
)

result_complete1 = pd.read_csv(
    "/Users/lujun.li/projects/causallm-github/causalllm/resource/input/causal_discovery/exp_result_Meta-Llama-3-8B-Instruct_20240506005718.csv"
)

result_complete2 = pd.read_csv(
    "/Users/lujun.li/projects/causallm-github/causalllm/resource/input/causal_discovery/exp_result_Meta-Llama-3-8B-Instruct_20240507183022.csv"
)

result_complete = pd.concat([result_complete1, result_complete2], ignore_index=True)

result_complete = pd.read_csv(
    "/Users/lujun.li/projects/causallm-github/causalllm/resource/input/causal_discovery/exp_result_Llama-2-13b-chat-hf_20240506005530.csv"
)



len(result_complete)

14114

In [5]:
result_complete["llama2_chat_initial_prompt"].sample(1).to_dict()

{1408: '\n<s>[INST] <<SYS>>\nYou are a conscientious assistant, and you must answer my questions exactly as my questions require.\n<</SYS>>\nPlease reference the following three examples (including premise, query, possible answers, and correct answer) to answer the final query.\n\n    Examples: \n\n    Premise: A boy strikes a match.\n    Query: What would have happened if the boy had ditched the match?\n    Possible answers: The lighting of the match would not have occurred.\n    Correct answer: He would have been disqualified.\n    Premise: A boy strikes a match.\nQuery: What would have happened if the boy had ditched the match?\nPossible answers: The lighting of the match would not have occurred.He would have been disqualified.The game would have ended tied.\nCorrect answer:\n[/INST]\n'}

In [6]:
def extract_final_response(row):
    prompt = row["llama2_chat_initial_prompt"]
    response = row["generated_response"]
    return response[len(prompt) :]


result_complete["generated_response_pure"] = result_complete.apply(
    extract_final_response, axis=1
)

In [7]:
result_complete["generated_response_pure"].sample(1).to_dict()

{8797: 'Based on the given examples, to answer the final query, "What would have happened if the bus had run over the family?", the correct answer would be:\n\n"The family would have been hurt."\n\nExplanation:\n\nIn the given premise, "A bus unloads a family.", the focus is on the unloading of the family from the bus. If the bus had run over the family, it would have caused harm to the family, and the bus would not have successfully unloaded them. Therefore, the correct answer is "The family would have been hurt.".'}

In [58]:
import re


def find_correct_answer(sentence):
    pattern = r"Correct answer:(.+?)\."
    match = re.search(pattern, sentence)
    if match:
        return match.group(1), True
    else:
        return "", False


result_complete[["generated_answer", "is_answer_found"]] = result_complete[
    "generated_response_pure"
].apply(lambda x: pd.Series(find_correct_answer(x)))

In [59]:
# The model confused about the input by asking for the query

result_complete[result_complete["is_answer_found"] == False][
    "generated_response_pure"
].str.contains("Please provide")


0        False
1        False
2        False
3        False
5        False
         ...  
14105    False
14107    False
14108    False
14109    False
14112    False
Name: generated_response_pure, Length: 9970, dtype: bool

In [60]:
result_complete["is_confused"] = result_complete[
    result_complete["is_answer_found"] == False
]["generated_response_pure"].apply(lambda x: True if "Please provide" in x else False)

In [61]:
result_complete["is_confused"].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  result_complete["is_confused"].fillna(False, inplace=True)
  result_complete["is_confused"].fillna(False, inplace=True)


In [70]:
print("length of the answer")
print(
    len(
        result_complete
    )
)

print("length of not detected correct answer")
print(
    len(
        result_complete[
            result_complete["is_answer_found"] == False
        ]
    )
)

print("length of not detected correct answer but without confusion")
print(
    len(
        result_complete[
            (result_complete["is_confused"] == False)
            & (result_complete["is_answer_found"] == False)
        ]
    )
)



length of the answer
14114
length of not detected correct answer
9970
length of not detected correct answer but without confusion
9663


In [63]:
# label 1 : found correct answer
# label 2 : correct answer not found because they didn't unstand the examples and ask for query
# label 3 : didn't understand anything
result_complete.loc[result_complete["is_answer_found"] == True, "label"] = 1

result_complete.loc[
    (result_complete["is_answer_found"] == False)
    & (result_complete["is_confused"] == True),
    "label",
] = 2

result_complete.loc[
    (result_complete["is_answer_found"] == False)
    & (result_complete["is_confused"] == False),
    "label",
] = 3

In [64]:
import string


def remove_punctuation(text):
    return re.sub(r"[^\w\s]", "", text)


result_complete["CorrectAnswerCleaned"] = result_complete["CorrectAnswer"].str.replace(
    r"[^\w\s]", "", regex=True
)

result_complete["exact_match"] = (
    result_complete["generated_answer"].str.strip()
    == result_complete["CorrectAnswerCleaned"]
)
result_complete["exact_match"].value_counts()

exact_match
False    13957
True       157
Name: count, dtype: int64

In [65]:
result_complete[result_complete["Temperature"] == 0.1][
    [
        "generated_answer",
        "CorrectAnswer",
        "QCC",
        "Premise",
        "llama2_chat_initial_prompt",
    ]
].sample(1).to_dict()

{'generated_answer': {2074: ''},
 'CorrectAnswer': {2074: 'That is not possible.'},
 'QCC': {2074: 'What would have happened if a video had watched a man?'},
 'Premise': {2074: 'A man watches a video.'},
 'llama2_chat_initial_prompt': {2074: '\n<s>[INST] <<SYS>>\nYou are a conscientious assistant, and you must answer my questions exactly as my questions require.\n<</SYS>>\nPlease reference the following three examples (including premise, query, possible answers, and correct answer) to answer the final query.\n\n    Examples: \n\n    Premise: A man watches a video.\n    Query: What would have happened if a video had watched a man?\n    Possible answers: That is not possible.\n    Correct answer: Nothing special would have happened.\n    Premise: A man watches a video.\nQuery: What would have happened if a video had watched a man?\nPossible answers: That is not possible.Nothing special would have happened.The video would have been happy.\nCorrect answer:\n[/INST]\n'}}

In [66]:
from Levenshtein import distance


def levenshtein_similarity(str1, str2):
    max_length = max(len(str1), len(str2))
    if max_length == 0:
        return 1.0
    else:
        return 1.0 - (distance(str1, str2) / max_length)


result_complete["levenshtein_similarity"] = result_complete.apply(
    lambda row: levenshtein_similarity(
        row["generated_answer"].strip(), row["CorrectAnswerCleaned"].strip()
    ),
    axis=1,
)

In [67]:
result_complete[
    (result_complete["levenshtein_similarity"] >= 0.95)
    & (result_complete["levenshtein_similarity"] < 1)
][
    [
        "generated_answer",
        "CorrectAnswer",
        "QCC",
        "Premise",
        "llama2_chat_initial_prompt",
        "levenshtein_similarity",
    ]
]

Unnamed: 0,generated_answer,CorrectAnswer,QCC,Premise,llama2_chat_initial_prompt,levenshtein_similarity
11268,"In case someone finds out, they would get in ...",In case someone finds out they would get in tr...,What would have happened if the politician had...,A politician accepts an invitation.,\n<s>[INST] <<SYS>>\nYou are a conscientious a...,0.986486


In [68]:
exact_match_counts = (
    result_complete[result_complete["exact_match"] == True].groupby("Premise").size()
)

In [69]:
# Calculate the total number of correct answers (exact matches)
total_correct = result_complete["exact_match"].sum()

# Calculate the total number of questions
total_questions = len(result_complete)

# Calculate the T1 score
T1_score = total_correct / total_questions
T1_score

0.011123706957630722