#コロナ禍での行動変容の効果をLLMで再現できるか調べる\n
##実装する機能
・プロンプト入力
・プロンプト回答
・集計
・表作成

In [2]:
#import libraries
import os
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
import re
import random
import matplotlib.pyplot as plt
import ast
import json
from collections import Counter


client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
    )
pd.set_option('display.max_colwidth', None)

Matplotlib is building the font cache; this may take a moment.


In [7]:
def generate_completion(role, prompt,questionaire):
    response = client.chat.completions.create(
        model="gpt-4o",
        #model="gpt-3.5-turbo-1106",
        messages=[
            {"role": "system", "content": role},
            {"role": "user", "content": f"{prompt},\n{questionaire}"}
        ],
        temperature=0.2,
        max_tokens=1000,
        )
    return response

In [40]:
#make role
age_list=["20-29","30-39","40-49","50-59","60-64"]
sex_list=["male","female"]
age="20-29" #random.choice(age_list)
sex="male" #random.choice(sex_list)


#role = f"Act as a person with the following attributes. {random.choice(age_list)} years old, {random.choice(sex_list)}."
role =f"You are a person who is {age} years old,{sex}"
#make prompt
prompt="Understanding Your Attitude Towards Covid-19 Prevention\n\
We invite you to participate in this questionnaire to help us gain insights into your attitudes and behaviors regarding Covid-19 prevention. Your responses will be invaluable in shaping effective public health strategies.\n\
Instructions:\n\
・Carefully consider each question independently.\n\
・Select the option that best reflects your current situation and personal views.\n\
・Answer each question based solely on your own experiences and beliefs.\n\
Answer Format Example:\n\
Go shopping everyday\n\
['1.Very true']\n\
Thank you for your participation."

#make options
options_list=["1.Very true","2.True","3.Neither","4.Not true","5.Not at all"]
questions_list=["Avoid a poorly-ventilated closed space",
"Avoid large gatherings",
"Avoid conversations or shouting in close proximity",
"Avoid places where items 1-3 above overlap",
"Do not go to dinner with friends",
"Do not go to mass gatherings",
"Participate in virtual events using online tools",
"Undertake frequent handwashing",
"Undertake cough etiquette (use handkerchiefs or sleeves instead of hands)",
"Disinfect things around",
"Avoid going out when you have a cold",
"Avoid going to clinic even when having a cold symptom",
"Prepare consultation and transportation methods for when you feel ill",
"Always wear a surgical-style mask when going out",
"Stockpile surgical-style masks",
"Stockpile food, toilet paper, tissue paper, etc.",
"Avoid contact with younger people",
"Avoid contact with older people",
"Get sufficient rest and sleep",
"Eat a nutritious diet",
"Do exercise such as jogging or exercise using DVD"]

#make questionaire
questionaire=""
for i in range(len(questions_list)):
    questionaire+=f"{questions_list[i]}\n{options_list}\n\n"

def get_answer(content) -> dict[str:str]:
    pattern = re.compile(r"\['(.*?)'\]")
    answers = pattern.findall(content)
    answers_dict = dict(zip(questions_list,answers))
    return answers_dict

response = generate_completion(role,prompt,questionaire)
content=response.choices[0].message.content
answers=get_answer(content)
print(type(answers))
print(answers)


<class 'dict'>
{'Avoid a poorly-ventilated closed space': '1.Very true', 'Avoid large gatherings': '1.Very true', 'Avoid conversations or shouting in close proximity': '1.Very true', 'Avoid places where items 1-3 above overlap': '1.Very true', 'Do not go to dinner with friends': '2.True', 'Do not go to mass gatherings': '1.Very true', 'Participate in virtual events using online tools': '2.True', 'Undertake frequent handwashing': '1.Very true', 'Undertake cough etiquette (use handkerchiefs or sleeves instead of hands)': '1.Very true', 'Disinfect things around': '2.True', 'Avoid going out when you have a cold': '1.Very true', 'Avoid going to clinic even when having a cold symptom': '4.Not true', 'Prepare consultation and transportation methods for when you feel ill': '2.True', 'Always wear a surgical-style mask when going out': '1.Very true', 'Stockpile surgical-style masks': '2.True', 'Stockpile food, toilet paper, tissue paper, etc.': '2.True', 'Avoid contact with younger people': '4.N

In [70]:
def generate_data(count)->pd.DataFrame:
    global age_list, sex_list, prompt, questionaire
    data = []
    #generate completion
    for i in range(count):
        age=random.choice(age_list)
        sex=random.choice(sex_list)
        #role = f"Act as a person with the following attributes. {random.choice(age_list)} years old, {random.choice(sex_list)}."
        role =f"You are a person who is {age} years old,{sex}"
        response = generate_completion(role, prompt,questionaire)
        content = response.choices[0].message.content
        answers = get_answer(content)
        u_40 = 0 if age in ["20-29","30-39"] else 1
        new_data = {"age":age,"u_40":u_40,"sex":sex,"content":content}
        new_data.update(answers)
        data.append(new_data)

    df_data = pd.DataFrame(data)
    return df_data

In [97]:
import pandas as pd
# aggregate data
def aggregate_data(df_data):
    global questions_list, options_list
    #aggregate data
    data_aggregate =[]
    for question in questions_list:
        for option in options_list:
            count = df_data[question].str.contains(option).sum()
            data_aggregate.append({"question":question,"option":option,"count":count})

    df_aggregate = pd.DataFrame(data_aggregate)
    # Pivot the DataFrame
    df_aggregate_pivot = df_aggregate.pivot(index='question', columns='option', values='count')

    # Fill NaN with 0 (if any)
    df_aggregate_pivot = df_aggregate_pivot.fillna(0)
    return df_aggregate_pivot



In [72]:
def separate_dataframes(df_data, filter_list, filter_column):
    """
    Separate the DataFrame based on the unique values in the specified column.

    Parameters:
    df_data (pd.DataFrame): The input DataFrame.
    filter_list (list): The list of unique values to filter by.
    filter_column (str): The column to filter on.

    Returns:
    dict: A dictionary of DataFrames filtered by the specified column values.
    """
    df_data_dict = {}
    for value in filter_list:
        df_data_dict[f'df_data_{value}'] = df_data[df_data[filter_column] == value]
    return df_data_dict

In [103]:
import pandas as pd

def aggregate_dataframes(df_data_dict):

    # Create an iterator from the dictionary items
    dict_iterator = iter(df_data_dict.items())

    # Retrieve values one by one
    try:
        key,value = next(dict_iterator)
        df_aggregate_1 = aggregate_data(value)

        key,value = next(dict_iterator)
        df_aggregate_2 = aggregate_data(value)

        # Continue calling next(dict_iterator) as needed
    except StopIteration:
        print("No more items in the dictionary.")
    return df_aggregate_1,df_aggregate_2

def calculate_ratio(df_aggregate,attribute):
    df_aggregate[f"Ratio of true:{attribute}"] = (df_aggregate["1.Very true"] + df_aggregate["2.True"])/df_aggregate.sum(axis=1)
    return df_aggregate[f"Ratio of true:{attribute}"]

In [75]:
number_of_sample = 50
df_data = generate_data(number_of_sample)

df_data_dict_sex = separate_dataframes(df_data,sex_list,"sex")
df_data_dict_age = separate_dataframes(df_data,[0,1],"u_40")

df_aggregate = aggregate_data(df_data)

df_male_aggregate,df_female_aggregate = aggregate_dataframes(df_data_dict_sex)
df_u40_aggregate,df_o40_aggregate = aggregate_dataframes(df_data_dict_age)

dataframes_to_process = {
    "all":df_aggregate,
    "male":df_male_aggregate,
    "female":df_female_aggregate,
    "u40":df_u40_aggregate,
    "o40":df_o40_aggregate
}

# Calculate ratios for each DataFrame
ratios = [calculate_ratio(df,key) for key,df in dataframes_to_process]

# Concatenate the resulting DataFrames
df_ratio_unif = pd.concat(ratios, axis=1)

df_ratio_unif.head(21)

TypeError: tuple indices must be integers or slices, not str

In [107]:


dataframes_to_process = {
    "all":df_aggregate,
    "male":df_male_aggregate,
    "female":df_female_aggregate,
    "u40":df_u40_aggregate,
    "o40":df_o40_aggregate
}

# Calculate ratios for each DataFrame
ratios = [calculate_ratio(df,key) for key,df in dataframes_to_process.items()]

# Concatenate the resulting DataFrames
df_ratio_unif = pd.concat(ratios, axis=1)

df_ratio_unif.head(21)

Unnamed: 0_level_0,Ratio of true:all,Ratio of true:male,Ratio of true:female,Ratio of true:u40,Ratio of true:o40
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Always wear a surgical-style mask when going out,0.962237,0.930869,0.92572,0.888187,0.947295
Avoid a poorly-ventilated closed space,0.962237,0.930869,0.92572,0.888187,0.947295
Avoid contact with older people,0.681183,0.795889,0.518746,0.888187,0.553632
Avoid contact with younger people,0.0,0.0,0.0,0.0,0.0
Avoid conversations or shouting in close proximity,0.962237,0.930869,0.92572,0.888187,0.947295
Avoid going out when you have a cold,0.962237,0.930869,0.92572,0.888187,0.947295
Avoid going to clinic even when having a cold symptom,0.0,0.0,0.0,0.0,0.0
Avoid large gatherings,0.962237,0.930869,0.92572,0.888187,0.947295
Avoid places where items 1-3 above overlap,0.962237,0.930869,0.92572,0.888187,0.947295
Disinfect things around,0.962237,0.930869,0.92572,0.888187,0.947295


In [100]:
df_aggregate = aggregate_data(df_data)

df_male_aggregate,df_female_aggregate = aggregate_dataframes(df_data_dict_sex)
df_u40_aggregate,df_o40_aggregate = aggregate_dataframes(df_data_dict_age)

dataframes_to_process = [
    df_aggregate,
    df_male_aggregate,
    df_female_aggregate,
    df_u40_aggregate,
    df_o40_aggregate
]

# Calculate ratios for each DataFrame
ratios = [calculate_ratio(df) for df in dataframes_to_process]

# Concatenate the resulting DataFrames
df_ratio_unif = pd.concat(ratios, axis=1)

df_ratio_unif.head(21)

Unnamed: 0_level_0,Ratio of true,Ratio of true,Ratio of true,Ratio of true,Ratio of true
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Always wear a surgical-style mask when going out,1.0,1.0,1.0,1.0,1.0
Avoid a poorly-ventilated closed space,1.0,1.0,1.0,1.0,1.0
Avoid contact with older people,0.7,0.846154,0.541667,1.0,0.571429
Avoid contact with younger people,0.0,0.0,0.0,0.0,0.0
Avoid conversations or shouting in close proximity,1.0,1.0,1.0,1.0,1.0
Avoid going out when you have a cold,1.0,1.0,1.0,1.0,1.0
Avoid going to clinic even when having a cold symptom,0.0,0.0,0.0,0.0,0.0
Avoid large gatherings,1.0,1.0,1.0,1.0,1.0
Avoid places where items 1-3 above overlap,1.0,1.0,1.0,1.0,1.0
Disinfect things around,1.0,1.0,1.0,1.0,1.0


In [91]:
#print(df_data_dict_sex)
dict_iterator = iter(df_data_dict_sex.items())


# Retrieve values one by one
try:
    key, value = next(dict_iterator)
    print(f"Key: {key}, Value:\n{value}\n")

    key, value = next(dict_iterator)
    print(f"Key: {key}, Value:\n{value}\n")

    # Continue calling next(dict_iterator) as needed
except StopIteration:
    print("No more items in the dictionary.")


Key: df_data_male, Value:
      age  u_40   sex   
0   40-49     1  male  \
1   50-59     1  male   
2   40-49     1  male   
4   20-29     0  male   
6   40-49     1  male   
8   30-39     0  male   
9   20-29     0  male   
10  30-39     0  male   
14  50-59     1  male   
16  50-59     1  male   
17  30-39     0  male   
19  30-39     0  male   
22  40-49     1  male   
23  20-29     0  male   
24  30-39     0  male   
26  50-59     1  male   
29  50-59     1  male   
30  60-64     1  male   
37  40-49     1  male   
38  50-59     1  male   
40  30-39     0  male   
41  20-29     0  male   
42  60-64     1  male   
43  20-29     0  male   
45  40-49     1  male   
49  30-39     0  male   

                                                                                                                                                                                                                                                                                                          