## Persona

In [30]:
import importlib
import brikasutils as bu
importlib.reload(bu)
import fb_msg_reader as fb
importlib.reload(fb)
import shared_utils as utils
importlib.reload(utils)
from typing import List
import re


In [2]:
import persona
importlib.reload(persona)

texts_with_elias = [
    "selected-data/elias/message_1.json",
]

texts_with_petyo = [
    "selected-data/petyo/message_1.json",
    "selected-data/petyo/message_2.json",
    "selected-data/petyo/message_3.json",
    "selected-data/petyo/message_4.json",
    "selected-data/petyo/message_5.json",
]

ab = persona.PersonaEncoder()
ab.parse_fb_messages(texts_with_elias, "elias")
ab.parse_fb_messages(texts_with_petyo, "petyo")

ab.filter_chats_empty()
ab.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Compress names
for nameid, chat in ab.chats.items():
    for msg in chat:  
        msg.sender = "Persona" if msg.sender == "Airidas Brikas" else "Friend"

# Start all chats from 2/3rds
for nameid, chat in ab.chats.items():
    ab.chats[nameid] = chat[int(len(chat)/3 * 2):]

ab.select_chat_limited_by_tokens("elias", 6000)
ab.select_chat_limited_by_tokens("petyo", 6000)
persona_text = ab.output()
bu.quickTXT(persona_text)



Read 1946 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-13 to 2024-03-06
Messages saved to self.fb_messages['elias']
Read 40036 messages from 5 files. Failed to read 0 messages.
Messages ranged from 2020-08-17 to 2024-03-04
Messages saved to self.fb_messages['petyo']
Filtering
link-filter: 950
react-filter: 30




Selected chat elias for 5828 (211 messages)
Selected chat petyo for 5997 (402 messages)


## Survey

In [2]:
import survey
importlib.reload(survey)
csv_file = "surveys/survey_kano-model_v1.csv"
surv = survey.KanoSurvey(csv_file)

## Prompt Builder

In [21]:
def userMsg(*args) -> dict:
    return {"role": "user", "content": "\n".join(args)}
def assistantMsg(*args) -> dict:
    return {"role": "assistant", "content": "\n".join(args)}
def systemMsg(*args) -> dict:
    return {"role": "system", "content": "\n".join(args)}

In [11]:
persona_text = "Favorite video games are Rimworld, Minecraft, Age of Empires, 7 Days to Die"

In [23]:
final_prompts = []

for question in surv.questions:
    p = [
        # userMsg(
        #     "\n".join([
        #         "Text below contains extract of data about a persona. Use this data to understanding the likes and prefferences of the persona.",
        #         persona_text
        #     ])
        # ),
        # assistantMsg("Understood. I will answer my questions from the point of view of the persona"),
        userMsg("\n".join([
            f"You are surveyed about video game prefferences. Reply to the statement below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. Your answer should only contain the chosen option. ",
            question
        ])),
    ]
    final_prompts.append(p)
bu.quickJSON(final_prompts, "ignorefolder/prompts.json")

## Run Simulation

In [24]:
##################################
SIMULATION_NAMEID = "base"
SETTINGS = {
    # "description": "Base test",
    "model": "gpt-3.5-turbo",
}
##################################

import pandas as pd
from openai import OpenAI
import os

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

results = []
l = len(final_prompts)

for i, (prompt, question) in enumerate(list(zip(final_prompts, surv.questions))):
    print(f"{i}/{l}...", end="") # Print progress
    # Send the Request
    full_response = client.chat.completions.create(
        messages=prompt,
        model=SETTINGS["model"],
    )
    r = full_response.choices[0].message.content

    # Process response
    if r not in surv.POSSIBLE_ANSWERS:
        results.append({'question': question, 'answer': r, 'isValid': False})
        continue
    results.append({'question': question, 'answer': r, 'isValid': True})
    print(f"{question}:\n {r}")

# Save results
df = pd.DataFrame(results)
df.to_csv(f"results/{SIMULATION_NAMEID}_simulation.csv", index=False)
bu.quickJSON(results, f"results/{SIMULATION_NAMEID}_prompts.json")
bu.quickJSON(SETTINGS, f"results/{SIMULATION_NAMEID}_setings.json")

0/40...1/40...2/40...What would you say if the game had the option to save the game at any time?:
 I LIKE IT
3/40...What would you say if the game did NOT have the option to save the game at any time?:
 I EXPECT IT
4/40...What would you say if the game has good graphics?:
 I LIKE IT
5/40...What would you say if the game had NO good graphics, or rather poor graphics?:
 I CAN TOLERATE IT
6/40...What would you say if the game had an exciting storyline?:
 I LIKE IT
7/40...What would you say if the game did NOT have an exciting storyline?:
 I DISLIKE IT
8/40...What would you say if there were rewards such as extra points, in-game currency or coins in the game?:
 I LIKE IT
9/40...What would you say if there were NO rewards such as extra points, in-game currency or coins in the game?:
 I AM NEUTRAL
10/40...What would you say if the game had realistic game physics?:
 I LIKE IT
11/40...What would you say if the game does NOT have realistic physics?:
 I LIKE IT
12/40...What would you say if the 

## Analysis

In [37]:
# remove all characters from a black list from the column answer
for substr in utils.BLACKLIST_ANSWER_SUBSTRINGS:
    df['answer'] = df['answer'].apply(lambda x: re.sub(substr, "", x))
# Update isValid
    df['isValid'] = df['answer'].apply(lambda x: x in surv.POSSIBLE_ANSWERS)

# if all values in isValid is true, drop the column, else print a message
if df['isValid'].all():
    df = df.drop('isValid', axis=1)
else:
    print("Some answers were not valid")

df.head()

Unnamed: 0,question,answer
0,What would you say if there were options to de...,I LIKE IT
1,What would you say if there were NO options to...,I DISLIKE IT
2,What would you say if the game had the option ...,I LIKE IT
3,What would you say if the game did NOT have th...,I EXPECT IT
4,What would you say if the game has good graphics?,I LIKE IT


In [None]:
# Proces simulation output
import pandas as pd
df = pd.DataFrame(results)

air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])

df['answer'] = df['answer'].str.lower()
df['airidas'] = df['airidas'].str.lower()
df['elias'] = df['elias'].str.lower()

In [31]:
df = pd.read_csv('results/base_simulation.csv')
# df = df.drop(df.columns[0], axis=1) #if loaded from csv, drop the added index col

In [14]:


# add a new column boolean comparing
df['airidas_correct'] = df['answer'] == df['airidas']
df['elias_correct'] = df['answer'] == df['elias']

In [20]:
# compute one number of how the percentage of correct answers
print(df['airidas_correct'].sum() / len(df))
print(df['elias_correct'].sum() / len(df))


0.35
0.225
