## Initial setup

In [24]:
import importlib  # Used to add reload functionality, for when we update our own custom libraries
import re
from typing import List


# Custom libraries
import brikasutils as bu
importlib.reload(bu)
import shared_utils as utils
# Reload BEFORE the from-import: importlib.reload() does not rebind names that were
# already copied out of the module, so importing the helpers first would leave
# systemMsg/userMsg/... pointing at the stale, pre-reload objects.
importlib.reload(utils)
from shared_utils import systemMsg, userMsg, assistantMsg, find_most_similar

<module 'shared_utils' from '/Users/twenythree/Home/Prot/Thesis/shared_utils.py'>

## Persona

In [13]:
import re

# Custom libraries
import fb_msg_reader as fb
importlib.reload(fb)
import persona
importlib.reload(persona)


<module 'persona' from '/Users/twenythree/Home/Prot/Thesis/persona.py'>

In [14]:
# --- Input files -----------------------------------------------------------
texts_with_elias = ["selected-data/elias/message_1.json"]
texts_with_petyo = [f"selected-data/petyo/message_{part}.json" for part in range(1, 6)]

# One export file per friend, keyed by the friend's name id ...
texts_with_others_dict = {
    friend: [f"selected-data/others/{friend}.json"]
    for friend in (
        "anna", "patryk", "andreas", "victoria", "joanna", "antoni",
        "arijan", "denis", "alexandra", "FED", "filip", "kuba",
        "laura", "liisa", "luiza", "marcus",
    )
}
# ... except victoria, whose conversation is split across two files.
texts_with_others_dict["victoria"].append("selected-data/others/victoria2.json")


# --- Parse and filter --------------------------------------------------------
ab = persona.PersonaEncoder()
for name, texts in (
    ("elias", texts_with_elias),
    ("petyo", texts_with_petyo),
    *texts_with_others_dict.items(),
):
    ab.parse_fb_messages(texts, name)

ab.filter_chats_empty()
ab.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Compress names: every sender becomes either "Persona" or "Friend"
for chat in ab.chats.values():
    for msg in chat:
        if msg.sender == "Airidas Brikas":
            msg.sender = "Persona"
        else:
            msg.sender = "Friend"

# Start all chats from 2/3rds (only the last third of each chat is kept)
for nameid, chat in ab.chats.items():
    first_kept = int(len(chat)/3 * 2)
    ab.chats[nameid] = chat[first_kept:]

# --- Select chats and export -------------------------------------------------
# ab.select_chat_limited_by_tokens("elias", 6000)
# ab.select_chat_limited_by_tokens("petyo", 6000)
for name in ("elias", "petyo", *texts_with_others_dict.keys()):
    ab.select_chat_full(name)

token_counts = ab.count_all_selected_chat_tokens() # token_counts used later for statistics
combined_tokens = sum(token_counts.values())
print(f"Combined tokens: {combined_tokens}")

persona_text = ab.output()
bu.quickTXT(persona_text, filename=f"ignorefolder/pt_{bu.get_timestamp()}")



Read 1946 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-13 to 2024-03-06
Messages saved to self.chats['elias']
Read 40036 messages from 5 files. Failed to read 0 messages.
Messages ranged from 2020-08-17 to 2024-03-04
Messages saved to self.chats['petyo']
Read 7953 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2023-05-12 to 2024-03-04
Messages saved to self.chats['anna']
Read 5734 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2020-09-18 to 2024-03-02
Messages saved to self.chats['patryk']
Read 372 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2022-03-19 to 2024-02-24
Messages saved to self.chats['andreas']
Read 3399 messages from 2 files. Failed to read 0 messages.
Messages ranged from 2021-08-23 to 2024-03-02
Messages saved to self.chats['victoria']
Read 2951 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2022-11-13 to 2024-02-19
Messages saved to self.ch

### Debug

In [6]:
# Debug: inspect the token count of the single "petyo" chat.
ab.count_chat_tokens("petyo")

Chat petyo has 169785 (12332 messages)


## Survey

In [28]:
# Reload the local survey module so edits to it are picked up without a kernel
# restart, then create the survey object (`surv`) that the cells below read.
import survey
importlib.reload(survey)
surv = survey.KanoSurvey()

Using default Kano Survey CSV file: surveys/survey_kano-model.csv


## Embedding

### Debug

Stored 'texts_with_elias' (list)


### Chunking

In [17]:
# PARAMETERS
# The chunking loop steps by CHUNK_SIZE - OVERLAP_SIZE, so OVERLAP_SIZE must stay
# strictly smaller than CHUNK_SIZE (a step of 0 makes range() raise).
EMBED_MODEL = "nomic-embed-text"        # nomic-embed-text = long ctx / mxbai-embed-large = big
CHUNK_SIZE = 30                         # Number of messages per chunk
OVERLAP_SIZE = 10                       # Number of overlapping messages between consecutive chunks

In [18]:
import ollama
import numpy as np
from numpy.linalg import norm

# Split every selected chat into overlapping windows of CHUNK_SIZE messages.
# Consecutive windows start CHUNK_SIZE - OVERLAP_SIZE messages apart, so each
# window shares OVERLAP_SIZE messages with the previous one.
chunks = []
stat_total_msgs_in_chunks = 0 # for statistics

# Iterate over chats and messages to create chunks
for chat in ab.selectedChats.values():
    messages = list(chat)  # Convert chat iterator to list for easier slicing
    num_messages = len(messages)

    # NOTE(review): only full windows are emitted, so a chat shorter than
    # CHUNK_SIZE produces no chunk and a trailing partial window is dropped.
    # Confirm this is intended before relying on full coverage of the input.
    for i in range(0, num_messages - CHUNK_SIZE + 1, CHUNK_SIZE - OVERLAP_SIZE):
        chunk = messages[i:i + CHUNK_SIZE]  # Extract chunk of messages
        chunk_text = "\n".join(str(msg) for msg in chunk)  # Concatenate messages into a single string
        chunks.append(chunk_text)  # Append chunk to list of chunks

        stat_total_msgs_in_chunks += len(chunk) # For statistics


##### Display Info
total_messages = sum(len(chat) for chat in ab.selectedChats.values())
chunks_count = len(chunks)

# Fail with a readable message instead of the opaque "cannot convert float NaN
# to integer" that round(np.mean([])) would raise further down.
if not chunks:
    raise ValueError(
        f"No chunks were created: no selected chat has at least CHUNK_SIZE={CHUNK_SIZE} messages."
    )

avg_chunk_char_len = np.mean([len(chunk) for chunk in chunks])

print(
    f"Chunk count: {chunks_count}",
    f"Average chunk character length: {round( avg_chunk_char_len)}",
    f"Rough estimate of tokens per chunk: {round(avg_chunk_char_len / 4)} (4 characters per token)",
    f"Messagees in input count: {total_messages}",
    f"Messages in chunks count: {stat_total_msgs_in_chunks}",
    # The backslash is escaped: a bare "\ " is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The printed text is unchanged.
    f"Chunk \\ Input ratio: {round(stat_total_msgs_in_chunks / total_messages,2)} (OVERLAP_SIZE={OVERLAP_SIZE})",
    f"Chunk Python type: {type(chunks[0])}",
    sep="\n"
)

Chunk count: 1112
Average chunk character length: 1585
Rough estimate of tokens per chunk: 396 (4 characters per token)
Messagees in input count: 22561
Messages in chunks count: 33360
Chunk \ Input ratio: 1.48 (OVERLAP_SIZE=10)
Chunk Python type: <class 'str'>


### Generate Embeddings 

In [140]:
########### Serialization ########
EMBEDDING_NAMEID = "test1"
# Metadata describing how this embedding set was produced; saved next to the vectors.
AUTO_INFO = dict(
    model=EMBED_MODEL,
    CHUNK_SIZE=CHUNK_SIZE,
    OVERLAP_SIZE=OVERLAP_SIZE,
    chunks_count=chunks_count,
    total_messages=total_messages,
    stat_total_msgs_in_chunks=stat_total_msgs_in_chunks,
    modules_chat=token_counts,
)
##################################

# Embed every chunk, one request per chunk, keeping the same order as `chunks`.
embeddings = []

progress, chunks_len = 0, len(chunks)  # progress is the 1-based position shown to the user
for progress, chunk_text in enumerate(chunks, start=1):
    print(f"Chunk {progress}/{chunks_len}")

    response = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)
    embedding = response["embedding"]
    embeddings.append(embedding)


# Report the totals and persist both the metadata and the chunk/embedding pairs.
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

bu.if_dir_not_exist_make("embeddings")
info_file = f"embeddings/{EMBEDDING_NAMEID}_info.json"
embeddings_file = f"embeddings/{EMBEDDING_NAMEID}_embeddings.json"
bu.quickJSON(AUTO_INFO, info_file)
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, embeddings_file)

Chunks:1112, embeds:1112


### Load Embeddings From File (optional)

In [21]:
EMBEDDING_NAMEID = "test1"

import json

# Restore a previously saved chunk/embedding set instead of re-embedding.
embeddings_file = f"embeddings/{EMBEDDING_NAMEID}_embeddings.json"
with open(embeddings_file, "r") as f:
    data = json.load(f)

chunks = data["chunks"]
embeddings = data["embeddings"]

print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

Chunks:1112, embeds:1112


#### debug

## Retrieval

### Static

In [22]:
RETRIEVAL_PROMPT = "video game features"
# RETRIEVAL_PROMPT = "personality"
CHUNKS_COUNT_IN_CTX = 35 # Number of nearby chunks to put in context window
# COMMENT 04-16, perhaps we could try 5x retrievals with isolated semantics

# Embed the retrieval prompt and keep the CHUNKS_COUNT_IN_CTX best matches.
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings = find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]

# Element 1 of each match is used as the position of the chunk in `chunks`.
chunks_most_similar = [chunks[match[1]] for match in chunks_most_similar_embeddings]

# Total token count over all retrieved chunks
tokens_in_chunks = sum(utils.count_tokens(chunk_text) for chunk_text in chunks_most_similar)
print(f"Tokens in chunks: {tokens_in_chunks}")

retrieved_text = "\n\n".join(chunks_most_similar)
bu.quickTXT(retrieved_text, filename="ignorefolder/chunks.txt")



Tokens in chunks: 13347


### Dynamic

##### (dynamic, same as question)

In [26]:
# Use each survey question itself as the retrieval prompt (one retrieval per question).
dynamic_retrieval_prompts = list(surv.questions)

#### (dynamic, custom)

In [None]:
# TODO

#### Run dynamic retrieval

In [32]:
CHUNKS_COUNT_IN_CTX = 10 # Number of nearby chunks to put in context window
# One list of retrieved chunks per retrieval prompt, in prompt order.
dynamic_chunks_most_similar: List[List[str]] = []

progress = 0
lenn = len(dynamic_retrieval_prompts)
for progress, prompt in enumerate(dynamic_retrieval_prompts, start=1):
    print(f"\rPrompt {progress}/{lenn}", end="")  # \r keeps the counter on one line

    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
    # Element 1 of each match is used as the position of the chunk in `chunks`.
    chunks_most_similar = [chunks[match[1]] for match in chunks_most_similar_embeddings]

    dynamic_chunks_most_similar.append(chunks_most_similar)
print(end="\n")

# VANITY PRINT
tokens_in_chunks = sum(
    utils.count_tokens(chunk_text)
    for chunk_group in dynamic_chunks_most_similar
    for chunk_text in chunk_group
)

print(f"Tokens in average chunk group: {tokens_in_chunks/len(dynamic_chunks_most_similar)}")
bu.quickJSON(dynamic_chunks_most_similar, filename="ignorefolder/dynamic-chunks.json")

Prompt 40/40
Tokens in average chunk group: 4521.775


#### Preview generated dynamic chunks (optional)

In [37]:
preview_text = ""
PREVIEW_LIMIT = 10  # Maximum number of chunks shown per prompt

# Collect the pieces in a list and join once at the end.
preview_parts = []
for i, chunks_most_similar in enumerate(dynamic_chunks_most_similar):
    preview_parts.append(f"==============Prompt: {dynamic_retrieval_prompts[i]}==============\n")
    for j, chunk in enumerate(chunks_most_similar):
        if j >= PREVIEW_LIMIT:
            break
        preview_parts.append(f"=======CHUNK {j}=======\n{chunk}\n\n")
    preview_parts.append("\n\n")

preview_text = "".join(preview_parts)
bu.quickTXT(preview_text, filename="ignorefolder/dynamic-chunks_preview.txt")

## Survey Simulation

### Prompt Builder - Persona - Dynamic

In [43]:
final_prompts = []

# Pieces of the prompt that are identical for every question.
persona_system_intro = "You are specialized in impersonating people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit tastes by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. Text below:"
persona_affirmation = 'I will answer from the point of view of the persona, based on what I could the deduct from the text provided.'
persona_survey_instruction = f"Persona is surveyed about their video game survey. The persona must choose answer the question below with one of the given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option. "

# One prompt per question, each with the chunks retrieved for that question.
for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    conversations = "\nNEW CONVERSATION:\n".join(chunks_most_similar)
    survey_request = "\n".join([
        persona_survey_instruction,
        question,
        "The persona chooses: "
    ])
    p = [
        systemMsg(
            persona_system_intro,
            "Conversations between persona and friends",
            conversations
        ),
        # Understanding affirmation
        assistantMsg(persona_affirmation),
        # Survey question. With Simulation
        userMsg(survey_request),
    ]
    final_prompts.append(p)

prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, "ignorefolder/prompts.json")

Created 40 prompts.
Average prompt size: 4749 tokens.
Min prompt size: 3798, Max prompt size: 6145


### Prompt Builder - Persona

In [34]:
# Builds one prompt per survey question; every prompt shares the SAME retrieved
# context (`chunks_most_similar`) and differs only in the question.
#
# NOTE(review): `chunks_most_similar` is not defined in this cell. It is whatever
# the kernel last bound to that name: the static retrieval cell assigns it, but
# the dynamic retrieval, preview and dynamic prompt-builder cells also rebind it
# as a loop variable. Re-run the static retrieval cell immediately before this
# one, otherwise the context silently becomes the last dynamic chunk group.
final_prompts = []

for question in surv.questions:
    p = [
        systemMsg(
            "You are specialized in impersonating people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit tastes by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. Text below:",
            "Conversations between persona and friends",
            "\nNEW CONVERSATION:\n".join(chunks_most_similar)
        ),
        # Understanding affirmation
        assistantMsg('I will answer from the point of view of the persona, based on what I could the deduct from the text provided.'),
        # Survey question. With Simulation
        userMsg("\n".join([
            f"Persona is surveyed about their video game survey. The persona must choose answer the question below with one of the given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option. ",
            question,
            "The persona chooses: "
        ])),
    ]
    final_prompts.append(p)

prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, "ignorefolder/prompts.json")

Created 50 prompts.
Average prompt size: 13706 tokens.
Min prompt size: 13703, Max prompt size: 13711


### Prompt Builder - Base case

In [99]:
final_prompts = []

# Base case: no persona context at all, only the survey instructions and the question.
base_system_intro = "You are participating in a survey. You will be presented with a series of questions about your video game preferrences."
base_answer_instruction = f"You must choose answer to the question below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. The answer must only contain the chosen option. "
base_affirmation = 'Understood. I will answer the question below with one of the given options.'

for question in surv.questions:
    p = [
        systemMsg(base_system_intro, base_answer_instruction),
        # Understanding affirmation
        assistantMsg(base_affirmation),
        # Survey question. With Simulation
        userMsg(question, "Your choice: "),
    ]
    final_prompts.append(p)

prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, "ignorefolder/prompts.json")

Created 40 prompts.
Average prompt size: 110 tokens.
Min prompt size: 105, Max prompt size: 118


###### Unused / Debug

In [11]:
# Unused / debug: overwrites the `persona_text` built in the Persona section with a
# short hand-written description. Do not run this cell if the real export is needed.
persona_text = "Favorite video games are Rimworld, Minecraft, Age of Empires, 7 Days to Die"

### Run Survey Simulation

In [60]:
# Run Simulation
# Sends every prompt in `final_prompts` to the chat-completion API, records the
# answer next to its survey question, and saves the answers (CSV), the run
# settings/metadata (JSON) and the exact prompts (JSON).
##################################
SIMULATION_NAMEID = "video-game_airidas_dynamic_3"
LIMIT = None # For testing purposes: maximum number of prompts to send. Set to None to run all
AUTO_INFO = {
    "date": bu.get_timestamp(),
    "EMBEDDING_NAMEID": EMBEDDING_NAMEID,
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "survey_type": str(type(surv)),
    "prompt_count": min(len(final_prompts), LIMIT) if LIMIT is not None else len(final_prompts),
    "avg_tokens_in_prompt": round(prompt_info["total_all_prompt_tokens"]/len(final_prompts)),
}
SETTINGS = {
    "model": "gpt-3.5-turbo",
    "temperature": 0.5,
}

##################################

import pandas as pd
from openai import OpenAI
import os

# Output locations. The directories are created up front so that a missing
# folder is discovered before any paid API call is made, not after.
simulation_csv_path = f"simulations/{SIMULATION_NAMEID}_simulation.csv"
simulation_info_path = f"simulations/{SIMULATION_NAMEID}_info.json"
simulation_prompts_path = f"ignorefolder/simulations/{SIMULATION_NAMEID}_prompts.json"
for out_path in (simulation_csv_path, simulation_info_path, simulation_prompts_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

completions = []
l = len(final_prompts)

for i, (prompt, question) in enumerate(zip(final_prompts, surv.questions)):
    # `i` is 0-based, so stopping at i >= LIMIT sends exactly LIMIT prompts,
    # which is what AUTO_INFO["prompt_count"] records (i > LIMIT sent one extra).
    if LIMIT is not None and i >= LIMIT:
        break

    print(f"{i}/{l}...", end="\t") # Print progress
    # Send the Request
    full_response = client.chat.completions.create(
        messages=prompt,
        model=SETTINGS["model"],
        temperature=SETTINGS["temperature"],
    )
    r = full_response.choices[0].message.content

    completions.append({'question': question, 'answer': r})

    print(f"{question}: {r}")

# Save results
df = pd.DataFrame(completions)
df.to_csv(simulation_csv_path, index=False)
bu.quickJSON({"settings": SETTINGS, "info": AUTO_INFO}, simulation_info_path)
bu.quickJSON(final_prompts, simulation_prompts_path)

0/40...	

What would you say if there were options to design your own avatar?: I LIKE IT
1/40...	What would you say if there were NO options to design your own avatar?: I EXPECT IT
2/40...	What would you say if the game had the option to save the game at any time?: I LIKE IT
3/40...	What would you say if the game did NOT have the option to save the game at any time?: I CAN TOLERATE IT
4/40...	What would you say if the game has good graphics?: I LIKE IT
5/40...	What would you say if the game had NO good graphics, or rather poor graphics?: I CAN TOLERATE IT
6/40...	What would you say if the game had an exciting storyline?: I LIKE IT
7/40...	What would you say if the game did NOT have an exciting storyline?: I CAN TOLERATE IT
8/40...	What would you say if there were rewards such as extra points, in-game currency or coins in the game?: I LIKE IT
9/40...	What would you say if there were NO rewards such as extra points, in-game currency or coins in the game?: I EXPECT IT
10/40...	What would you say if

#### Debug

## Analysis - General

### Load Simulation File (optional)

In [2]:
import pandas as pd
import json
SIMULATION_NAMEID = "local/run2-airidas-personality_mixtral_cv1"

# Load a previously saved simulation: the answers (CSV) and its metadata (JSON).
df = pd.read_csv(f'simulations/{SIMULATION_NAMEID}_simulation.csv')
with open(f'simulations/{SIMULATION_NAMEID}_info.json', 'r') as f:
    loaded = json.load(f)

# Best effort: older info files may lack the "info"/"settings" keys or have a
# different shape. Only those cases are tolerated; anything else (including
# KeyboardInterrupt) propagates instead of being swallowed by a bare except.
try:
    AUTO_INFO = loaded["info"]
    SETTINGS = loaded["settings"]
    print("Settings and info loaded:")
    for k, v in AUTO_INFO.items():
        print(f"{k}: {v}")
    for k, v in SETTINGS.items():
        print(f"{k}: {v}")
except (KeyError, TypeError, AttributeError):
    print("No settings and/or info found")


# Warn when the survey object in the kernel does not match the loaded run.
# NameError: `surv` (or AUTO_INFO) is not defined in this kernel;
# KeyError/TypeError: the info file carries no usable "survey_type".
try:
    if str(type(surv)) != AUTO_INFO["survey_type"]:
        print(f"WARNING: surv variable is not of the same type. {str(type(surv))} != {AUTO_INFO['survey_type']}")
except (NameError, KeyError, TypeError):
    pass

df.head(n=10)

Settings and info loaded:
date: 2024-04-18_205950
model: mixtral


Unnamed: 0,question,answer
0,I am the life of the party.,SOMEWHAT DISAGREE
1,I don't talk a lot.,SOMEWHAT DISAGREE
2,I feel comfortable around people.,AGREE
3,I keep in the background.,SOMEWHAT DISAGREE
4,I start conversations.,SOMEWHAT AGREE
5,I have little to say.,NEUTRAL
6,I talk to a lot of different people at parties.,SOMEWHAT AGREE
7,I don't like to draw attention to myself.,NEUTRAL
8,I don't mind being the center of attention.,SOMEWHAT DISAGREE
9,I am quiet around strangers.,SOMEWHAT DISAGREE


### Cleanup

#### For GPT3.5

In [61]:
def _strip_blacklisted(answer):
    """Remove every blacklisted pattern from one answer, in blacklist order."""
    for pattern in utils.BLACKLIST_ANSWER_SUBSTRINGS:
        answer = re.sub(pattern, "", answer)
    return answer


# Clean the raw answers, normalise their case and check them against the survey's options.
df['answer'] = df['answer'].apply(_strip_blacklisted)
df['answer'] = df['answer'].str.upper()

# Update isValid
df['isValid'] = df['answer'].map(lambda ans: ans in surv.POSSIBLE_ANSWERS)

# Keep the isValid column only when something is invalid, so the bad rows can be inspected.
if not df['isValid'].all():
    print("Some answers were not valid")
else:
    df = df.drop(columns='isValid')
    print("All answers were valid")

df.head(n=10)

All answers were valid


Unnamed: 0,question,answer
0,What would you say if there were options to de...,I LIKE IT
1,What would you say if there were NO options to...,I EXPECT IT
2,What would you say if the game had the option ...,I LIKE IT
3,What would you say if the game did NOT have th...,I CAN TOLERATE IT
4,What would you say if the game has good graphics?,I LIKE IT
5,What would you say if the game had NO good gra...,I CAN TOLERATE IT
6,What would you say if the game had an exciting...,I LIKE IT
7,What would you say if the game did NOT have an...,I CAN TOLERATE IT
8,What would you say if there were rewards such ...,I LIKE IT
9,What would you say if there were NO rewards su...,I EXPECT IT


#### For Local Models

In [29]:
# Compare the loaded simulation's question column against the current survey object.
survey.validateMatchingSurvey(surv, df["question"])



All questions are the same


In [None]:
# Rename columns. Assumes df has exactly two columns at this point.
# NOTE(review): this replaces the first column's name with 'real'; if that column
# is 'question', later cells that read df["question"] will fail — confirm intended.
df.columns = ['real', 'answer']

In [7]:
def _strip_blacklisted(answer):
    """Remove every blacklisted pattern from one answer, in blacklist order."""
    for pattern in utils.BLACKLIST_ANSWER_SUBSTRINGS:
        answer = re.sub(pattern, "", answer)
    return answer


# Clean the raw answers, normalise their case and check them against the survey's options.
df['answer'] = df['answer'].apply(_strip_blacklisted)
df['answer'] = df['answer'].str.upper()

# Update isValid
df['isValid'] = df['answer'].map(lambda ans: ans in surv.POSSIBLE_ANSWERS)

# Keep the isValid column only when something is invalid, so the bad rows can be inspected.
if not df['isValid'].all():
    print("Some answers were not valid")
else:
    df = df.drop(columns='isValid')
    print("All answers were valid")

df.head(n=10)

NameError: name 'surv' is not defined

### Process simulation output - KANO

In [62]:
# Put the real answers of airidas and elias next to the simulated ones.
# The reference lists are cut to the number of simulated rows.
n_rows = len(df)
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
for position, (column, answers) in enumerate((("airidas", air), ("elias", eli)), start=2):
    df.insert(position, column, answers[:n_rows])

# Normalise case so simulated and real answers compare equal.
for column in ("answer", "airidas", "elias"):
    df[column] = df[column].str.upper()
df.head(n=10)

Unnamed: 0,question,answer,airidas,elias
0,What would you say if there were options to de...,I LIKE IT,I AM NEUTRAL,I AM NEUTRAL
1,What would you say if there were NO options to...,I EXPECT IT,I CAN TOLERATE IT,I EXPECT IT
2,What would you say if the game had the option ...,I LIKE IT,I EXPECT IT,I EXPECT IT
3,What would you say if the game did NOT have th...,I CAN TOLERATE IT,I DISLIKE IT,I CAN TOLERATE IT
4,What would you say if the game has good graphics?,I LIKE IT,I LIKE IT,I LIKE IT
5,What would you say if the game had NO good gra...,I CAN TOLERATE IT,I AM NEUTRAL,I CAN TOLERATE IT
6,What would you say if the game had an exciting...,I LIKE IT,I LIKE IT,I LIKE IT
7,What would you say if the game did NOT have an...,I CAN TOLERATE IT,I CAN TOLERATE IT,I DISLIKE IT
8,What would you say if there were rewards such ...,I LIKE IT,I LIKE IT,I DISLIKE IT
9,What would you say if there were NO rewards su...,I EXPECT IT,I DISLIKE IT,I LIKE IT


#### Process simulation output - PERSONALITY

In [94]:
# Put the real answers of airidas and elias next to the simulated ones.
# The reference lists are cut to the number of simulated rows.
n_rows = len(df)
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
for position, (column, answers) in enumerate((("airidas", air), ("elias", eli)), start=2):
    df.insert(position, column, answers[:n_rows])
df.head(n=10)

Unnamed: 0,question,answer,airidas,elias
0,I am the life of the party.,AGREE,3,3
1,I don't talk a lot.,SOMEWHAT AGREE,2,2
2,I feel comfortable around people.,AGREE,4,4
3,I keep in the background.,SOMEWHAT AGREE,2,4
4,I start conversations.,AGREE,4,3
5,I have little to say.,DISAGREE,1,1
6,I talk to a lot of different people at parties.,AGREE,5,4
7,I don't like to draw attention to myself.,SOMEWHAT AGREE,2,3
8,I don't mind being the center of attention.,SOMEWHAT DISAGREE,4,1
9,I am quiet around strangers.,AGREE,1,4


### Remaps - UNIVERSAL

In [63]:
# Convert textual answers to a 1-5 scale. Which columns need converting depends
# on the survey type: Kano maps all three answer columns, Personality only the
# simulated 'answer' column.
if isinstance(surv, survey.KanoSurvey):
    remap_dict = {"I EXPECT IT": 5, "I LIKE IT": 4, "I AM NEUTRAL": 3, "I CAN TOLERATE IT": 2, "I DISLIKE IT": 1}
    columns_to_remap = ['answer', 'airidas', 'elias']
elif isinstance(surv, survey.PersonalitySurvey):
    remap_dict = {"AGREE": 5, "SOMEWHAT AGREE": 4, "NEUTRAL": 3, "SOMEWHAT DISAGREE": 2, "DISAGREE": 1}
    columns_to_remap = ['answer']
else:
    columns_to_remap = []

for column in columns_to_remap:
    df[column] = df[column].map(remap_dict)

df.head()

Unnamed: 0,question,answer,airidas,elias
0,What would you say if there were options to de...,4,3,3
1,What would you say if there were NO options to...,5,2,5
2,What would you say if the game had the option ...,4,5,5
3,What would you say if the game did NOT have th...,2,1,2
4,What would you say if the game has good graphics?,4,4,4


In [64]:
# For each reference person, compute the share of exactly matching answers and
# the correlation between the simulated and the real answers.
result_data = {}
for reference, suffix in (("airidas", ""), ("elias", " - elias")):
    result_data[f"Exact Matches{suffix}"] = (df['answer'] == df[reference]).sum() / len(df)
    result_data[f"Correlation{suffix}"] = df['answer'].corr(df[reference])

for metric, value in result_data.items():
    print(f"{metric}: {value}")


Exact Matches: 0.45
Correlation: 0.3496253080715016
Exact Matches - elias: 0.4
Correlation - elias: 0.2711680683547036


#### Load Results CSV

In [1]:
# Open the running results table. Requires the setup cell (which imports `bu`)
# to have been run in this kernel first.
bu.if_dir_not_exist_make("results")
res = bu.LiveCSV("results/airidas_results.csv")

NameError: name 'bu' is not defined

In [65]:
# Collect the metrics of the current simulation into one record and append it
# to the results table opened in the previous cell (`res`).
new_res = {
    # "label": None,
    "SIMULATION_NAMEID": SIMULATION_NAMEID,
    "timestamp": bu.get_timestamp(),
    "survey_type": str(type(surv)),
    "temperature": SETTINGS["temperature"],
    # "note": "",
    "exact_matches": result_data["Exact Matches"],
    "corr": result_data["Correlation"],
    "exact_matches_elias": result_data["Exact Matches - elias"],
    "corr_elias": result_data["Correlation - elias"],
}

# NOTE(review): presumably tmp[0] is the header row and tmp[1] the data rows —
# confirm against brikasutils.convert_dicts_to_table / LiveCSV.append_data.
tmp = bu.convert_dicts_to_table([new_res])
res.append_data(tmp[1], tmp[0])

brikasutils.quickCSV: Saved 16 as results/airidas_results.csv


numpy.float64

In [50]:
# Debug: display the current simulation DataFrame.
df

Unnamed: 0,question,answer,airidas,elias
0,What would you say if there were options to de...,4,3,3
1,What would you say if there were NO options to...,5,2,5
2,What would you say if the game had the option ...,4,5,5
3,What would you say if the game did NOT have th...,4,1,2
4,What would you say if the game has good graphics?,4,4,4
5,What would you say if the game had NO good gra...,2,3,2
6,What would you say if the game had an exciting...,4,4,4
7,What would you say if the game did NOT have an...,5,2,1
8,What would you say if there were rewards such ...,5,4,1
9,What would you say if there were NO rewards su...,4,1,4


In [44]:
# Debug: check column dtypes and non-null counts after the remap.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  40 non-null     object
 1   answer    40 non-null     int64 
 2   airidas   40 non-null     int64 
 3   elias     40 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 1.4+ KB


## OLD RAG