## Initial setup

In [93]:
import importlib
import brikasutils as bu
importlib.reload(bu)
import shared_utils as utils
from shared_utils import systemMsg, userMsg, assistantMsg
importlib.reload(utils)
import survey
importlib.reload(survey)
import persona
importlib.reload(persona)

import ollama
import numpy as np
from numpy.linalg import norm
import pandas as pd
import os
import re
import time
import json
from typing import List
from openai import OpenAI

## Persona

In [95]:
et = persona.PersonaEncoder()

# ==== FB messages ====
et.parse_fb_messages(["data/1_raw/1_airidas.json"], "airidas")
et.parse_fb_messages(["data/1_raw/2_christian.json"], "christian")
et.parse_fb_messages(["data/1_raw/1_nikolay.json"], "nikolay")
et.parse_fb_messages(["data/1_raw/2_mathis.json"], "mathis")
et.parse_fb_messages(["data/1_raw/2_jacob.json"], "jacob")
et.parse_fb_messages(["data/1_raw/2_chris.json"], "chris")
et.parse_fb_messages(["data/1_raw/3_aziz.json"], "aziz")
et.parse_fb_messages(["data/1_raw/3_daniela.json"], "daniela")
et.parse_fb_messages(["data/1_raw/3_mihi.json"], "mihi")
et.parse_fb_messages(["data/1_raw/3_viktoria.json"], "viktoria")
et.parse_fb_messages(["data/1_raw/4_diba.json"], "diba")
et.parse_fb_messages(["data/1_raw/6_filip.json"], "filip")
et.parse_wa_messages(["data/1_raw/messages_1000.json"], "rebecca")
# for name, texts in texts_with_others_dict.items():
#     et.parse_fb_messages(texts, name)

# Regex cleaning
et.filter_chats_empty()
et.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Compress names
for nameid, chat in et.chats.items():
    for msg in chat:  
        msg.sender = "Persona" if msg.sender == "Elias Salvador Smidt Torjani"  else "Friend"

# Start all chats from 2/3rds
# for name, chat in et.chats.items():
#     et.chats[name] = chat[int(len(chat)/3 * 2):]
# Select the final modules
# et.select_chat_limited_by_tokens("airidas", 10000)
et.select_chat_full("rebecca")
et.select_chat_full("airidas")
et.select_chat_full("christian")
et.select_chat_full("nikolay")
et.select_chat_full("mathis") 
et.select_chat_full("daniela")
et.select_chat_full("diba")
et.select_chat_full("aziz")
et.select_chat_full("jacob")  
et.select_chat_full("chris")
et.select_chat_full("filip")
et.select_chat_full("mihi")
et.select_chat_full("viktoria")

# save
BIG_MODULE=et.output()
bu.quickTXT(BIG_MODULE, filename=f"data/2_modules/big_{bu.get_timestamp()}")

# stats
token_counts = et.count_all_selected_chat_tokens() # token_counts used later for statistics
print(f"Combined tokens: {sum(token_counts.values())}")
# utils.count_tokens(BIG_MODULE) 
# or list(et.selectedChats.keys()) --> et.count_chat_tokens("{friend}")
# et.selectedChats["{friend}"][:5]

Read 1916 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-13 to 2024-03-06
Messages saved to self.chats['airidas']
Read 618 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-10 to 2024-03-03
Messages saved to self.chats['christian']
Read 297 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2018-07-25 to 2024-01-01
Messages saved to self.chats['nikolay']
Read 144 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-12-30
Messages saved to self.chats['mathis']
Read 104 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-08-25 to 2024-03-05
Messages saved to self.chats['jacob']
Read 159 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-10-12 to 2023-04-30
Messages saved to self.chats['chris']
Read 161 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-06-06
Messages saved to self.chat

## Survey

In [119]:
# surv = survey.PersonalitySurvey()
surv = survey.KanoSurvey()
# surv = survey.buildFairnessPrompts()
# surv = survey.DictatorGameSurvey()
surv.questions[:2]#.head()

Using default Kano Survey CSV file: surveys/survey_kano-model.csv


4    What would you say if there were options to de...
5    What would you say if there were NO options to...
Name: question, dtype: object

## Embedding

### Chunking

In [120]:
# PARAMETERS
EMBED_MODEL = "nomic-embed-text"        # nomic-embed-text = long ctx / mxbai-embed-large = big
CHUNK_SIZE = 50                         # Number of messages per chunk
OVERLAP_SIZE = 10                       # Number of overlapping messages between consecutive chunks
# COMMENT 04-16, perhaps we could try 5x retrievals with isolated semantics

In [121]:
# Initialize lists for storing chunks – and embeddings later
# different chunk size
chunks = []
stat_total_msgs_in_chunks = 0 # for statistics

# different chunk size
# Iterate over chats and messages to create chunks
for chat in et.selectedChats.values():
    messages = list(chat)  # Convert chat iterator to list for easier slicing
    num_messages = len(messages)

    # Create overlapping chunks of messages
    for i in range(0, num_messages - CHUNK_SIZE + 1, CHUNK_SIZE - OVERLAP_SIZE):
        chunk = messages[i:i + CHUNK_SIZE]  # Extract chunk of messages
        chunk_text = "\n".join(str(msg) for msg in chunk)  # Concatenate messages into a single string
        chunks.append(chunk_text)  # Append chunk to list of chunks

        stat_total_msgs_in_chunks += len(chunk) # For statistics

##### Display Info
total_messages = sum(len(chat) for chat in et.selectedChats.values())
chunks_count = len(chunks)
avg_chunk_char_len = np.mean([len(chunk) for chunk in chunks])

print(
    f"Chunk count: {chunks_count}",
    f"Average chunk character length: {round( avg_chunk_char_len)}",
    f"Rough estimate of tokens per chunk: {round(avg_chunk_char_len / 4)} (4 characters per token)",
    f"Messagees in input count: {total_messages}",
    f"Messages in chunks count: {stat_total_msgs_in_chunks}",
    f"Chunk \ Input ratio: {round(stat_total_msgs_in_chunks / total_messages,2)} (OVERLAP_SIZE={OVERLAP_SIZE})",
    f"Chunk Python type: {type(chunks[0])}",
    sep="\n"
) 

Chunk count: 104
Average chunk character length: 2527
Rough estimate of tokens per chunk: 632 (4 characters per token)
Messagees in input count: 4465
Messages in chunks count: 5200
Chunk \ Input ratio: 1.16 (OVERLAP_SIZE=10)
Chunk Python type: <class 'str'>


### Generaterating embeddings

In [122]:
########### Serialization ########
EMBEDDING_NAMEID = "game_batch-B"
AUTO_INFO = {
    "model": EMBED_MODEL,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "chunks_count": chunks_count,
    "total_messages": total_messages,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
}
##################################

In [123]:
# Generate embeddings for each chunk
embeddings = []

progress, chunks_len = 0, len(chunks) # for progress bar
for chunk_text in chunks:
    progress += 1
    print(f"\rChunk {progress}/{chunks_len}", end="")

    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)

####################################################
# token counts in all similar chunks
# tokens_in_chunks = 0
# for chunk in chunks_most_similar:
#     tokens_in_chunks += utils.count_tokens(chunk)
# print(f"Tokens in chunks: {tokens_in_chunks}")

bu.if_dir_not_exist_make("data/3_embeddings")
bu.quickJSON(AUTO_INFO, f"data/3_embeddings/{EMBEDDING_NAMEID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/3_embeddings/{EMBEDDING_NAMEID}_embeddings.json")

Chunk 104/104

## Retrieval

In [124]:
VERSION_ID = "8k-dynamic"

subject = "elias"

# persona_small = "SMALL_MODULE"
# persona_med = "{MED_MODULE}"
# persona_text = "Favorite video games are Rimworld, Minecraft, Age of Empires, 7 Days to Die"

# Change below accoring to survey above
# RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism" #"personality"
RETRIEVAL_PROMPT = "video game features"
# q_retrival_prompt =
# SURVEY_PROMPT = "Determine how much {subject} aggree with the statement. Guestimate how {subject} would answer to the question"

CHUNKS_COUNT_IN_CTX = 10 # Number of nearby chunks to put in context window

### Static

In [76]:
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings  = utils.find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
chunks_most_similar = []
for embedding in chunks_most_similar_embeddings:
    chunks_most_similar.append(chunks[embedding[1]])

# token counts in all similar chunks
tokens_in_chunks = 0
for chunk in chunks_most_similar:
    tokens_in_chunks += utils.count_tokens(chunk)
print(f"Tokens in chunks: {tokens_in_chunks}")
####################################################
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")


# Display results
bu.quickTXT("\n\n".join(chunks_most_similar), filename=f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}_chunks.txt")

bu.if_dir_not_exist_make("data/4_chunks")
bu.quickJSON(AUTO_INFO, f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}_embeddings.json")

Tokens in chunks: 5873
Chunks:104, embeds:104


### Dynamic

In [129]:
dynamic_retrieval_prompts = list(surv.questions)

CHUNKS_COUNT_IN_CTX = 5 # Number of nearby chunks to put in context window
dynamic_chunks_most_similar: List[List[str]] = [] 

progress = 0
lenn = len(dynamic_retrieval_prompts)
for prompt in dynamic_retrieval_prompts:
    progress += 1
    print(f"\rPrompt {progress}/{lenn}", end="")

    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
    chunks_most_similar = []
    for embedding in chunks_most_similar_embeddings:
        chunks_most_similar.append(chunks[embedding[1]])

    dynamic_chunks_most_similar.append(chunks_most_similar)
print(end="\n")
    
# VANITY PRINT
tokens_in_chunks = 0
for chunks_most_similar in dynamic_chunks_most_similar:
    for chunk in chunks_most_similar:
        tokens_in_chunks += utils.count_tokens(chunk)

del chunks_most_similar_embeddings # free memory
print(f"Tokens in average chunk group: {tokens_in_chunks/len(dynamic_chunks_most_similar)}")
bu.quickJSON(dynamic_chunks_most_similar, filename=f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}-chunks.json")
###########################################
# Vanity preview
preview_text = ""
PREVIEW_LIMIT = 10

for i, chunks_most_similar in enumerate(dynamic_chunks_most_similar):
    preview_text += f"==============Prompt: {dynamic_retrieval_prompts[i]}==============\n"
    for j, chunk in enumerate(chunks_most_similar):
        if j >= PREVIEW_LIMIT:
            break
        preview_text += f"=======CHUNK {j}=======\n{chunk}\n\n"
    preview_text += "\n\n"
bu.quickTXT(preview_text, filename=f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}-chunks")

Prompt 40/40
Tokens in average chunk group: 3169.925


## Prompt Builder

In [None]:
# You are {subject} vs you will impersonate {subject}
SYS_MSG = {
    "role": "system", 
    "content": "You are an actor specializing in impersonating non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit personality traits by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. The persona you will be impersonating is named Elias. Context:"
    }

ASSIST_MSG = {
    "role": "assistant",
    "content": ""
}

USER_MSG = {
    "role": "user",
    "content": ""
}

### Dynamic persona

In [127]:
final_prompts = []

for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg(
            "You are specialized in impersonating people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit tastes by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. Text below:",
            "Conversations between persona and friends",
            "\nNEW CONVERSATION:\n".join(chunks_most_similar)
        ),
        # Understanding affirmation
        assistantMsg('I will answer from the point of view of the persona, based on what I could the deduct from the text provided.'),
        # Survey question. With Simulation
        userMsg("\n".join([
            f"Persona is surveyed about their video game survey. The persona must choose answer the question below with one of the given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction. ",
            question,
            "Persona chooses: "
        ])),
        # assistantMsg("\n".join([f"response: "
        # ])),
    ]
    final_prompts.append(p)

prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, f"data/5_prep/{EMBEDDING_NAMEID}_{VERSION_ID}_prompts.json")
print(f"{len(final_prompts)}")#,{final_prompts[:1]}")

Created 40 prompts.
Average prompt size: 3373 tokens.
Min prompt size: 2474, Max prompt size: 4757
40


### Static persona

In [7]:
SYS_PROMPT = {
    "role": "system", 
    "content": "You are an actor specializing in impersonating non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit personality traits by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. The persona you will be impersonating is named Elias. Context:"
    }

# Load Embeddings From File (optional)
EMBEDDING_NAMEID = "game_batch-A"
VERSION_ID = "8k-static"

import json
with open(f"data/4_chunks/{EMBEDDING_NAMEID}_{VERSION_ID}_embeddings.json", "r") as f:
    data = json.load(f)
    chunks = data["chunks"]
    embeddings = data["embeddings"]

print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

Chunks:104, embeds:104


In [9]:
final_prompts = []

for question in surv.questions:
    p = [
        systemMsg(SYS_PROMPT['content']+"\n## chat conversions between subject and friends\n".join(chunks_most_similar)),
        assistantMsg('Understood. I will answer from the point of view of the persona, {subject}, based on what I could the deduct from the text provided above.'),
        userMsg("\n".join([
            f'\n\n**Your answer should only contain the chosen option without further explanation!** Reply to the statement below - how {subject} would reply - with one of these five options: {", ".join(surv.POSSIBLE_ANSWERS)}.',
            question,
            "The persona chooses: "
        ])),
    ]
    final_prompts.append(p)

print(f"{len(final_prompts)}")#,{final_prompts[:1]}")
prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, f"data/5_prep/{EMBEDDING_NAMEID}_{VERSION_ID}_prompts.json")

NameError: name 'chunks_most_similar' is not defined

In [None]:
utils.describe_prompts_and_print(final_prompts)

### Base (no persona)

In [80]:
final_prompts = []

for question in surv.questions:
    p = [
        systemMsg(
            "You are participating in a survey. You will be presented with a series of questions about your video game preferrences.",
            f"You must choose answer to the question below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. The answer must only contain the chosen option. "
        ),
        # Understanding affirmation
        assistantMsg('Understood. I will answer the question below with one of the given options.'),
        # Survey question. With Simulation
        userMsg(
            question,
            "Your choice: "
        ),
    ]
    final_prompts.append(p)

prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, "data/5_prep/game_base_prompts.json")

Created 40 prompts.
Average prompt size: 110 tokens.
Min prompt size: 105, Max prompt size: 118


# Run simulation

In [None]:
# load json file to dict
# with open("simulations/toElias/run2-airidas-personality_cv1_prompts.json", "r") as read_file:
# with open("simulations/toElias/run2-airidas-video-game-cv1_prompts.json", "r") as read_file:
# with open("simulations/toElias/run2-base-personality-cv1_prompts.json", "r") as read_file:
with open("simulations/toElias/run2-base-video-game-cv1_prompts.json", "r") as read_file:
    pre_final_prompts = json.load(read_file)


# pre_final_prompts

In [None]:
# Run Simulation
SETTINGS = {
     "model": "command-r-plus:104b-q2_K", # mixtral, command-r-plus:104b-q2_K
     # "temperature": 0.5,
     # best wizard and mixtral try mixtral-8x22b wizard in uCloud
}

##################################
SIM_ID = f"run2-base-video-game_rplus_cv2"
LIMIT = None # For testing purposes. Set to NONE to run all
AUTO_INFO = {
    "date": bu.get_timestamp(),
    # "EMBEDDING_NAMEID": EMBEDDING_NAMEID,
    # "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    # "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    # "survey_type": str(type(surv)),
    # "prompt_count": min(len(final_prompts), LIMIT) if LIMIT != None else len(final_prompts),
    # "avg_tokens_in_prompt": round(prompt_info["total_all_prompt_tokens"]/len(final_prompts)),
}

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)

save = f"{SETTINGS['model']}_{SIM_ID}"
##################################

In [None]:
### ==== THE FUNCTIONAL 1!!!! =====
SETTINGS = {
     "model": "llama3", # mixtral, command-r-plus:104b-q2_K
}
##########################################
SIM_ID = f"run2-base-video-game_rplus_cv2"
LIMIT = None # For testing purposes. Set to NONE to run all
AUTO_INFO = {
    "date": bu.get_timestamp(),
}

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)
##########################################

save = f"{SETTINGS['model']}_{SIM_ID}"
completions = []
l = len(final_prompts)
timer = bu.Benchmarker()
for i, (prompt, question) in enumerate(list(zip(final_prompts, surv.questions))):
    if LIMIT != None and i > LIMIT:
        break
    timer.mark()
    print(f"{i}/{l}...", end="\t") # Print progress
    # Send the Request    
    full_response = client.chat.completions.create(
        model=SETTINGS["model"],
        messages=prompt,
        # timeout=120,
        # temperature=SETTINGS["temperature"],
    )

    r = full_response.choices[0].message.content
    completions.append({'question': question, 'answer': r})
    print(f"{question}: {r}")

timer.mark()
# Save results
df = pd.DataFrame(completions)
# df.to_csv(f"results/{save}_simulation.csv", index=False)
df.to_csv(f"simulations/{SIM_ID}_simulation.csv", index=False)
# bu.quickJSON(final_prompts, f"results/{save}_prompts.json")
bu.quickJSON(final_prompts, f"ignorefolder/{SIM_ID}_prompts.json")
# bu.quickJSON(SETTINGS, f"results/{save}_setings.json")
bu.quickJSON({"settings": SETTINGS, "info": AUTO_INFO}, f"simulations/{SIM_ID}_info.json")

## **Batch Sim**


# Analysis

## Load

In [5]:
import pandas as pd
import json
SIMULATION_NAMEID = "airidas-personality_mixtral_cv1" #f"{SIM_ID}"

df = pd.read_csv(f'simulations/local/personality/{SIMULATION_NAMEID}_simulation.csv')
with open(f'simulations/local/personality/{SIMULATION_NAMEID}_info.json', 'r') as f:
    loaded = json.load(f)
try:
    AUTO_INFO = loaded["info"]
    SETTINGS = loaded["settings"]
    print("Settings and info loaded:")
    for k, v in AUTO_INFO.items():
        print(f"{k}: {v}")
    for k, v in SETTINGS.items():
        print(f"{k}: {v}")
except:
    print("No settings and/or info found")

try:
    if str(type(surv)) != AUTO_INFO["survey_type"]:
        print(f"WARNING: surv variable is not of the same type. {str(type(surv))} != {AUTO_INFO['survey_type']}")
except:
    pass

df.head(n=5)

Settings and info loaded:
date: 2024-04-18_205950
model: mixtral


Unnamed: 0,question,answer
0,I am the life of the party.,SOMEWHAT DISAGREE
1,I don't talk a lot.,SOMEWHAT DISAGREE
2,I feel comfortable around people.,AGREE
3,I keep in the background.,SOMEWHAT DISAGREE
4,I start conversations.,SOMEWHAT AGREE


In [None]:
bu.if_dir_not_exist_make("data/5_sim-clean/results")
res = bu.LiveCSV(f"data/5_sim-clean/elias_runs.csv")

In [8]:
# Load the simulation results
# both personality and game WIP
import pandas as pd
import os
import numpy as np
import re

dfs = []
filenames = []

# List of folder paths
folder_paths = ['simulations/local/video-game/', 'simulations/local/personality/']

# Read the first CSV file from the first folder to get the 'question' column
first_folder_path = folder_paths[0]
first_file_path = os.path.join(first_folder_path, os.listdir(first_folder_path)[0])
first_df = pd.read_csv(first_file_path)
questions = first_df['question']

for folder_path in folder_paths:
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)

            # remove all characters from a black list from the column answer
            df['answer'] = df['answer'].apply(lambda x: x.strip())
            for substr in utils.BLACKLIST_ANSWER_SUBSTRINGS:
                df['answer'] = df['answer'].apply(lambda x: re.sub(substr, "", x))
            df['answer'] = df['answer'].str.upper()

            filename_without_ext = filename[:-4]  # Remove the '.csv' extension
            filename_without_simulation = (
                filename_without_ext.replace('_simulation', '')
                                    # .replace('video-', '')
            )  # Remove '_simulation' from the filename
            filenames.append(filename_without_simulation)
            dfs.append(df['answer'])

answers_df = pd.concat(dfs, axis=1, keys=filenames)
answers_df.insert(0, 'question', questions)  # Insert the 'question' column at the beginning
answers_df[:5]

Unnamed: 0,question,base-video-game_mixtral_cv2,base-video-game_rplus_cv2,airidas-video-game_rplus_cv1,base-video-game_mixtral_cv3,base-video-game_rplus_cv1,elias-video-game_rplus_cv1,elias-video-game_mixtral_cv1,airidas-video-game_mixtral_cv1,base-personality_rplus_cv1,base-personality_rplus_cv2,airidas-personality_mixtral_cv1,elias-personality_mixtral_cv1,elias-personality_rplus_cv1,base-personality_mixtral_cv3,airidas-personality_rplus_cv1,base-personality_mixtral_cv2
0,I am the life of the party.,SOMEWHAT DISAGREE,NEUTRAL,I LIKED IT,SOMEWHAT DISAGREE,I LIKE IT,I LIKE IT 😍🔥💥❤️🎉👽‍♂️🐵😈📦🏙💻⛹‍♀️⭐️✨⚡▪️▫️↕◀️▲●•🕺,I LIKE IT,SOMEWHAT DISAGREE,NEUTRAL,SOMEWHAT AGREEE,SOMEWHAT DISAGREE,SOMEWHAT DISAGREE,DISAGREE,SOMEWHAT DISAGREE,SOMEWHAT AGREE,SOMEWHAT DISAGREE
1,I don't talk a lot.,SOMEWHAT DISAGREE,DISAGREE,I AM NEUTRAL,SOMEWHAT DISAGREE,I CAN TOLERATE IT,I EXPECT IT,I DISLIKE IT,SOMEWHAT DISAGREE,NEUTRAL,NEUTRAL,SOMEWHAT DISAGREE,SOMEWHAT DISAGREE,DISAGREE,SOMEWHAT DISAGREE,"PERSONA'S ANSWER IS: DISAGREE""""""",SOMEWHAT DISAGREE
2,I feel comfortable around people.,AGREE,NEUTRAL\n \nMOST LIKELY RESPONSE(S) BY ELIAS ...,I LIKE IT,AGREE,I LIKE IT,I LIKE IT😌👍🏼🎮⏸️,I LIKE IT,AGREE,SOMEWHAT DISAGREE,SOMEWHAT AGREE,AGREE,AGREE,SOMEWHAT AGREEE,SOMEWHAT AGREE,PERSONA: SOMEWHAT DISAGREE,AGREE
3,I keep in the background.,SOMEWHAT DISAGREE,DISAGREE,I EXPECT IT,SOMEWHAT DISAGREE,I DISLIKE IT,I DISLIKE IT,I DISLIKE IT,SOMEWHAT DISAGREE,NEUTRAL,NEUTRAL,SOMEWHAT DISAGREE,SOMEWHAT DISAGREE,NEUTRAL,SOMEWHAT DISAGREE,AGREE,SOMEWHAT DISAGREE
4,I start conversations.,SOMEWHAT AGREE,NEUTRAL,I LIKE IT,AGREE,I LIKE IT,I LIKE IT,I LIKE IT,SOMEWHAT AGREE,NEUTRAL,DISAGREE I THINK YOU USUALLY DO :),SOMEWHAT AGREE,SOMEWHAT DISAGREE,SOMEWHAT AGREEE,AGREE,PERSONA ANSWER: SOMEWHAT AGREE,SOMEWHAT AGREE


In [130]:
surv = {}
surv['POSSIBLE_ANSWERS'] = ["I LIKE IT", "I EXPECT IT", "I AM NEUTRAL", "I CAN TOLERATE IT", "I DISLIKE IT", "SOMEWHAT DISAGREE", "DISAGREE", "NEUTRAL", "SOMEWHAT AGREE", "AGREE"]

def remove_invalid_answers(value):
    if pd.isna(value):
        return ""
    elif value in surv['POSSIBLE_ANSWERS']:
        return value
    else:
        return ""

def extract_possible_answer(value):
    for phrase in surv['POSSIBLE_ANSWERS']:
        pattern = r'(?i)' + re.escape(phrase)
        match = re.search(pattern, value)
        if match:
            return match.group()
    return value  # Return the original value if no possible answer is found

In [132]:
# RETRIEVAL_PROMPT = surv['POSSIBLE_ANSWERS']
PRE_DEF_ANSWERS = ["I LIKE IT", "I EXPECT IT", "I AM NEUTRAL", "I CAN TOLERATE IT", "I DISLIKE IT", "SOMEWHAT DISAGREE", "DISAGREE", "NEUTRAL", "SOMEWHAT AGREE", "AGREE"]

embeddings = [ollama.embeddings(model=EMBED_MODEL, prompt=answer)["embedding"] for answer in PRE_DEF_ANSWERS]


FINAL_STRINGS_2_CLEAN = df['answer'] #change to answers_df['answer'] for all simulations
# utils.find_most_similar(ollama.embeddings(model=EMBED_MODEL, prompt=FINAL_STRINGS_2_CLEAN[0])["embedding"], embeddings)
mapped_results = [utils.find_most_similar(string) for string in FINAL_STRINGS_2_CLEAN]
print(mapped_results)
# Generate embeddings for each chunk
embeddings = []

progress, chunks_len = 0, len(chunks) # for progress bar
for chunk_text in chunks:
    progress += 1
    print(f"\rChunk {progress}/{chunks_len}", end="")

    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)

bu.if_dir_not_exist_make("data/3_embeddings")
bu.quickJSON(AUTO_INFO, f"data/3_embeddings/POSSIBLE_ANSWERS_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/3_embeddings/POSSIBLE_ANSWERS_embeddings.json")

Chunk 104/104

In [None]:
#     pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in phrases_to_extract) + r')\b'
#     matches = re.findall(pattern, text, flags=re.IGNORECASE) 
#     return ' '.join(matches) if matches else ''


# Update isValid
df['isValid'] = df['answer'].apply(lambda x: x in surv.POSSIBLE_ANSWERS)

# if all values in isValid is true, drop the column, else print a message
if df['isValid'].all():
    df = df.drop('isValid', axis=1)
    print("All answers were valid")
else:
    print("Some answers were not valid")

df.head(n=10)

#### Proces simulation output - KANO

In [None]:
surv = survey.KanoSurvey()
# Add airidas and elias answers
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])

df['answer'] = df['answer'].str.upper()
df['airidas'] = df['airidas'].str.upper()
df['elias'] = df['elias'].str.upper()

#### Proces simulation output - PERSONALITY

In [None]:
surv = survey.PersonalitySurvey()
# df = df.dropna()

# Add airidas and elias answers
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])
df[:5]

In [None]:
# compute one number of how the percentage of correct answers
result_data = {
    "Exact Matches": (df['answer'] == df['airidas']).sum() / len(df),
    "Correlation": df['answer'].corr(df['airidas']),
    "Exact Matches - elias": (df['answer'] == df['elias']).sum() / len(df),
    "Correlation - elias": df['answer'].corr(df['elias']),
}

for k, v in result_data.items():
    print(f"{k}: {v}")

In [None]:
str(type(surv))

In [None]:
new_res = {
    # "label": None,
    "SIMULATION_NAMEID": SIMULATION_NAMEID,#SIM_ID,
    "timestamp": bu.get_timestamp(),
    "survey_type": str(type(surv)),
    # "temperature": SETTINGS["temperature"],
    # "note": "",
    "exact_matches": result_data["Exact Matches"],
    "corr": result_data["Correlation"],
    "exact_matches_elias": result_data["Exact Matches - elias"],
    "corr_elias": result_data["Correlation - elias"],
}

tmp = bu.convert_dicts_to_table([new_res])
res.append_data(tmp[1], tmp[0])

### Cleaning

In [None]:
# df['airidas'] = df['airidas'].str.upper()
# df['elias'] = df['elias'].str.upper()
# df['answer'] = df['answer'].map(remap_dict)
# df['airidas'] = df['airidas'].map(remap_dict)
# df['elias'] = df['elias'].map(remap_dict)


########################
# df = df.drop(columns=['uppercase_text'])
# df['CLONE_eli'] = df['answer'].apply(extract_uppercase_text)
# df['CLONE_eli'] = df['CLONE_eli'].str.upper()
# .str.upper() or .lower()
# df['answer'] = df['answer'].map(remap_dict, na_action='ignore')
#df['CLONE_eli'] = df['CLONE_eli'].fillna(0).astype(int)

### Remapping

In [None]:
if isinstance(surv, survey.KanoSurvey):
    remap_dict = {"I EXPECT IT": 5, "I LIKE IT": 4, "I AM NEUTRAL": 3, "I CAN TOLERATE IT": 2, "I DISLIKE IT": 1}
    df['answer'] = df['answer'].map(remap_dict)
    df['airidas'] = df['airidas'].map(remap_dict)
    df['elias'] = df['elias'].map(remap_dict)
elif isinstance(surv, survey.PersonalitySurvey):
    remap_dict = {"AGREE": 5, "SOMEWHAT AGREE": 4, "NEUTRAL": 3, "SOMEWHAT DISAGREE": 2, "DISAGREE": 1}
    df['answer'] = df['answer'].map(remap_dict)

### Remaps - UNIVERSAL

In [None]:
# compute one number of how the percentage of correct answers
print(f"Exact Matches: {(df['CLONE_eli'] == df['IRL_eli']).sum() / len(df)}")
print(f"Correlation: {df['CLONE_eli'].corr(df['IRL_eli'])}")
df['elias_correct'] = df['CLONE_eli'] == df['IRL_eli']

In [81]:
import importlib
import pandas as pd
from openai import OpenAI
import os
import json
import traceback
import sys

import brikasutils as bu
import shared_utils as utils
import survey
importlib.reload(bu)
importlib.reload(utils)
importlib.reload(survey)

queue = bu.FileRunQueue(queue_folder_path="batch/queue", completed_folder_path="batch/done")
report_live_csv = bu.LiveCSV("batch/run_reports.csv")
timer = bu.Benchmarker()


for filepath in queue:
    timer.mark_start(filepath)

    try: 
        ########## Handle batch stuff ########
        filename = os.path.splitext(os.path.basename(filepath))[0]
        with open(filepath, 'r') as f:
            rundata = json.load(f)

        # Load prompt file
        with open(rundata["instructions"]["prompt_file"], 'r') as f:
            final_prompts = json.load(f)

        # Make the surv
        if rundata["instructions"]["survey_type"] == "KanoSurvey":
            surv = survey.KanoSurvey()
        elif rundata["instructions"]["survey_type"] == "PersonalitySurvey":
            surv = survey.PersonalitySurvey()
        else:
            raise Exception("Invalid survey type")

        timestamp = bu.get_timestamp()
        ######### Run Simulation ########
        SIMULATION_NAMEID = filename
        LIMIT = rundata["instructions"]["LIMIT"] if "LIMIT" in rundata["instructions"] else None
        AUTO_INFO = {
            "date": timestamp,
            **rundata["info"], # unpacked from rundata
            "limit": LIMIT,
            "prompt_count": min(len(final_prompts), LIMIT) if LIMIT != None else len(final_prompts),
            "avg_tokens_in_prompt": round(utils.describe_prompts(final_prompts)["total_all_prompt_tokens"]/len(final_prompts)),
        }
        SETTINGS = {
            **rundata["settings"], # unpacked from rundata
        }

        # client depends on if it's local or not
        if rundata["instructions"]["isLocal"]:
            client = OpenAI(
                base_url = 'http://localhost:11434/v1',
                api_key='ollama', # required, but unused
            )
        else:
            client = OpenAI(
                api_key=os.environ.get("OPENAI_API_KEY"),
            )

        completions = []
        l = len(final_prompts)

        for i, (prompt, question) in enumerate(list(zip(final_prompts, surv.questions))):
            if LIMIT != None and i > LIMIT:
                break

            print(f"{i}/{l}...", end="\t") # Print progress
            # Send the Request
            full_response = client.chat.completions.create(
                messages=prompt,
                **SETTINGS,
            )
            r = full_response.choices[0].message.content

            completions.append({'question': question, 'answer': r})

            print(f"{question}: {r}")
            
        ############ Save Important results
        df = pd.DataFrame(completions)
        df.to_csv(f"batch/output/{SIMULATION_NAMEID}_simulation.csv", index=False)
        bu.if_dir_not_exist_make("batch/output/info")
        bu.quickJSON({"settings": SETTINGS, "info": AUTO_INFO}, f"batch/output/info/{SIMULATION_NAMEID}_info.json")

        status = "OK"
    
    except Exception:
        print(f"##### Error while running {filename}.")
        error_string = traceback.format_exc()
        print(error_string)
        status = "Failed"

    ########### Time the run
    try:
        time_taken = timer.mark_end(filepath)
    except:
        print("Error while timing run: ")
        print(traceback.format_exc())
        time_taken = None

    ########### Report the run
    try:
        new_report = {
            "filename": filename,
            "timestamp": timestamp,
            "time_taken": time_taken,
            "status": status,
            **rundata["instructions"],
            "error": error_string if status == "Failed" else "",
        }

        tmp = bu.convert_dicts_to_table([new_report])
        report_live_csv.append_data(tmp[1], tmp[0])
    except Exception as e:
        print(f"Error while reporting: ")
        traceback.print_exc()

    print(f"Processed {filename}. Stauts: {status}")

timer.print_total_execution_time()

LiveCSV: File /Users/et/Desktop/Thesis/batch/run_reports.csv not existing. Creating new.
brikasutils.quickCSV: Saved 0 as batch/run_reports.csv
[1/12] Running base_game_llama_1.json from queue. 
Using default Kano Survey CSV file: surveys/survey_kano-model.csv
0/40...	What would you say if there were options to design your own avatar?: I LIKE IT
1/40...	What would you say if there were NO options to design your own avatar?: I DISLIKE IT
2/40...	What would you say if the game had the option to save the game at any time?: I LIKE IT
3/40...	What would you say if the game did NOT have the option to save the game at any time?: I DISLIKE IT
4/40...	What would you say if the game has good graphics?: I LIKE IT
5/40...	What would you say if the game had NO good graphics, or rather poor graphics?: I DISLIKE IT
6/40...	What would you say if the game had an exciting storyline?: I LIKE IT
7/40...	What would you say if the game did NOT have an exciting storyline?: I DISLIKE IT
8/40...	What would you

## Modelfile

Force short JSON (markdown) answer

Add this to the end of your prompt:
> ```json

Add this to the "stop" sequence:
>```

----

llama3:70b
TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|>

{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>

{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>

{{ .Response }}<|eot_id|>"""
PARAMETER stop "<|start_header_id|>"
PARAMETER stop "<|end_header_id|>"
PARAMETER stop "<|eot_id|>"


Mixtral
TEMPLATE """ [INST] {{ .System }} {{ .Prompt }} [/INST]"""
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"

----

nomic_embed
TEMPLATE """{{ .Prompt }}"""
PARAMETER num_ctx 8192