# Initial setup: **INPUT**

## Lib import

In [236]:
import importlib
import brikasutils as bu
importlib.reload(bu)
import shared_utils as utils
from shared_utils import systemMsg, userMsg, assistantMsg
importlib.reload(utils)
import survey
importlib.reload(survey)
import persona
importlib.reload(persona)

import ollama
import numpy as np
from numpy.linalg import norm
import pandas as pd
import os
import re
import time
import json
from typing import List
from openai import OpenAI

## Persona

In [237]:
et = persona.PersonaEncoder()

# ==== FB messages ====
et.parse_fb_messages(["data/1_raw/1_airidas.json"], "airidas")
et.parse_fb_messages(["data/1_raw/2_christian.json"], "christian")
et.parse_fb_messages(["data/1_raw/1_nikolay.json"], "nikolay")
et.parse_fb_messages(["data/1_raw/2_mathis.json"], "mathis")
et.parse_fb_messages(["data/1_raw/2_jacob.json"], "jacob")
et.parse_fb_messages(["data/1_raw/2_chris.json"], "chris")
et.parse_fb_messages(["data/1_raw/3_aziz.json"], "aziz")
et.parse_fb_messages(["data/1_raw/3_daniela.json"], "daniela")
et.parse_fb_messages(["data/1_raw/3_mihi.json"], "mihi")
et.parse_fb_messages(["data/1_raw/3_viktoria.json"], "viktoria")
et.parse_fb_messages(["data/1_raw/4_diba.json"], "diba")
et.parse_fb_messages(["data/1_raw/6_filip.json"], "filip")
et.parse_wa_messages(["data/1_raw/messages_1000.json"], "rebecca")
# for name, texts in texts_with_others_dict.items():
#     et.parse_fb_messages(texts, name)

# Regex cleaning
et.filter_chats_empty()
et.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Compress names
for nameid, chat in et.chats.items():
    for msg in chat:  
        msg.sender = "Persona" if msg.sender == "Elias Salvador Smidt Torjani"  else "Friend"

# Start all chats from 2/3rds
# for name, chat in et.chats.items():
#     et.chats[name] = chat[int(len(chat)/3 * 2):]
# Select the final modules
# et.select_chat_limited_by_tokens("airidas", 10000)
et.select_chat_full("rebecca")
et.select_chat_full("airidas")
et.select_chat_full("christian")
et.select_chat_full("nikolay")
et.select_chat_full("mathis") 
et.select_chat_full("daniela")
et.select_chat_full("diba")
et.select_chat_full("aziz")
et.select_chat_full("jacob")  
et.select_chat_full("chris")
et.select_chat_full("filip")
et.select_chat_full("mihi")
et.select_chat_full("viktoria")

# save
BIG_MODULE=et.output()
bu.quickTXT(BIG_MODULE, filename=f"data/2_modules/big_{bu.get_timestamp()}")

# stats
token_counts = et.count_all_selected_chat_tokens() # token_counts used later for statistics
print(f"Combined tokens: {sum(token_counts.values())}")
# utils.count_tokens(BIG_MODULE) 
# or list(et.selectedChats.keys()) --> et.count_chat_tokens("{friend}")
# et.selectedChats["{friend}"][:5]

Read 1916 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-13 to 2024-03-06
Messages saved to self.chats['airidas']
Read 618 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-10 to 2024-03-03
Messages saved to self.chats['christian']
Read 297 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2018-07-25 to 2024-01-01
Messages saved to self.chats['nikolay']
Read 144 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-12-30
Messages saved to self.chats['mathis']
Read 104 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-08-25 to 2024-03-05
Messages saved to self.chats['jacob']
Read 159 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-10-12 to 2023-04-30
Messages saved to self.chats['chris']
Read 161 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-06-06
Messages saved to self.chat

## Embedding **Chunking**

In [None]:
# PARAMETERS
EMBED_MODEL = "nomic-embed-text"        # nomic-embed-text = long ctx / mxbai-embed-large = big
CHUNK_SIZE = 40                         # N of msgs per chunk: 10-90?
OVERLAP_SIZE = 10                       # N of overlapping msgs between consecutive chunks: 5-50?

In [None]:
# Initialize lists for storing chunks – and embeddings later
chunks = []
stat_total_msgs_in_chunks = 0 # for statistics

# different chunk size
# Iterate over chats and messages to create chunks
for chat in et.selectedChats.values():
    messages = list(chat)  # Convert chat iterator to list for easier slicing
    num_messages = len(messages)

    # Create overlapping chunks of messages
    for i in range(0, num_messages - CHUNK_SIZE + 1, CHUNK_SIZE - OVERLAP_SIZE):
        chunk = messages[i:i + CHUNK_SIZE]  # Extract chunk of messages
        chunk_text = "\n".join(str(msg) for msg in chunk)  # Concatenate messages into a single string
        chunks.append(chunk_text)  # Append chunk to list of chunks

        stat_total_msgs_in_chunks += len(chunk) # For statistics

##### Display Info
total_messages = sum(len(chat) for chat in et.selectedChats.values())
chunks_count = len(chunks)
avg_chunk_char_len = np.mean([len(chunk) for chunk in chunks])

print(
    f"Chunk count: {chunks_count}",
    # f"Average chunk character length: {round( avg_chunk_char_len)}",
    f"Rough estimate of tokens per chunk: {round(avg_chunk_char_len / 4)} (4 characters per token)",
    f"Messagees in input count: {total_messages}",
    f"Messages in chunks count: {stat_total_msgs_in_chunks}",
    f"Chunk \ Input ratio: {round(stat_total_msgs_in_chunks / total_messages,2)} (OVERLAP_SIZE={OVERLAP_SIZE})",
    f"Chunk Python type: {type(chunks[0])}",
    sep="\n"
) 

## **Generaterating** embeddings

In [None]:
########### Serialization ###########
EMBEDDING_ID = f"{CHUNK_SIZE}-{OVERLAP_SIZE}"
AUTO_INFO = {
    "model": EMBED_MODEL,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "chunks_count": chunks_count,
    "total_messages": total_messages,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
}

In [None]:
# Generate embeddings for each chunk
embeddings = []
progress, chunks_len = 0, len(chunks) # for progress bar
for chunk_text in chunks:
    progress += 1
    print(f"\rChunk {progress}/{chunks_len}", end="")
    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)
####################################################
# token counts in all similar chunks
# tokens_in_chunks = 0
# for chunk in chunks_most_similar:
#     tokens_in_chunks += utils.count_tokens(chunk)
# print(f"Tokens in chunks: {tokens_in_chunks}")
####################################################
bu.if_dir_not_exist_make("data/3_embeddings")
bu.quickJSON(AUTO_INFO, f"data/3_embeddings/{EMBEDDING_ID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/3_embeddings/{EMBEDDING_ID}_embeddings.json")

# **CTX**

## Survey

In [None]:
# surv = survey.KanoSurvey()
surv = survey.PersonalitySurvey()
#The Five Factors of personality are:
# Openness - How open a person is to new ideas and experiences
# Conscientiousness - How goal-directed, persistent, and organized a person is
# Extraversion - How much a person is energized by the outside world
# Agreeableness - How much a person puts others' interests and needs ahead of their own
# Neuroticism - How sensitive a person is to stress and negative emotional triggers

# surv = survey.buildFairnessPrompts()
# surv = survey.DictatorGameSurvey()
surv.questions[:2]#.head()

In [None]:
if isinstance(surv, survey.KanoSurvey):
    RETRIEVAL_PROMPT = "video game features"
    DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
    SURVEY_TYPE = "KanoSurvey",
    SURVEY = "video game preferences"
    METHOD = "Kano survey"
    WHICH_SURVEY = "kano"
    PROMPT_LENGTH = 40
elif isinstance(surv, survey.PersonalitySurvey):
    RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism"
    DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
    SURVEY_TYPE = "PersonalitySurvey",
    SURVEY = "personality traits"
    METHOD = "OCEAN test"
    WHICH_SURVEY = "pers"
    PROMPT_LENGTH = 50

CHUNKS_COUNT_IN_CTX = 10 # Number of nearby chunks to put in context window

########### Serialization ###########
# EMBEDDING_ID = f"{CHUNK_SIZE}-{OVERLAP_SIZE}"
VERSION_ID = f"8k_{WHICH_SURVEY}" # pers/kano_{ctx tokens}k
CHECKPOINT = f"{EMBEDDING_ID}-{CHUNKS_COUNT_IN_CTX}-{VERSION_ID}"
AUTO_INFO = {
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "EMBEDDING_ID": EMBEDDING_ID,
    "VERSION_ID": VERSION_ID,
    "model": EMBED_MODEL,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "chunks_count": chunks_count,
    "total_messages": total_messages,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
    "SURVEY and method": f"{SURVEY} and {METHOD}",
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "DYNAMIC_RETRIEVAL_PROMPTS": DYNAMIC_RETRIEVAL_PROMPTS,
}
########### Serialization ###########

## Retrieval

### Hybrid

In [239]:
HYBRID_CTX = int(CHUNKS_COUNT_IN_CTX/2)
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings  = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

# Static part
chunks_most_similar = []
for embedding in chunks_most_similar_embeddings:
    chunks_most_similar.append(chunks[embedding[1]])

# Dynamic part
DYNAMIC_RETRIEVAL_PROMPTS
dynamic_chunks_most_similar: List[List[str]] = [] 
progress = 0
lenn = len(DYNAMIC_RETRIEVAL_PROMPTS)
for prompt in DYNAMIC_RETRIEVAL_PROMPTS:
    progress += 1
    print(f"\rPrompt {progress}/{lenn}", end="")
    chunks_most_similar = []
    for embedding in chunks_most_similar_embeddings:
        chunks_most_similar.append(chunks[embedding[1]])
    dynamic_chunks_most_similar.append(chunks_most_similar)
print(end="\n")




### BOTH VANITY PRINT CHECKS SHOULD BE LIKE THIS AFTER THEIR PARTS ###
# token counts in all similar chunks
tokens_in_chunks = 0
for chunks_most_similar in dynamic_chunks_most_similar:
    for chunk in chunks_most_similar:
        tokens_in_chunks += utils.count_tokens(chunk)
print(f"Tokens in chunks: {tokens_in_chunks}")
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")


print(f"Tokens in average chunk group: {tokens_in_chunks/len(dynamic_chunks_most_similar)}")
bu.quickJSON(dynamic_chunks_most_similar, filename=f"data/4_chunks/{CHECKPOINT}-hybrid_chunks.json")


Prompt 50/50
Tokens in chunks: 140550
Chunks:138, embeds:138
Tokens in average chunk group: 2811.0


In [None]:
import json

# Static part
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

# Static part chunks
static_chunks = []
for embedding in chunks_most_similar_embeddings:
    static_chunks.append(chunks[embedding[1]])

# Dynamic part
DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
hybrid_chunks = []

# Add static chunks with markdown separator
static_chunk_string = "### Related to entire survey\n\n" + "\n\n".join(static_chunks)

for prompt in DYNAMIC_RETRIEVAL_PROMPTS:
    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

    dynamic_chunks = []
    for embedding in chunks_most_similar_embeddings:
        dynamic_chunks.append(chunks[embedding[1]])

    # Combine static and dynamic chunks with markdown separators
    chunk_string = [static_chunk_string,
                    "### Related to the specific question you will be asked\n\n" + "\n\n".join(dynamic_chunks)]

    hybrid_chunks.append(chunk_string)

with open(f"data/4_chunks/{CHECKPOINT}-hybrid_chunks.json", "w", encoding="utf-8") as f:
    json.dump(hybrid_chunks, f)

In [None]:
import json

# Static part
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

# Static part chunks
static_chunks = []
for embedding in chunks_most_similar_embeddings:
    static_chunks.append(chunks[embedding[1]])

# Dynamic part
DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
hybrid_chunks = []

# Add static chunks with markdown separator
static_chunk_string = "### Related to entire survey\n\n" + "\n\n".join(static_chunks)

for prompt in DYNAMIC_RETRIEVAL_PROMPTS:
    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

    dynamic_chunks = []
    for embedding in chunks_most_similar_embeddings:
        dynamic_chunks.append(chunks[embedding[1]])

    # Combine static and dynamic chunks with markdown separators and newline delimiter
    chunk_string = static_chunk_string + "\n\n\n" + \
                   "### Related to the specific question you will be asked\n\n" + "\n\n".join(dynamic_chunks)

    hybrid_chunks.append(chunk_string)

with open(f"data/4_chunks/{CHECKPOINT}-hybrid_chunks.json", "w", encoding="utf-8") as f:
    json.dump(hybrid_chunks, f)

In [None]:
import json

# Static part
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

# Static part chunks
static_chunks = []
for embedding in chunks_most_similar_embeddings:
    static_chunks.append(chunks[embedding[1]])

# Dynamic part
DYNAMIC_RETRIEVAL_PROMPTS = list(surv.questions)
hybrid_data = {"chunks": [], "embeddings": []}

# Add static chunks and embeddings
hybrid_data["chunks"].extend(static_chunks)
hybrid_data["embeddings"].extend(chunks_most_similar_embeddings)

for prompt in DYNAMIC_RETRIEVAL_PROMPTS:
    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:HYBRID_CTX]

    dynamic_chunks = []
    for embedding in chunks_most_similar_embeddings:
        dynamic_chunks.append(chunks[embedding[1]])

    # Append dynamic chunks and embeddings
    hybrid_data["chunks"].extend(dynamic_chunks)
    hybrid_data["embeddings"].extend(chunks_most_similar_embeddings)

# Save to a JSON file
with open(f"data/4_chunks/{CHECKPOINT}-hybrid_data.json", "w", encoding="utf-8") as f:
    json.dump(hybrid_data, f)

### Static

In [None]:
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
chunks_most_similar_embeddings  = utils.find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
chunks_most_similar = []
for embedding in chunks_most_similar_embeddings:
    chunks_most_similar.append(chunks[embedding[1]])

# token counts in all similar chunks
tokens_in_chunks = 0
for chunk in chunks_most_similar:
    tokens_in_chunks += utils.count_tokens(chunk)
print(f"Tokens in chunks: {tokens_in_chunks}")
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

# Display results
bu.quickTXT("\n\n".join(chunks_most_similar), filename=f"data/4_chunks/{CHECKPOINT}-static_chunks.txt")
bu.if_dir_not_exist_make("data/4_chunks")
bu.quickJSON(AUTO_INFO, f"data/4_chunks/{CHECKPOINT}-static_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/4_chunks/{CHECKPOINT}-static_embeddings.json")

### Dynamic

In [None]:
dynamic_retrieval_prompts = list(surv.questions)
dynamic_chunks_most_similar: List[List[str]] = [] 

progress = 0
lenn = len(dynamic_retrieval_prompts)
for prompt in dynamic_retrieval_prompts:
    progress += 1
    print(f"\rPrompt {progress}/{lenn}", end="")

    prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=prompt)["embedding"]
    chunks_most_similar_embeddings = utils.find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
    chunks_most_similar = []
    for embedding in chunks_most_similar_embeddings:
        chunks_most_similar.append(chunks[embedding[1]])

    dynamic_chunks_most_similar.append(chunks_most_similar)
print(end="\n")

# Display results
bu.if_dir_not_exist_make("data/4_chunks")
bu.quickJSON(AUTO_INFO, f"data/4_chunks/{CHECKPOINT}-dynamic_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"data/4_chunks/{CHECKPOINT}-dynamic_embeddings.json")
############################################ VANITY BELOW ########################################
# VANITY PRINT
tokens_in_chunks = 0
for chunks_most_similar in dynamic_chunks_most_similar:
    for chunk in chunks_most_similar:
        tokens_in_chunks += utils.count_tokens(chunk)

del chunks_most_similar_embeddings # free memory
print(f"Tokens in average chunk group: {tokens_in_chunks/len(dynamic_chunks_most_similar)}")
bu.quickJSON(dynamic_chunks_most_similar, filename=f"data/4_chunks/{CHECKPOINT}-dynamic_chunks.json")
###########################################
# Vanity preview
preview_text = ""
PREVIEW_LIMIT = 5
for i, chunks_most_similar in enumerate(dynamic_chunks_most_similar):
    preview_text += f"==============Prompt: {dynamic_retrieval_prompts[i]}==============\n"
    for j, chunk in enumerate(chunks_most_similar):
        if j >= PREVIEW_LIMIT:
            break
        preview_text += f"=======CHUNK {j}=======\n{chunk}\n\n"
    preview_text += "\n\n"
bu.quickTXT(preview_text, filename=f"data/4_chunks/{CHECKPOINT}-dynamic_chunks")

## Prompt Builder

Go through w/ both static and dynamic

In [None]:
RETRIAVAL_METHOD = "dynamic" #static/dynamic/hybrid
PROMPT_METHOD = "IMPERSONATE" #ARE/IMPERSONATE

SUBJECT = "Elias"

# Load Embeddings From File (optional)
import json
# with open(f"data/4_chunks/{EMBEDDING_ID}-{CHUNKS_COUNT_IN_CTX}_{VERSION_ID}-dynamic_embeddings.json", "r") as f:
with open(f"data/4_chunks/{CHECKPOINT}-{RETRIAVAL_METHOD}_embeddings.json", "r") as f:
    data = json.load(f)
    chunks = data["chunks"]
    embeddings = data["embeddings"]
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

# "PERSONA_TEXT": "Favorite video games are Minecraft, Fortnite, and Call of Duty.",
# "MED_MODULE": " "
# SURVEY_PROMPT = "Determine how much {subject} aggree with the statement. Guestimate how {subject} would answer to the question"
TINY_MODULE = "You are Elias, a 24 year old business and IT student from Copenhagen, where you now live in a dormatory."

####################### You are {SUBJECT} vs you will impersonate {SUBJECT} #####################
PREP_CHECKPOINT = f"{CHECKPOINT}-{RETRIAVAL_METHOD}_{SUBJECT}-{PROMPT_METHOD}"
PREP_CHECKPOINT

In [None]:
########################################### Method A ############################################
if PROMPT_METHOD == "IMPERSONATE":
    pre_prompt_template = """
SYS_MSG = {
    "role": "system", 
    "content": f"You are an expert actor, specializing in impersonation of non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit {SURVEY} by shadowing chats between the subject and friends. You will be asked to fully immerse yourself in the role, and answer questions from the point of view of the persona. \\n\\n**The persona, which you will be tasked to mimick is named '{SUBJECT}'.** \\n#Context \\n##Chat conversations between the subject and their friends:\\n**From most to least related**\\n"
}
ASSIST_MSG = {
    "role": "assistant",
    "content": f"Understood. I will answer from the point of view of the persona, {SUBJECT}, based on what I could the deduct from the text provided."
}
USER_MSG = {
    "role": "user",
    "content": f"Persona is questioned about their {SURVEY} in an {METHOD}. The persona must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Persona's answer must only contain the chosen option, without any elaboration, nor introduction."
}
"""
########################################### Method B ###########################################
elif PROMPT_METHOD == "ARE":
    pre_prompt_template = """
SYS_MSG = {
    "role": "system", 
    "content": f"**{TINY_MODULE}**. You have shared your thoughts, feelings, and experiences through text messages with friedns. Answer the following questions honestly and naturally, as you would in everyday conversations. \\n\\n#Context \\n##Conversations between persona and friends:"
}
ASSIST_MSG = {
    "role": "assistant",
    "content": f"Understood. I am {SUBJECT}, and I will answer the survey to the best of my ability."
}
USER_MSG = {   
    "role": "user",
    "content": f"The survey is about your {SURVEY}. You must choose an appropriate answer to the question below with one of these five given options: {', '.join(surv.POSSIBLE_ANSWERS)}. Your answer must only contain the chosen option, without any elaboration, nor introduction.\n**From most to least related**\\n"
}
"""

exec(pre_prompt_template)

print(f"{SYS_MSG['content']}")

In [None]:
VARIABLES = {
    "Which survey": surv,
    # "Prompt method": f"You are {SUBJECT} vs you will impersonate {SUBJECT}",
    # "Retrieval method": "Dynamic/static/hybrid",
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "EMBEDDING_ID": EMBEDDING_ID,
    "VERSION_ID": VERSION_ID,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "EMBED_MODEL": EMBED_MODEL,
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "EMBEDDING_ID": EMBEDDING_ID,
    "DYNAMIC_CHUNKS_COUNT": len(dynamic_chunks_most_similar),
}

### Dynamic

In [None]:
final_prompts = []

prompt_template = """
for question, chunks_most_similar in zip(surv.questions, dynamic_chunks_most_similar):
    p = [
        systemMsg("\\n".join([
            f"{SYS_MSG['content']}",
            "\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),      
        assistantMsg(ASSIST_MSG['content']),
        userMsg("\\n".join([
            f"{USER_MSG['content']}\\n\\n**Your question is:**\\n\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
"""    
exec(prompt_template)
prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, f"data/5_prep/{PREP_CHECKPOINT}_prompts.json")
print(f"{len(final_prompts)}")#,{final_prompts[:1]}")

### Static

In [None]:
final_prompts = []
prompt_template = """
for question in surv.questions:
    p = [
        systemMsg("\\n".join([
            f"{SYS_MSG['content']}",
            "\\nNEW CONVERSATION:\\n".join(chunks_most_similar)
        ])),  
        assistantMsg(ASSIST_MSG['content']),
        userMsg("\\n".join([
            f"{USER_MSG['content']}\\n\\n**Your question is:**\\n\\n",
            question,
            "\\nThe persona chooses:"
        ]))]
    final_prompts.append(p)
"""
exec(prompt_template)
prompt_info = utils.describe_prompts_and_print(final_prompts)
bu.quickJSON(final_prompts, f"data/5_prep/{PREP_CHECKPOINT}_prompts.json")
print(f"{len(final_prompts)}")#,{final_prompts[:1]}")

### Base (no persona)

In [None]:
final_prompts = []
prompt_template = """
for question in surv.questions:
    p = [
        systemMsg(
            "You are participating in a survey. You will be presented with a series of questions about your {SURVEY}.",
            f"You must choose answer to the question below with one of the five options: {', '.join(surv.POSSIBLE_ANSWERS)}. The answer must only contain the chosen option. "
        ),
        assistantMsg('Understood. I will answer the question below with one of the given options.'),
        userMsg(
            question,
            "Your choice: "
        )]
    final_prompts.append(p)
"""
exec(prompt_template)
prompt_info = utils.describe_prompts_and_print(final_prompts) # Vanity print
bu.quickJSON(final_prompts, f"data/5_prep/{WHICH_SURVEY}_base_prompts.json")

In [None]:
MODEL = "llama3"

instructions = {
    "prompt_file": f"batch/prompts/{WHICH_SURVEY}_base_prompt.json",
    "survey_type": f"{SURVEY_TYPE}",
    "isLocal": True,
    "LIMIT": None
}
settings = {
    "model": MODEL,
    "timeout": 300
}
AUTO_INFO = {
    "survey": WHICH_SURVEY,
    "prompt_template": prompt_template,
    **utils.describe_prompts([])
    }
bu.quickJSON({"instructions": instructions, "settings": settings, "info": AUTO_INFO}, f"data/5_prep/{WHICH_SURVEY}_base_batch-schema.json")