# Setup

In [2]:
import importlib
import brikasutils as bu
importlib.reload(bu)
import fb_msg_reader as fb
importlib.reload(fb)
import shared_utils as utils
importlib.reload(utils)
import survey
importlib.reload(survey)
import persona
importlib.reload(persona)

import ollama
import numpy as np
from numpy.linalg import norm
import pandas as pd
import os
import re
import time
import json
from typing import List
from openai import OpenAI

## Load data

In [4]:
et = persona.PersonaEncoder()

# ==== FB messages ====
et.parse_fb_messages(["data-raw/1_airidas.json"], "airidas")
et.parse_fb_messages(["data-raw/2_christian.json"], "christian")
et.parse_fb_messages(["data-raw/1_nikolay.json"], "nikolay")
et.parse_fb_messages(["data-raw/2_mathis.json"], "mathis")
et.parse_fb_messages(["data-raw/2_jacob.json"], "jacob")
et.parse_fb_messages(["data-raw/2_chris.json"], "chris")
et.parse_fb_messages(["data-raw/3_aziz.json"], "aziz")
et.parse_fb_messages(["data-raw/3_daniela.json"], "daniela")
et.parse_fb_messages(["data-raw/3_mihi.json"], "mihi")
et.parse_fb_messages(["data-raw/3_viktoria.json"], "viktoria")
et.parse_fb_messages(["data-raw/4_diba.json"], "diba")
et.parse_fb_messages(["data-raw/6_filip.json"], "filip")
#et.parse_wa_messages(texts_with_rebecca, "rebecca")
# texts_with_others_dict = {
#     "rebecca": ["data-raw/messages_1000.json"],
# }
# for name, texts in texts_with_others_dict.items():
#     et.parse_fb_messages(texts, name)

# Regex cleaning
et.filter_chats_empty()
et.filter_chats_regex(utils.BLACKLIST_CHAT_REGEX_FILTERS)

# Compress names
for nameid, chat in et.chats.items():
    for msg in chat:  
        msg.sender = "Persona" if msg.sender == "Elias Salvador Smidt Torjani"  else "Friend"

# Start all chats from 2/3rds
# for name, chat in et.chats.items():
#     et.chats[name] = chat[int(len(chat)/3 * 2):]
# Select the final modules
et.select_chat_limited_by_tokens("airidas", 10000)
et.select_chat_limited_by_tokens("christian", 10000)
et.select_chat_full("nikolay")
et.select_chat_full("mathis") 
et.select_chat_full("daniela")
et.select_chat_full("diba")
et.select_chat_full("aziz")
et.select_chat_full("jacob")  
et.select_chat_full("chris")
et.select_chat_full("filip")
et.select_chat_full("mihi")
et.select_chat_full("viktoria")
# for name in texts_with_others_dict.keys():
#     ab.select_chat_full(name)

# save
big_module=et.output()
bu.quickTXT(big_module, filename=f"data/big_module_{bu.get_timestamp()}")

# stats
token_counts = et.count_all_selected_chat_tokens() # token_counts used later for statistics
print(f"Combined tokens: {sum(token_counts.values())}")
# utils.count_tokens(big_module) 
# or list(et.selectedChats.keys()) --> et.count_chat_tokens("{friend}")
# et.selectedChats["{friend}"][:5]

Read 1916 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-13 to 2024-03-06
Messages saved to self.chats['airidas']
Read 618 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-09-10 to 2024-03-03
Messages saved to self.chats['christian']
Read 297 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2018-07-25 to 2024-01-01
Messages saved to self.chats['nikolay']
Read 144 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-12-30
Messages saved to self.chats['mathis']
Read 104 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-08-25 to 2024-03-05
Messages saved to self.chats['jacob']
Read 159 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-10-12 to 2023-04-30
Messages saved to self.chats['chris']
Read 161 messages from 1 files. Failed to read 0 messages.
Messages ranged from 2021-03-28 to 2021-06-06
Messages saved to self.chat

# Embedding

In [7]:
surv = survey.PersonalitySurvey()
# surv = survey.KanoSurvey()
# surv = survey.buildFairnessPrompts()
# surv = survey.DictatorGameSurvey()

surv.questions[:2]#.head()

# Change below accoring to survey above
RETRIEVAL_PROMPT = "openess conciousness extrovert aggreableness neuroticism" #"personality"
# q_retrival_prompt =
SURVEY_PROMPT = "Determine how much {subject} aggree with the statement. Guestimate how {subject} would answer to the question"

Using default Personality Survey CSV file: surveys/survey_personality-test.csv


In [9]:
# PARAMETERS
EMBED_MODEL = "nomic-embed-text"        # nomic-embed-text = long ctx / mxbai-embed-large = big
CHUNK_SIZE = 30                         # Number of messages per chunk
OVERLAP_SIZE = 10                       # Number of overlapping messages between consecutive chunks
CHUNKS_COUNT_IN_CTX = 30 # Number of nearby chunks to put in context window
# COMMENT 04-16, perhaps we could try 5x retrievals with isolated semantics

In [11]:
# Initialize lists for storing chunks – and embeddings later
# different chunk size
chunks = []
stat_total_msgs_in_chunks = 0 # for statistics

# different chunk size
# Iterate over chats and messages to create chunks
for chat in et.selectedChats.values():
    messages = list(chat)  # Convert chat iterator to list for easier slicing
    num_messages = len(messages)

    # Create overlapping chunks of messages
    for i in range(0, num_messages - CHUNK_SIZE + 1, CHUNK_SIZE - OVERLAP_SIZE):
        chunk = messages[i:i + CHUNK_SIZE]  # Extract chunk of messages
        chunk_text = "\n".join(str(msg) for msg in chunk)  # Concatenate messages into a single string
        chunks.append(chunk_text)  # Append chunk to list of chunks

        stat_total_msgs_in_chunks += len(chunk) # For statistics

##### Display Info
total_messages = sum(len(chat) for chat in et.selectedChats.values())
chunks_count = len(chunks)
avg_chunk_char_len = np.mean([len(chunk) for chunk in chunks])

print(
    f"Chunk count: {chunks_count}",
    f"Average chunk character length: {round( avg_chunk_char_len)}",
    f"Rough estimate of tokens per chunk: {round(avg_chunk_char_len / 4)} (4 characters per token)",
    f"Messagees in input count: {total_messages}",
    f"Messages in chunks count: {stat_total_msgs_in_chunks}",
    f"Chunk \ Input ratio: {round(stat_total_msgs_in_chunks / total_messages,2)} (OVERLAP_SIZE={OVERLAP_SIZE})",
    f"Chunk Python type: {type(chunks[0])}",
    sep="\n"
) 

Chunk count: 130
Average chunk character length: 1244
Rough estimate of tokens per chunk: 311 (4 characters per token)
Messagees in input count: 2828
Messages in chunks count: 3900
Chunk \ Input ratio: 1.38 (OVERLAP_SIZE=10)
Chunk Python type: <class 'str'>


In [12]:
########### Serialization ########
EMBEDDING_NAMEID = "test03"
AUTO_INFO = {
    "model": EMBED_MODEL,
    "CHUNK_SIZE": CHUNK_SIZE,
    "OVERLAP_SIZE": OVERLAP_SIZE,
    "chunks_count": chunks_count,
    "total_messages": total_messages,
    "stat_total_msgs_in_chunks": stat_total_msgs_in_chunks,
    "modules_chat": token_counts,
}
##################################

In [13]:
# Generate embeddings for each chunk
embeddings = []

progress, chunks_len = 0, len(chunks) # for progress bar
for chunk_text in chunks:
    progress += 1
    print(f"Chunk {progress}/{chunks_len}")

    embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
    embeddings.append(embedding)

####################################################
# Generate embeddings for each chunk
# for chunk_text in chunks:
#     embedding = ollama.embeddings(model=EMBED_MODEL, prompt=chunk_text)["embedding"]
#     embeddings.append(embedding)


# token counts in all similar chunks
tokens_in_chunks = 0
for chunk in chunks_most_similar:
    tokens_in_chunks += utils.count_tokens(chunk)
print(f"Tokens in chunks: {tokens_in_chunks}")

bu.quickTXT("\n\n".join(chunks_most_similar), filename="data-prep/chunks.txt")

# Perform similarity search and print results
prompt_embedding = ollama.embeddings(model=EMBED_MODEL, prompt=RETRIEVAL_PROMPT)["embedding"]
most_similar_chunks = embed.find_most_similar(prompt_embedding, embeddings)[:CHUNKS_COUNT_IN_CTX]
chunks_most_similar = []
for embedding in chunks_most_similar_embeddings:
    chunks_most_similar.append(chunks[embedding[1]])

# Display results
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}, nearby_chunks:{len(most_similar_chunks)}")
for item in most_similar_chunks:
    print(chunks[item[1]])
####################################################

# Display and save results (if needed later)
print(f"Chunks:{len(chunks)}, embeds:{len(embeddings)}")

bu.if_dir_not_exist_make("embeddings")
bu.quickJSON(AUTO_INFO, f"embeddings/{EMBEDDING_NAMEID}_info.json")
bu.quickJSON({"chunks": chunks, "embeddings": embeddings}, f"embeddings/{EMBEDDING_NAMEID}_embeddings.json")

Chunk 1/130
Chunk 2/130
Chunk 3/130
Chunk 4/130
Chunk 5/130
Chunk 6/130
Chunk 7/130
Chunk 8/130
Chunk 9/130
Chunk 10/130
Chunk 11/130
Chunk 12/130
Chunk 13/130
Chunk 14/130
Chunk 15/130
Chunk 16/130
Chunk 17/130
Chunk 18/130
Chunk 19/130
Chunk 20/130
Chunk 21/130
Chunk 22/130
Chunk 23/130
Chunk 24/130
Chunk 25/130
Chunk 26/130
Chunk 27/130
Chunk 28/130
Chunk 29/130
Chunk 30/130
Chunk 31/130
Chunk 32/130
Chunk 33/130
Chunk 34/130
Chunk 35/130
Chunk 36/130
Chunk 37/130
Chunk 38/130
Chunk 39/130
Chunk 40/130
Chunk 41/130
Chunk 42/130
Chunk 43/130
Chunk 44/130
Chunk 45/130
Chunk 46/130
Chunk 47/130
Chunk 48/130
Chunk 49/130
Chunk 50/130
Chunk 51/130
Chunk 52/130
Chunk 53/130
Chunk 54/130
Chunk 55/130
Chunk 56/130
Chunk 57/130
Chunk 58/130
Chunk 59/130
Chunk 60/130
Chunk 61/130
Chunk 62/130
Chunk 63/130
Chunk 64/130
Chunk 65/130
Chunk 66/130
Chunk 67/130
Chunk 68/130
Chunk 69/130
Chunk 70/130
Chunk 71/130
Chunk 72/130
Chunk 73/130
Chunk 74/130
Chunk 75/130
Chunk 76/130
Chunk 77/130
Chunk 78

NameError: name 'chunks_most_similar' is not defined

In [92]:
with open(f"embeddings/{EMBEDDING_NAMEID}_embeddings.json", "r") as f:
    data = json.load(f)
    chunks = data["chunks"]
    embeddings = data["embeddings"]

### shit?

In [None]:
# Display results
for item in embed.find_most_similar:
    print(chunks[item[1]])

In [None]:
#################################################################

#1.0 of above smth further above
paragraphs = []
for chat in elias.selectedChats.values():
    for msg in chat:
        if msg.sender == "Persona":
            paragraphs.append(str(msg))
        
embeddings = [
    ollama.embeddings(model=embed, prompt=chunk)["embedding"] for chunk in paragraphs
]

prompt_embedding = ollama.embeddings(model=embed, prompt=retrieval_prompt)["embedding"]
most_similar_chunks = find_most_similar(prompt_embedding, embeddings)[:5]

print(f"Paragraphs:{len(paragraphs)}, embeds:{len(embeddings)}, nearby_chunks:{len(most_similar_chunks)}")
print("\n".join(paragraphs[item[1]] for item in most_similar_chunks))
#print(f"{most_similar_chunks[0][1]}, {most_similar_chunks[:3]}\n{len(paragraphs)}")

## Retrieve

In [93]:
PROMPT = {
    "role": "system", 
    "content": "You are an actor specializing in impersonating non-famouns people. You will be presented to the subject through explicit datapoints of their digital footprint. In addition, you will deduct their implicit personality traits by shadowing chats between the subject and friends. You will be asked to answer questions from the point of view of the persona. The persona you will be impersonating is named Elias. Context:"
    }

# persona_small = "{small module}"
# persona_med = "{med module}"
# persona_text = "Favorite video games are Rimworld, Minecraft, Age of Empires, 7 Days to Die"
subject = elias

## Prompt Builder

In [94]:
final_prompts = []

for question in surv.questions:
    p = [
        pc.systemMsg(PROMPT['content']+"\n## chat conversions between subject and friends\n".join(chunks[item[1]] for item in most_similar_chunks)),
        pc.assistantMsg('Understood. I will answer from the point of view of the persona, {subject}, based on what I could the deduct from the text provided above.'),
        pc.userMsg("\n".join([
            f'\n\n**Your answer should only contain the chosen option without further explanation!** Reply to the statement below - how {subject} would reply - with one of these five options: {", ".join(surv.POSSIBLE_ANSWERS)}.',
            question
        ])),
    ]
    final_prompts.append(p)

print(f"{len(final_prompts)}")#,{final_prompts[:1]}")

50


In [95]:
# Report prompt tokens
total_all_prompt_tokens = 0
prompt_tokens_min = 0
prompt_tokens_max = 0
for p in final_prompts:
    pt = 0 # Prompt tokens
    for msg in p:
        pt += utils.count_tokens(msg["content"])
    if prompt_tokens_min == 0 or pt < prompt_tokens_min:
        prompt_tokens_min = pt
    if pt > prompt_tokens_max:
        prompt_tokens_max = pt

    total_all_prompt_tokens += pt

print(f"Created {len(final_prompts)} prompts.")
print(f"Average prompt size: {round(total_all_prompt_tokens/len(final_prompts))} tokens.")
print(f"Min prompt size: {prompt_tokens_min}, Max prompt size: {prompt_tokens_max}")

bu.quickJSON(final_prompts, "data-prep/prompts.json")

Created 50 prompts.
Average prompt size: 11388 tokens.
Min prompt size: 11385, Max prompt size: 11393


# Run simulation

In [96]:
# Run Simulation
##################################
SIM_ID = f"elias_personality_03"
LIMIT = 10 # For testing purposes. Set to NONE to run all
AUTO_INFO = {
    "date": bu.get_timestamp(),
    "EMBEDDING_NAMEID": EMBEDDING_NAMEID,
    "RETRIEVAL_PROMPT": RETRIEVAL_PROMPT,
    "CHUNKS_COUNT_IN_CTX": CHUNKS_COUNT_IN_CTX,
    "survey_type": str(type(surv)),
    "prompt_count": min(len(final_prompts), LIMIT) if LIMIT != None else len(final_prompts),
}

SETTINGS = {
     "model": "mistral",
    #  "temperature": 0.5,
     # best wizard and mixtral try mixtral-8x22b wizard in uCloud
}

client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama', # required, but unused
)

save = f"{SETTINGS['model']}_{SIM_ID}"
##################################

In [97]:
### ==== THE FUNCTIONAL 1!!!! =====
results = []
l = len(final_prompts)
# lim = None

for i, (prompt, question) in enumerate(list(zip(final_prompts, surv.questions))):
    if LIMIT != None and i > LIMIT:
        break
    
    print(f"{i}/{l}...", end="\t") # Print progress
    # Send the Request    
    full_response = client.chat.completions.create(
        model=SETTINGS["model"],
        messages=prompt,
        # temperature=SETTINGS["temperature"],
    )

    r = full_response.choices[0].message.content
    results.append({'question': question, 'answer': r})
    print(f"{question}: {r}")

# Save results
df = pd.DataFrame(results)
# df.to_csv(f"results/{save}_simulation.csv", index=False)
df.to_csv(f"results/{SIM_ID}_simulation.csv", index=False)
# bu.quickJSON(final_prompts, f"results/{save}_prompts.json")
bu.quickJSON(final_prompts, f"results/{SIM_ID}_prompts.json")
# bu.quickJSON(SETTINGS, f"results/{save}_setings.json")
bu.quickJSON({"settings": SETTINGS, "info": AUTO_INFO}, f"results/{SIM_ID}_info.json")

0/50...	I am the life of the party.:  SOMEWHAT AGREE
1/50...	I don't talk a lot.:  NEUTRAL
2/50...	I feel comfortable around people.:  AGREE.
3/50...	I keep in the background.:  NEUTRAL
4/50...	I start conversations.:  NEUTRAL
5/50...	I have little to say.:  NEUTRAL
6/50...	I talk to a lot of different people at parties.:  NEUTRAL
7/50...	I don't like to draw attention to myself.:  NEUTRAL
8/50...	I don't mind being the center of attention.:  NEUTRAL
9/50...	I am quiet around strangers.:  NEUTRAL.
10/50...	I get stressed out easily.:  NEUTRAL (The conversational data does not contain enough information to determine if the persona agrees or disagrees with the statement.)


# Analysis

In [98]:
base_kano= 'results/18th results/command-r-plus:104b-q2_K_base_kano_18th_prompts.json'
base_personality = 'results/18th results/command-r-plus:104b-q2_K_base_personality_18th_simulation.csv'
elias_kano = 'results/18th results/command-r-plus:104b-q2_K_elias_kano_simulation.csv'
elias_personality = 'results/18th results/command-r-plus:104b-q2_K_elias_personality_18th_simulation.csv'


In [114]:
df = pd.read_csv(base_personality)

In [115]:
# # csv_file = "surveys/survey_kano-model_v1.csv"
# # surv = survey.KanoSurvey(csv_file)
csv_file = "surveys/survey_personality-test_v1.csv"
surv = survey.PersonalitySurvey(csv_file)

In [103]:
if isinstance(surv, survey.KanoSurvey):
    remap_dict = {"I EXPECT IT": 5, "I LIKE IT": 4, "I AM NEUTRAL": 3, "I CAN TOLERATE IT": 2, "I DISLIKE IT": 1}
    df['answer'] = df['answer'].map(remap_dict)
    df['airidas'] = df['airidas'].map(remap_dict)
    df['elias'] = df['elias'].map(remap_dict)
elif isinstance(surv, survey.PersonalitySurvey):
    remap_dict = {"AGREE": 5, "SOMEWHAT AGREE": 4, "NEUTRAL": 3, "SOMEWHAT DISAGREE": 2, "DISAGREE": 1}
    df['answer'] = df['answer'].map(remap_dict)

df.head()

Unnamed: 0,question,answer
0,I am the life of the party.,3.0
1,I don't talk a lot.,3.0
2,I feel comfortable around people.,3.0
3,I keep in the background.,4.0
4,I start conversations.,4.0


In [None]:
# surv.POSSIBLE_ANSWERS[0]
# list(surv.POSSIBLE_ANSWERS)

In [None]:
# remap_dict = {f"{surv.POSSIBLE_ANSWERS[0]}": 1, f"{surv.POSSIBLE_ANSWERS[1]}": 2, f"{surv.POSSIBLE_ANSWERS[2]}": 3, f"{surv.POSSIBLE_ANSWERS[3]}": 4, f"{surv.POSSIBLE_ANSWERS[4]}": 5}
#remap_dict = {"I EXPECT IT": 5, "I LIKE IT": 4, "I AM NEUTRAL": 3, "I CAN TOLERATE IT": 2, "I DISLIKE IT": 1}
remap_dict = {str(value): index + 1 for index, value in enumerate(surv.POSSIBLE_ANSWERS)}

def extract_uppercase_text(text):
    """Extract uppercase text from a string using regex."""
    
    phrases_to_extract = [
        surv.POSSIBLE_ANSWERS[0],
        surv.POSSIBLE_ANSWERS[1],
        surv.POSSIBLE_ANSWERS[2],
        surv.POSSIBLE_ANSWERS[3],
        surv.POSSIBLE_ANSWERS[4],
    #     "I EXPECT IT",
    #     "I LIKE IT",
    #     "I AM NEUTRAL",
    #     "I CAN TOLERATE IT",
    #     "I DISLIKE IT"
    ]
    pattern = r'\b(?:' + '|'.join(re.escape(phrase) for phrase in phrases_to_extract) + r')\b'
    matches = re.findall(pattern, text, flags=re.IGNORECASE) 
    return ' '.join(matches) if matches else ''

In [104]:
#df = pd.read_csv(f'results/{save}_simulation.csv')
# df = pd.read_csv('results/mistral_elias_personality_02_simulation.csv')
#### Proces simulation output
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])

In [106]:
# drop rows with NaN
df = df.dropna()
df

Unnamed: 0,question,answer,airidas,elias
0,I am the life of the party.,3.0,3,3
1,I don't talk a lot.,3.0,2,2
2,I feel comfortable around people.,3.0,4,4
3,I keep in the background.,4.0,2,4
4,I start conversations.,4.0,4,3
6,I talk to a lot of different people at parties.,1.0,5,4
7,I don't like to draw attention to myself.,1.0,2,3
8,I don't mind being the center of attention.,4.0,4,1
9,I am quiet around strangers.,1.0,1,4
10,I get stressed out easily.,1.0,4,3


In [107]:
# compute one number of how the percentage of correct answers
result_data = {
    "Exact Matches": (df['answer'] == df['airidas']).sum() / len(df),
    "Correlation": df['answer'].corr(df['airidas']),
    "Exact Matches - elias": (df['answer'] == df['elias']).sum() / len(df),
    "Correlation - elias": df['answer'].corr(df['elias']),
}

for k, v in result_data.items():
    print(f"{k}: {v}")


Exact Matches: 0.23809523809523808
Correlation: 0.09798087391026379
Exact Matches - elias: 0.2619047619047619
Correlation - elias: 0.1665474589335875


In [109]:
bu.if_dir_not_exist_make("results")
res = bu.LiveCSV("results/elias_results.csv")

LiveCSV: File /Users/e/Documents/GitHub/Thesis/results/elias_results.csv not existing. Creating new.
brikasutils.quickCSV: Saved 0 as results/elias_results.csv


In [112]:
new_res = {
    # "label": None,
    "SIMULATION_NAMEID": SIM_ID,
    "timestamp": bu.get_timestamp(),
    "survey_type": str(type(surv)),
    # "temperature": SETTINGS["temperature"],
    # "note": "",
    "exact_matches": result_data["Exact Matches"],
    "corr": result_data["Correlation"],
    "exact_matches_elias": result_data["Exact Matches - elias"],
    "corr_elias": result_data["Correlation - elias"],
}

tmp = bu.convert_dicts_to_table([new_res])
res.append_data(tmp[1], tmp[0])

brikasutils.quickCSV: Saved 1 as results/elias_results.csv


In [None]:
SIM_ID = "run1-airidas-personality"

df = pd.read_csv(f'results/{SIM_ID}_simulation.csv')
# df = df.drop(df.columns[0], axis=1) #if loaded from csv, drop the added index col
df.head()

with open(f'results/{SIM_ID}_info.json', 'r') as f:
    AUTO_INFO = json.load(f)
for k, v in AUTO_INFO.items():
    print(f"{k}: {v}")

try:
    if str(type(surv) != AUTO_INFO["survey_type"]):
        print(f"WARNING: surv variable is not of the same type. {str(type(surv))} != {AUTO_INFO['survey_type']}")
except:
    pass

In [None]:
# remove all characters from a black list from the column answer
for substr in utils.BLACKLIST_ANSWER_SUBSTRINGS:
    df['answer'] = df['answer'].apply(lambda x: re.sub(substr, "", x))

df['answer'] = df['answer'].str.upper()
# Update isValid
df['isValid'] = df['answer'].apply(lambda x: x in surv.POSSIBLE_ANSWERS)

# if all values in isValid is true, drop the column, else print a message
if df['isValid'].all():
    df = df.drop('isValid', axis=1)
    print("All answers were valid")
else:
    print("Some answers were not valid")

df
#### Cleanup
# remove all characters from a black list from the column answer
# for substr in utils.BLACKLIST_ANSWER_SUBSTRINGS:
#      df['answer'] = df['answer'].apply(lambda x: re.sub(substr, "", x))
# # Update isValid
#      df['isValid'] = df['answer'].apply(lambda x: x in surv.POSSIBLE_ANSWERS)

# if all values in isValid is true, drop the column, else print a message
# if df['isValid'].all():
#     df = df.drop('isValid', axis=1)
# else:
#     print("Some answers were not valid")

#### Proces simulation output - KANO

In [None]:
# Add airidas and elias answers
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])

df['answer'] = df['answer'].str.upper()
df['airidas'] = df['airidas'].str.upper()
df['elias'] = df['elias'].str.upper()

#### Proces simulation output - PERSONALITY

In [None]:
# Add airidas and elias answers
air = surv.test_answers["airidas"]
eli = surv.test_answers["elias"]
df.insert(2, "airidas", air[:len(df)])
df.insert(3, "elias", eli[:len(df)])

### Remaps - UNIVERSAL

In [None]:
if isinstance(surv, survey.KanoSurvey):
    remap_dict = {"I EXPECT IT": 5, "I LIKE IT": 4, "I AM NEUTRAL": 3, "I CAN TOLERATE IT": 2, "I DISLIKE IT": 1}
    df['answer'] = df['answer'].map(remap_dict)
    df['airidas'] = df['airidas'].map(remap_dict)
    df['elias'] = df['elias'].map(remap_dict)
elif isinstance(surv, survey.PersonalitySurvey):
    remap_dict = {"AGREE": 5, "SOMEWHAT AGREE": 4, "NEUTRAL": 3, "SOMEWHAT DISAGREE": 2, "DISAGREE": 1}
    df['answer'] = df['answer'].map(remap_dict)

df

In [None]:
## Airi
df['answer'] = df['answer'].str.upper()
df['airidas'] = df['airidas'].str.upper()
df['elias'] = df['elias'].str.upper()

df['answer'] = df['answer'].map(remap_dict)
df['airidas'] = df['airidas'].map(remap_dict)
df['elias'] = df['elias'].map(remap_dict)

In [None]:
df['CLONE_eli'] = df['answer'].apply(extract_uppercase_text)
df['CLONE_eli'] = df['CLONE_eli'].str.upper()

In [None]:
# df = df.drop(columns=['uppercase_text'])
# .str.upper() or .lower()
# df['answer'] = df['answer'].map(remap_dict, na_action='ignore')

df['CLONE_eli'] = df['CLONE_eli'].map(remap_dict)
#df['CLONE_eli'] = df['CLONE_eli'].fillna(0).astype(int)
# df['air'] = df['air'].map(remap_dict)
# df['eli'] = df['eli'].map(remap_dict)

In [None]:
# compute one number of how the percentage of correct answers
print(f"Exact Matches: {(df['CLONE_eli'] == df['IRL_eli']).sum() / len(df)}")
print(f"Correlation: {df['CLONE_eli'].corr(df['IRL_eli'])}")

df['elias_correct'] = df['CLONE_eli'] == df['IRL_eli']

Force short JSON answer

Add this to the end of your prompt:
> ```json

Add this to the "stop" sequence:
>```

The idea is to force the model to continue writing json markdown. And end the generation when it outputs "```" which ends the json markdown section.

----
## Modelfile

Command-r-plus
TEMPLATE """{{ if .System }}<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ .System }}<|END_OF_TURN_TOKEN|>{{ end }}{{ if .Prompt }}<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ .Prompt }}<|END_OF_TURN_TOKEN|>{{ end }}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{{ .Response }}<|END_OF_TURN_TOKEN|>"""
PARAMETER stop "<|START_OF_TURN_TOKEN|>"
PARAMETER stop "<|END_OF_TURN_TOKEN|>"


Mixtral
TEMPLATE """ [INST] {{ .System }} {{ .Prompt }} [/INST]"""
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"



TEMPLATE """ [INST] {{ .System }} {{ .Prompt }} ```json [/INST] """
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"
PARAMETER stop "```"








mixtral x22
TEMPLATE """[INST] {{ if .System }}{{ .System }} {{ end }}{{ .Prompt }} [/INST]"""
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"

wizard x22
TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }}"""
SYSTEM """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."""
PARAMETER stop "USER:"
PARAMETER stop "ASSISTANT:"

nomic_embed
TEMPLATE """{{ .Prompt }}"""
PARAMETER num_ctx 8192


Mistral 7b
TEMPLATE """[INST] {{ .System }} {{ .Prompt }} [/INST]"""
PARAMETER stop "[INST]"
PARAMETER stop "[/INST]"

Mistral 7b-wizard
TEMPLATE """{{ if .System }}{{ .System }} {{ end }}{{ if .Prompt }}USER: {{ .Prompt }} {{ end }}ASSISTANT: {{ .Response }}"""
SYSTEM """A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."""
PARAMETER stop "USER:"
PARAMETER stop "ASSISTANT:"