## Data repair
The `dreams_interpretations.csv` dataset is malformed due to the manner in which it was scraped.  
See: https://www.kaggle.com/code/manswad/how-i-scarped-the-notebook

Therefore, it needs to be fixed (semi-automatically)

In [None]:
# preprocess the file itself: run regexes on file: 
# remove extra newline between parts of words
'("[\w\']+)\n(\w+ )*(\w+")',  "$1 $2"

# remove this pesky char \xa0
" " , " "

# remove extra spaces
"\s{2,}" , " "

# replace double quotes with single quotes
'""(([\w ])+)""', "$1"

In [None]:
import pandas as pd
import pandasql as ps
import numpy as np
import re 

keywords_df = pd.read_csv("datasets/dreams_interpretations.csv")

In [None]:
keywords_df

In [None]:
keywords_df["Dream Symbol"].nunique()

In [None]:
keywords_df.sample(10)

In [None]:

dream_df = pd.read_csv("datasets/rsos_dream_data.tsv", sep="\t")
dream_df.head()

In [None]:
dream_df["text_dream"].str.len().hist( bins=200, figsize=(15, 10))

Some dreamers are real graphomaniacs.

In [None]:
keywords_df["Interpretation"].str.len().sort_values(ascending=False).head(30)

In [None]:
keywords_df

In [None]:
keywords_df.iloc[10]

In [None]:
wat = keywords_df["Interpretation"].iloc[71]

In [None]:


def slice_text(text):
    """Split a blob of glued dictionary entries into ``[symbol, interpretation]`` pairs.

    An entry is recognised where a sentence ends (a period), followed by one
    or more words (the symbol), followed by an interpretation opening with
    "To see", "To dream", "To watch" or "To make". A newline is inserted in
    front of every such symbol and a colon after it; the text is then cut
    on those newlines.

    Args:
        text: Raw text. It is expected to start with a period so that the
            first entry is recognised as well.

    Returns:
        A list of ``[symbol, interpretation]`` lists, both whitespace-stripped.
        Everything before the first recognised entry is discarded; an empty
        list is returned when no entry is recognised.

    Raises:
        IndexError: If a line produced by the split contains no colon (which
            can only happen when ``text`` already contained newlines).
    """
    # Normalise non-breaking spaces and collapse runs of spaces.
    text = text.replace("\xa0", " ")
    text = re.sub(" +", " ", text).strip()

    entry_start = r"(\. *)((\w+ )+)(To (see|dream|watch|make))"
    # \1 = sentence end, \2 = symbol words, \4 = opening of the interpretation.
    marked = re.sub(entry_start, r"\1\n\2: \4", text).strip()

    pairs = []
    for line in marked.split("\n")[1:]:
        # Split on the FIRST colon only: the symbol never contains one, but
        # the interpretation may, and must not be truncated at its own colons.
        parts = line.split(":", 1)
        pairs.append([parts[0].strip(), parts[1].strip()])
    return pairs


In [None]:
def itrtr(row):
    """Re-slice one scraped row into ``[symbol, interpretation]`` pairs.

    The row's "Dream Symbol" (with newlines turned into spaces) and its
    "Interpretation" are joined into one string prefixed with ". " and
    passed to ``slice_text``; whatever ``slice_text`` returns is returned.
    """
    symbol = row["Dream Symbol"].replace("\n", " ")
    interpretation = row["Interpretation"]
    return slice_text(f'. {symbol} {interpretation}')

In [None]:
keywords_df.iloc[59]

In [None]:
itrtr(keywords_df.iloc[59])

In [None]:
keywords_df['sliced'] = keywords_df.apply(itrtr, axis=1)
keywords_df

In [None]:
keywords_df['sliced'].values

In [None]:
keywords_df['len'] = keywords_df['sliced'].apply(lambda x: len(x))
keywords_df

In [None]:
glued = keywords_df[keywords_df['len'] >1 ]
glued

In [None]:
# Turn every list of [symbol, interpretation] pairs into one row per pair.
# The only parameter of Series.explode is `ignore_index`; spelling it out as
# a keyword makes it clear that the result is renumbered 0..n-1 (the bare
# positional `1` looked like an axis/column argument).
expl = glued['sliced'].explode(ignore_index=True)
expl

In [None]:
type(expl[0])

In [None]:
expl = pd.DataFrame([{'Dream Symbol':e[0], 'Interpretation': e[1]}  for e in expl])
expl

In [None]:
basic = keywords_df[keywords_df['len'] <=1 ][['Dream Symbol', 'Interpretation']]
basic

In [None]:
fixed = pd.concat([basic, expl], ignore_index=True)
fixed

In [None]:
fixed["len"] = fixed["Interpretation"].str.len()

In [None]:
fixed.sort_values(by="len", ascending=False).head(30)

In [None]:
fixed

In [None]:
fixed.sort_values(by="len", ascending=False).to_html("datasets/fixed_interpretations.html", index=False)

In [None]:
fixed.sort_values(by="Dream Symbol").to_csv("datasets/fixed_interpretations.csv", index=False)

In [None]:
fixed["Dream Symbol"].nunique()

In [None]:
# Group rows that share the exact same interpretation text and collect the
# symbols attached to each one as a comma-separated string.
# group_concat is used rather than string_agg: in SQLite string_agg is only an
# alias for group_concat that was added in version 3.44, so group_concat also
# works with older SQLite builds.
q = """
select Interpretation, count(*) as cnt, group_concat(`Dream Symbol`, ',') as symbols
from fixed
group by Interpretation
order by cnt desc
"""

grouped = ps.sqldf(q)
grouped


In [None]:
len(set(["a","b"]))

In [None]:
grouped["sets"] = grouped.symbols.str.strip().str.split(",").apply(set)
grouped["len"] = grouped.sets.apply(len)
grouped = grouped.sort_values(by="len", ascending=False)
grouped

In [None]:
azaz = grouped[grouped["len"] > 1]
azaz


In [None]:
def fixer(row):
    """Derive a single symbol name from the set of symbols in ``row["sets"]``.

    Args:
        row: Mapping-like row whose "sets" entry is a set of symbol strings.

    Returns:
        The only symbol when the set has exactly one element. Otherwise the
        two shortest symbols, put in alphabetical order and concatenated
        (an empty set yields "").
    """
    symbols = row["sets"]
    if len(symbols) == 1:
        return next(iter(symbols))

    # Break length ties alphabetically. Sorting by length alone left ties in
    # set-iteration order, which changes between interpreter runs because of
    # string hash randomisation, so the chosen pair was not reproducible.
    shortest_two = sorted(symbols, key=lambda s: (len(s), s))[:2]

    # NOTE(review): the two names are joined without any separator, as in the
    # original code - confirm that this is the intended merged name.
    return "".join(sorted(shortest_two))

grouped["Dream Symbol"] = grouped.apply(fixer, axis=1)
grouped = grouped[["Dream Symbol", "Interpretation"]]   
grouped

In [None]:
# Count how many distinct interpretations ended up under each symbol name and
# show them side by side.
# group_concat is used rather than string_agg: in SQLite string_agg is only an
# alias for group_concat that was added in version 3.44, so group_concat also
# works with older SQLite builds.
q = """
select `Dream Symbol`, count(*) as cnt, group_concat(Interpretation, ',') as Interpretations
from grouped
group by `Dream Symbol`
order by cnt desc
"""

extract_keys = ps.sqldf(q)
extract_keys

In [None]:
# Keep one row per symbol: the one with the longest interpretation.
# This relies on SQLite behaviour (pandasql's default backend - confirm if a
# different db_uri is configured): when an aggregate query uses MAX(), the
# bare column `Interpretation` takes its value from the row that holds the
# maximum. NOTE(review): given that, the HAVING clause presumably never
# filters anything out - verify before relying on it.
q = """
SELECT `Dream Symbol`, 
  MAX(LENGTH(Interpretation)) AS max_length, 
  Interpretation
FROM grouped
GROUP BY `Dream Symbol`
HAVING LENGTH(Interpretation) = max_length
ORDER BY `Dream Symbol`
"""
result = ps.sqldf(q)
result

In [None]:
result[["Dream Symbol", "Interpretation"]].to_csv("datasets/fixed_interpretations.csv", index=False)

## Data Preparation

In [None]:
dataset = []

# Instruction text shared by every example; its newlines are flattened to spaces.
prmt = """Given dream description, interpret the meaning of the dream. 
Provided also are the dream symbols that appear in the dream and their meanings. 
Use the dream symbols meanings to help you interpret the dream. """.replace("\n", " ")


for i, ex in dream_df.iterrows():
    # Use at most the first five comma-separated symbols of this dream.
    # NOTE(review): the pieces are not whitespace-stripped, so "a, b" yields
    # " b" - confirm this matches how the symbols are stored.
    keys = ex["Dream Symbol"].split(",")[:5]

    # NOTE(review): symbols are looked up in keywords_df, not in the repaired
    # table written to fixed_interpretations.csv - confirm this is intended.
    syms = keywords_df[keywords_df["Dream Symbol"].isin(keys)]

    # Build the bullet lines straight from the two columns. Using
    # DataFrame.apply(axis=1) here is unsafe: when no symbol matches, apply
    # returns an empty DataFrame, and joining that emits the column names as
    # if they were symbols. The comprehension simply yields no lines.
    descr = [
        f' - {sym}:  {meaning}'
        for sym, meaning in zip(syms["Dream Symbol"], syms["Interpretation"])
    ]
    item = {
        "prompt": prmt,
        "dream": ex["text_dream"],
        "symbols": "\n".join(descr),
    }
    dataset.append(item)


dataset = pd.DataFrame(dataset)
dataset

In [None]:
from summarizer import PromptFormatter, load_mistral_4bit_model
import hashlib
from plotly import express as px

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_family="decoder"

formatter = PromptFormatter(model_family)

model, tokenizer = load_mistral_4bit_model(model_name)

In [None]:
# Render every example into its final prompt string via the formatter.
dataset["input"] = dataset.apply(
    lambda row: formatter.format(row["prompt"], row["dream"], row["symbols"]),
    axis=1,
)
# Prompt length in characters.
dataset["len"] = dataset["input"].str.len()
# Token list per prompt and its length.
dataset["input_tokens"] = dataset["input"].apply(
    lambda text: tokenizer.tokenize(text, truncation=False, max_length=1024)
)
dataset["input_tokens_len"] = dataset["input_tokens"].apply(len)
dataset

In [None]:
del model, tokenizer

In [None]:
def get_hash(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

dataset["hash"] = dataset["input"].apply(get_hash) 


In [None]:
px.histogram(dataset, x = "input_tokens_len")

In [None]:
px.histogram(dataset, x = "len")

In [None]:
px.scatter(dataset, x = "len", y = "input_tokens_len")

In [None]:
dataset

## Results investigation 
This analyzes the results of the dream interpreter processing.

In [None]:
import os
save_dir = "output"

In [None]:
# Load every CSV found in save_dir into one frame, remembering for each row
# the file it came from.
dfs = []

# os.listdir returns names in arbitrary order; sorting makes the row order of
# the concatenated frame reproducible.
for f in sorted(os.listdir(save_dir)):
    if not f.endswith(".csv"):
        continue
    try:
        existing_df = pd.read_csv(os.path.join(save_dir, f))
    except Exception as err:
        # Loading stays best-effort, but a skipped file is reported instead
        # of disappearing silently.
        print(f"Skipping {f}: {err!r}")
        continue
    existing_df["filename"] = f
    dfs.append(existing_df)

dataset = pd.concat(dfs)
dataset

In [None]:
dataset["meaning"] = dataset.interpretation.apply(formatter.unformat)
dataset["meaning_len"] = dataset.meaning.apply(lambda m: len(m.split(" ")))
dataset["interpretation_len"] = dataset.interpretation.apply(lambda m: len(m.split(" ")))

In [None]:
procd = dataset[dataset.filename.str.contains('.*20250506.*', regex= True)][["dream", "symbols", "meaning"]]
procd

In [None]:
from utils import save_df_as_pretty_html

save_df_as_pretty_html(procd, "may06_meanings.html")

In [None]:
px.histogram(dataset, x = "meaning_len")

In [None]:
px.histogram(dataset, x = "interpretation_len")

In [None]:
dataset.sort_values("input_tokens_len", inplace=True)
dataset.reset_index(inplace=True)

In [None]:
px.scatter(dataset, x = dataset.index, y = "input_tokens_len")