## **Step 1 - Keyword Extraction**
***

We have two datasets, one with dream text descriptions:

In [None]:
from keyword_extractor import read_datasets, extract_and_save_keywords_from_dataframes
from yaml_parser import load_config
# Load the project configuration and read the two datasets it points to.
config = load_config()
# read_datasets returns a pair: the dream-description frame and the keywords frame.
dream_df, keywords_df = read_datasets(config)
# Preview the first rows of the dream descriptions.
dream_df.head()

And another one with interpretations of dreams according to keywords:

In [None]:
# Display the keyword/interpretation frame loaded above.
keywords_df

Now we will use pretrained LLMs to extract the keywords listed in the keywords dataset from the dream descriptions in the dream text dataset.

### **GPT2**
***

In [None]:
# Run the keyword-extraction helper and rebind dream_df to whatever it returns.
# NOTE(review): presumably the result carries the "Dream Symbol" column used
# further down and is also saved to disk (per the helper's name) — confirm in
# keyword_extractor.
dream_df = extract_and_save_keywords_from_dataframes()

In [None]:
# Stylesheet text for the exported HTML table: a full-width table with thick
# solid borders whose cells preserve the whitespace/line breaks of the text.
# Every declaration must end with ";" — a missing one makes the browser merge
# it with the following declaration and discard both.
css = """
    .table-style {
                  width: 100%;
                  border-style: solid;
                  border-width: 5px;
}

    .table-style td {
                  white-space: pre;
                  width: 100px;
                  border-style: solid;
                  border-width: 5px;
}
"""

In [None]:
# Export the first 100 dreams with their extracted symbols as a styled HTML file.
#
# Styler.to_html() does not accept DataFrame.to_html()'s `index`, `classes` or
# `border` arguments: unknown keywords are only forwarded to the Jinja template
# and have no effect. The index is therefore hidden and the border/class set
# through the Styler API, and the stylesheet text in `css` is embedded in a
# <style> element instead of being passed as a class name.
# NOTE: Styler.hide(axis="index") requires pandas >= 1.4.
table_html = (
    dream_df[["text_dream", "Dream Symbol"]][:100]
    .style
    .set_table_attributes('class="table-style" border="2"')
    .hide(axis="index")
    .to_html()
)
with open("datasets/dream_and_its_keys.html", "w", encoding="utf-8") as html_file:
    html_file.write(f"<style>\n{css}\n</style>\n{table_html}")


## Step 2 - Summarize interpretations

### Load data and prepare (small) dataset for experimenting

In [None]:
import pandas as pd
from datetime import datetime
from transformers import pipeline
from utils import  release_all_gpu_memory, save_df_as_pretty_html


In [None]:
from summarizer import load_causal_model, batch_generate_interpretations

import torch

In [None]:
# Read the tab-separated dream data from disk and display the frame.
dream_df = pd.read_csv("datasets/rsos_dream_data.tsv", sep="\t")
dream_df

In [None]:
# Read the interpretations CSV (its "Dream Symbol" and "Interpretation" columns
# are used when building prompts below) and display the frame.
keywords_df = pd.read_csv("datasets/fixed_interpretations.csv")
keywords_df

In [None]:
# Keep only the dreams whose description is shorter than 300 characters.
exmpl = dream_df.loc[dream_df["text_dream"].str.len() < 300]

In [None]:
# Reproducibly draw five of the short dreams, keeping only text and symbols.
exmpl = exmpl[["text_dream", "Dream Symbol"]].sample(n=5, random_state=44)

In [None]:
# Display the five sampled dreams.
exmpl

In [None]:
# Inspect the raw symbol strings of the sampled dreams.
exmpl["Dream Symbol"]

In [None]:
# Build one prompt record per sampled dream: the shared instruction text, the
# dream description, and a bullet list with the meanings of its symbols.

# Instruction shared by every record; newlines are flattened to spaces.
prmt = """Given dream description, interpret the meaning of the dream. 
Provided also are the dream symbols that appear in the dream and their meanings. 
Use the dream symbols meanings to help you interpret the dream. """.replace("\n", " ")

dataset = []

for _, ex in exmpl.iterrows():
    # Symbols are stored as one comma-separated string. Strip each fragment so
    # that "a, b" still matches the lookup table, drop empty fragments, and
    # keep at most the first five symbols.
    keys = [k.strip() for k in ex["Dream Symbol"].split(",") if k.strip()][:5]

    # Rows of the interpretation table that belong to this dream's symbols.
    syms = keywords_df[keywords_df["Dream Symbol"].isin(keys)]

    # One " - symbol:  meaning" line per matched row.
    descr = [
        f" - {symbol}:  {meaning}"
        for symbol, meaning in zip(syms["Dream Symbol"], syms["Interpretation"])
    ]

    dataset.append(
        {
            "prompt": prmt,
            "dream": ex["text_dream"],
            "symbols": "\n".join(descr),
        }
    )

# Turn the list of records into a frame with columns prompt / dream / symbols.
dataset = pd.DataFrame(dataset)
dataset


### Summarize with the FLAN-T5-large model

In [None]:
# Call the utils helper that (per its name) releases GPU memory before the
# model is loaded below.
release_all_gpu_memory()

In [None]:
# Load the FLAN-T5 model/tokenizer pair and choose the pipeline device.
model_name = "google/flan-t5-large"

# Last component of the hub id; used later to name the output files.
model_name_short = model_name.rsplit("/", 1)[-1]

# Device index 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# NOTE(review): FLAN-T5 is an encoder-decoder model — confirm that
# load_causal_model loads it with the appropriate model class.
model, tokenizer = load_causal_model(model_name)

In [None]:
# Wrap the loaded model and tokenizer in a text2text-generation pipeline on the
# device selected above.
# NOTE(review): the original comment says max_length=1024 "allows longer
# input"; in transformers generation max_length usually bounds the generated
# sequence instead — confirm which limit is intended. truncation=True asks the
# tokenizer to truncate over-long inputs.
text2text_generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    truncation=True,
    max_length=1024,
)

In [None]:
# Timestamp with hour resolution (yy.mm.dd-HH) identifying this run.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")

# Run the batch helper over the prompt dataset, one prompt at a time.
result_df = batch_generate_interpretations(
    dataset,
    text2text_generator,
    batch_size=1,
    max_length=250,
)


In [None]:
def postproc(out):
    """Return the "generated_text" entry of one raw output, whitespace-stripped."""
    return out["generated_text"].strip()


# Replace every raw pipeline output in the column with its cleaned text.
result_df["interpretation"] = result_df["interpretation"].apply(postproc)


In [None]:
# Display the results with the cleaned interpretation column.
result_df

In [None]:
# List the available columns before selecting the ones to save.
result_df.columns

In [None]:
# Persist this run: select the report columns and write them as HTML and CSV
# under a common "output/<model>_<timestamp>" stem.
path = f"output/{model_name_short}_{tstp}"
save_df = result_df[["prompt", "symbols", "dream", "interpretation"]]

save_df_as_pretty_html(save_df, f"{path}.html")
save_df.to_csv(f"{path}.csv")

In [None]:
# Character length of each generated interpretation.
result_df.interpretation.str.len()

### Summarize with the Mistral model

In [None]:
from summarizer import load_mistral_4bit_model

In [None]:
# Display the prompt dataset that is reused for this model.
dataset

In [None]:
# Call the utils helper with the names of the objects created for the previous
# model.
# NOTE(review): presumably it deletes these globals and frees their GPU
# memory — confirm in utils.
release_all_gpu_memory(["model", "tokenizer", "text2text_generator"])


In [None]:
print("Loading Mistral-7B-Instruct in 4-bit...")

# Generation budget handed to the pipeline built in the next cell.
max_new_tokens = 256

# Hub id of the checkpoint and its last component (used in output file names).
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_name_short = model_name.rsplit("/", 1)[-1]

# Load the model/tokenizer pair through the project's Mistral loader.
model, tokenizer = load_mistral_4bit_model(model_name)


In [None]:
# Wrap the loaded model and tokenizer in a text-generation pipeline with
# sampling turned off, capped at max_new_tokens new tokens.
model_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    max_new_tokens=max_new_tokens,
)

In [None]:

print("\n🧠 Running interpretations...")

# Timestamp with hour resolution (yy.mm.dd-HH) identifying this run.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")

# Run the batch helper over the prompt dataset, four prompts per batch.
result_df = batch_generate_interpretations(
    dataset,
    model_pipeline,
    batch_size=4,
)


In [None]:
def postproc(out):
    """Return the stripped text after the last "Interpretation:" marker.

    Reads the "generated_text" of the first entry of ``out``; when the marker
    is absent, the whole text is returned.
    """
    generated = out[0]["generated_text"]
    return generated.rpartition("Interpretation:")[2].strip()


# Replace every raw pipeline output in the column with its cleaned text.
result_df["interpretation"] = result_df["interpretation"].apply(postproc)


In [None]:
# Display the results with the cleaned interpretation column.
result_df

In [None]:

# Persist this run: select the report columns and write them as HTML and CSV
# under a common "output/<model>_<timestamp>" stem.
path = f"output/{model_name_short}_{tstp}"
save_df = result_df[["prompt", "symbols", "dream", "interpretation"]]

save_df_as_pretty_html(save_df, f"{path}.html")
save_df.to_csv(f"{path}.csv")