## **Step 1 - Keyword Extraction**
***

We have two datasets, one with dream text descriptions:

In [None]:
from keyword_extractor import read_datasets, extract_and_save_keywords_from_dataframes
from yaml_parser import load_config
# Load the project configuration, read both datasets with it, and preview the dream texts.
config = load_config()
dream_df, keywords_df = read_datasets(config)
dream_df.head()

And another one with interpretations of dreams according to keywords:

In [None]:
# Display the keyword/interpretation table.
keywords_df

Now we will use pretrained LLMs to find which keywords from the keywords dataset appear in each dream description from the dream text dataset.

### **GPT2**
***

In [None]:
# Run the keyword extraction helper and keep its result as the working dream table.
# NOTE(review): called without arguments although `config` was loaded earlier; the model
# and the input/output locations are decided inside keyword_extractor — confirm.
dream_df = extract_and_save_keywords_from_dataframes()

In [None]:
# Stylesheet for the exported dream/keyword HTML table.
# Every declaration must end with ";" — without it `white-space:pre` runs into the
# following `width` declaration and both are discarded as invalid CSS.
css = """
    .table-style {
                  width: 100%;
                  border-style: solid;
                  border-width: 5px;
}

    .table-style td {
                  white-space:pre;
                  width: 100px;
                  border-style: solid;
                  border-width: 5px;
}
"""

In [None]:
# Export the first 100 dreams with their extracted symbols as an HTML table.
# `index`, `classes` and `border` are DataFrame.to_html options; Styler.to_html does not
# interpret them, so the index was still rendered and the stylesheet never reached the
# file. The index is hidden through the Styler API instead and the CSS is embedded
# explicitly in a <style> element.
styled_table = (
    dream_df[["text_dream", "Dream Symbol"]][:100]
    .style
    .set_table_attributes('class="table-style"')
    .hide(axis="index")
)
with open("datasets/dream_and_its_keys.html", "w", encoding="utf-8") as html_file:
    html_file.write(f"<style>{css}</style>\n{styled_table.to_html()}")


## Step 2 - Summarize interpretations

In [None]:
import pandas as pd
import pandasql as ps
import numpy as np
import re 

In [None]:
# Load the tab-separated dream reports and display them.
dream_df = pd.read_csv("datasets/rsos_dream_data.tsv", delimiter="\t")
dream_df

In [None]:
# Load the symbol -> interpretation table and display it.
keywords_df = pd.read_csv(filepath_or_buffer="datasets/fixed_interpretations.csv")
keywords_df

In [None]:
# Keep only dreams whose text is shorter than 300 characters.
exmpl = dream_df.loc[dream_df["text_dream"].str.len() < 300]

In [None]:
# Draw a reproducible sample of 20 short dreams, keeping only the text and symbol columns.
exmpl = exmpl.loc[:, ["text_dream", "Dream Symbol"]].sample(n=20, random_state=43)

In [None]:
# Display the sampled dreams.
exmpl

In [None]:
# Distinct symbols present in the interpretation table.
keywords = {*keywords_df["Dream Symbol"]}

In [None]:
def extract_keys(dream):
    """Return the dream's symbols that literally occur in its text.

    Args:
        dream: Mapping-like row with a "text_dream" string and a "Dream Symbol"
            string holding comma-separated symbols.

    Returns:
        The symbols, in their original order and spelling, whose lower-cased form
        is a substring of the lower-cased dream text. Blank symbols are skipped,
        and an empty list is returned when either field is not a string
        (for example a missing value).
    """
    text = dream["text_dream"]
    symbols = dream["Dream Symbol"]
    if not isinstance(text, str) or not isinstance(symbols, str):
        return []

    # Lower-case the text once instead of once per candidate key.
    haystack = text.lower()
    # An empty string is a substring of every text, so blank keys must be filtered
    # out explicitly or they would always be reported as matches.
    return [key for key in symbols.split(",") if key.strip() and key.lower() in haystack]

# Try extract_keys on the second sampled dream and show at most ten matching symbols.
tst = exmpl.iloc[1]
print(tst)
keys = extract_keys(tst)[:10]
keys

In [None]:
# Display the "Dream Symbol" column of the sampled dreams.
exmpl["Dream Symbol"]

In [None]:
# Build one record per sampled dream: the shared instruction, the dream text, and a
# bullet list of the meanings of (at most) the first five symbols attached to the dream.

# Same text as the original multi-line literal after its newlines were replaced by spaces.
prmt = (
    "Given dream description, interpret the meaning of the dream.  "
    "Provided also are the dream symbols that appear in the dream and their meanings.  "
    "Use the dream symbols meanings to help you interpret the dream. "
)

rs = 42
dataset = []

for _, ex in exmpl.iterrows():
    print(ex)
    keys = ex["Dream Symbol"].split(",")[:5]

    # Rows of the interpretation table whose symbol is one of this dream's keys.
    syms = keywords_df.loc[keywords_df["Dream Symbol"].isin(keys)]
    descr = [
        f' - {symbol}:  {meaning}'
        for symbol, meaning in zip(syms["Dream Symbol"], syms["Interpretation"])
    ]

    dataset.append(
        {
            "prompt": prmt,
            "dream": ex["text_dream"],
            "symbols": "\n".join(descr),
        }
    )
    rs += 1


dataset = pd.DataFrame(dataset)
dataset


In [None]:
def release_all_gpu_memory(additional_objects=None):
    """Delete known model globals and release cached CUDA memory.

    Removes the module-level names "model", "tokenizer" and "text2text_generator",
    plus any names given in ``additional_objects``, from this module's globals when
    they exist, runs the garbage collector, and empties the CUDA caches if CUDA is
    available.

    Args:
        additional_objects: Optional iterable of extra global variable names to
            delete. ``None`` (the default) means no extra names.
    """
    import gc
    import torch

    # The default is None, which cannot be concatenated to a list; normalise it so the
    # function can be called without arguments.
    extra_names = [] if additional_objects is None else list(additional_objects)
    globals_to_clear = ["model", "tokenizer", "text2text_generator", *extra_names]

    # Objects are only freed once no global name refers to them any more.
    module_globals = globals()
    for name in globals_to_clear:
        if name in module_globals:
            print("clearing ", name)
            del module_globals[name]

    gc.collect()

    if torch.cuda.is_available():
        print("clearing cuda cache")
        torch.cuda.empty_cache()
        print("clearing ipc cache")
        torch.cuda.ipc_collect()

    print("✅ All GPU memory cleared.")

In [None]:
# Delete any leftover model globals and empty the CUDA cache.
release_all_gpu_memory()

### Summarize with flan-T5 model

In [None]:

import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm

# Drop any previously loaded model objects before loading a new one.
release_all_gpu_memory()

# Step 1: Load FLAN-T5 model and tokenizer
model_name = "google/flan-t5-large"
device = 0 if torch.cuda.is_available() else -1  # first GPU when available, otherwise CPU

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Report the input length the tokenizer truncates to. Interpolating `model.config`
# printed the entire configuration object instead of a token count.
print(f"Model can handle up to {tokenizer.model_max_length} tokens.")


text2text_generator = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    truncation=True,   # truncate over-long prompts at the tokenizer level
    # NOTE(review): `max_length` looks like a generation (output length) setting rather
    # than an input limit, and batch_interpret_df overrides it per call — confirm.
    max_length=1024,
    device=device,
)


# Step 2: Define input formatting
def format_instruction(prompt, dream, symbols):
    """Assemble the FLAN-T5 input text from an instruction, a dream and its symbols.

    Each argument is stripped of surrounding whitespace; the sections are separated
    by blank lines and the text ends with an open "Interpretation:" cue.
    """
    sections = [
        f"Instruction: {prompt.strip()}",
        f"Dream: {dream.strip()}",
        f"Symbols:\n{symbols.strip()}",
        "Interpretation:",
    ]
    return "\n\n".join(sections)

# Step 3: Batch interpret function
def batch_interpret_df(df, model_pipeline, batch_size=4, max_output_length=250):
    """Generate an interpretation for every row of ``df`` in batches.

    Args:
        df: DataFrame with "prompt", "dream" and "symbols" columns.
        model_pipeline: Callable taking a list of input strings plus ``max_length``
            and ``do_sample`` keywords and returning, per input, a dict with a
            "generated_text" entry.
        batch_size: Number of rows sent to the pipeline per call.
        max_output_length: Value passed to the pipeline as ``max_length``.

    Returns:
        The same ``df`` object, modified in place with an added "interpretation" column.
    """
    generated = []
    batch_starts = range(0, len(df), batch_size)
    for start in tqdm(batch_starts, desc="Generating Interpretations"):
        chunk = df.iloc[start:start + batch_size]

        inputs = []
        for _, row in chunk.iterrows():
            inputs.append(format_instruction(row["prompt"], row["dream"], row["symbols"]))

        # Character length of the first prompt in the batch.
        print(len(inputs[0]))

        results = model_pipeline(inputs, max_length=max_output_length, do_sample=False)
        generated += [result["generated_text"] for result in results]

    df["interpretation"] = generated
    return df


In [None]:
# Show the class of the loaded model object.
type(model)

In [None]:
# Generate FLAN-T5 interpretations one prompt per pipeline call. batch_interpret_df adds the
# "interpretation" column to `dataset` itself and returns that same DataFrame.
result_df = batch_interpret_df(dataset, text2text_generator, batch_size=1)
# print(result_df[["dream", "interpretation"]])


In [None]:
# Display the prompts together with the generated interpretations.
result_df

In [None]:
# List the column names of the result table.
result_df.columns

In [None]:
# Write the prompt, symbols, dream and interpretation columns to an HTML table (no index).
result_df.loc[:, ['prompt', 'symbols', 'dream', 'interpretation']].to_html(
    buf="datasets/dream_interpretations.html",
    index=False,
)

In [None]:
# Character length of each generated interpretation.
result_df["interpretation"].str.len()

### Summarize with Mistral model

In [None]:
# 🔧 Script: dream_interpreter_mistral_4bit.py
# ------------------------------------------
# This script loads the Mistral-7B-Instruct model in 4-bit quantized mode
# and runs batch dream interpretation using instruction prompting.

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from tqdm import tqdm


# Format prompt for causal model

def format_mistral_input(prompt, dream, symbols):
    """Build the Markdown-style prompt for the causal model.

    Each argument is stripped and placed under its own "###" heading; the prompt
    ends with an open "### Interpretation:" heading for the model to continue.
    """
    sections = (
        ("### Instruction:", prompt.strip()),
        ("### Dream:", dream.strip()),
        ("### Symbols:", symbols.strip()),
    )
    filled = "".join(f"{heading}\n{text}\n\n" for heading, text in sections)
    return filled + "### Interpretation:"

# Load Mistral 7B in 4-bit mode
def load_mistral_4bit_pipeline(model_name="mistralai/Mistral-7B-Instruct-v0.2", max_new_tokens=256):
    """Load a causal language model in 4-bit quantized form as a text-generation pipeline.

    Args:
        model_name: Model identifier passed to ``from_pretrained``. The default now
            carries an explicit version suffix and matches the identifier this
            notebook actually loads; the previous default had no version suffix.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        A "text-generation" pipeline configured for greedy decoding
        (``do_sample=False``).
    """
    # NF4 4-bit weights with double quantization; computation runs in float16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
    )

    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

# Batch processing
def batch_generate_interpretations(df, model_pipeline, batch_size=2, max_input_tokens=8192):
    """Generate an interpretation for every row of ``df`` with a text-generation pipeline.

    Args:
        df: DataFrame with "prompt", "dream" and "symbols" columns.
        model_pipeline: Pipeline exposing ``tokenizer.encode`` and, when called with
            a list of prompts, returning per prompt a list whose first item is a
            dict with a "generated_text" entry.
        batch_size: Number of prompts passed to the pipeline per call.
        max_input_tokens: Token count above which a warning is printed. The prompt
            is NOT shortened here; the warning is informational only.

    Returns:
        The same ``df`` object, modified in place with an added "interpretation" column.
    """
    marker = "### Interpretation:"
    interpretations = []
    for start in tqdm(range(0, len(df), batch_size), desc="Generating Interpretations"):
        batch_df = df.iloc[start:start + batch_size]

        prompts = [
            format_mistral_input(row["prompt"], row["dream"], row["symbols"])
            for _, row in batch_df.iterrows()
        ]

        for prompt in prompts:
            token_count = len(model_pipeline.tokenizer.encode(prompt))
            if token_count > max_input_tokens:
                # Nothing is truncated in this function, so the message only reports
                # that the limit was exceeded.
                print(f"⚠️ Prompt exceeds limit: {token_count} tokens (limit = {max_input_tokens})")

        outputs = model_pipeline(prompts)
        for prompt, out in zip(prompts, outputs):
            generated = out[0]["generated_text"]
            if generated.startswith(prompt):
                # Remove the echoed prompt by position, so a completion that repeats
                # the marker itself is kept in full.
                completion = generated[len(prompt):]
            else:
                # Fallback when the prompt is not echoed verbatim.
                completion = generated.split(marker)[-1]
            interpretations.append(completion.strip())

    df["interpretation"] = interpretations
    return df


In [None]:
# Display the prompt dataset.
dataset

In [None]:
# Delete the previous model globals (and an earlier Mistral pipeline, if any) before
# loading the 4-bit model. The status glyph in the message was mis-encoded and is restored.
release_all_gpu_memory(["model_pipeline"])
print("\n🔁 Loading Mistral-7B-Instruct in 4-bit...")
model_pipeline = load_mistral_4bit_pipeline("mistralai/Mistral-7B-Instruct-v0.2")


In [None]:

print("\nüß† Running interpretations...")

result_df = batch_generate_interpretations(dataset, model_pipeline, batch_size=4)
print(result_df[["dream", "interpretation"]])


In [None]:
# Display the prompts together with the generated interpretations.
result_df

In [None]:
def save_df_as_pretty_html(df, filename="output.html"):
    """Write ``df`` to ``filename`` as a standalone, styled HTML table.

    Every cell is converted to text and its newlines are turned into ``<br>`` so
    multi-line values keep their line breaks. The input DataFrame is not modified.

    Args:
        df: DataFrame to export.
        filename: Path of the HTML file to create (UTF-8).

    NOTE(review): cell contents are written unescaped (``escape=False``) so that
    ``<br>`` renders; any other markup in the data is emitted as-is. Only use this
    with trusted text.
    """
    # Convert newlines to <br> for HTML, on a stringified copy of the data.
    df_html_ready = df.astype(str).apply(
        lambda column: column.str.replace('\n', '<br>', regex=False)
    )

    # Generate styled HTML
    html = df_html_ready.to_html(
        escape=False,  # Needed to render <br>
        index=False,
        border=0,
        classes="styled-table",
    )

    # Add CSS styling
    style = """
    <style>
    .styled-table {
        border-collapse: collapse;
        margin: 25px 0;
        font-size: 16px;
        font-family: Arial, sans-serif;
        width: 100%;
        table-layout: auto;
        word-break: break-word;
    }
    .styled-table th, .styled-table td {
        border: 1px solid #dddddd;
        padding: 10px;
        vertical-align: top;
        text-align: left;
    }
    .styled-table th {
        background-color: #f2f2f2;
    }
    </style>
    """

    # Write full HTML document
    with open(filename, "w", encoding="utf-8") as f:
        f.write(f"<!DOCTYPE html><html><head>{style}</head><body>{html}</body></html>")

    # The status glyph was mis-encoded in the original message; restored here.
    print(f"✅ HTML table saved to: {filename}")

In [None]:
# Turn the newlines between symbol bullets into <br> and export the Mistral results.
# The pattern is now a real newline matched literally (regex=False): the raw string
# r"\n" without an explicit `regex` argument is treated as a regular expression by some
# pandas versions and as a literal backslash + "n" by others.
result_df["symbols"] = result_df["symbols"].str.replace("\n", "<br>", regex=False)
save_df = result_df[['prompt', 'symbols','dream', 'interpretation']]
path = "datasets/mistral_interpretations.html"
save_df_as_pretty_html(save_df, path)