## **Step 1 - Keyword Extraction**
***

We have two datasets, one with dream text descriptions:

In [None]:
from keyword_extractor import read_datasets, extract_and_save_keywords_from_dataframes
from yaml_parser import load_config
# Load the project configuration that is handed to the dataset reader.
config = load_config()
# Two frames come back: the dream descriptions and the keyword/interpretation table.
dream_df, keywords_df = read_datasets(config)
# Preview the first rows of the dream descriptions.
dream_df.head()

And another one with interpretations of dreams according to keywords:

In [None]:
# Display the keyword/interpretation table.
keywords_df

Now we will use pretrained LLMs to find which keywords from the keywords dataset appear in each dream description from the dream text dataset.

### **GPT2**
***

In [None]:
# Run keyword extraction; the returned frame replaces the `dream_df` loaded earlier.
# NOTE(review): called with no arguments, so it presumably reads its inputs itself — confirm.
# NOTE(review): later cells read a "Dream Symbol" column from dream_df; presumably it is added here — confirm.
dream_df = extract_and_save_keywords_from_dataframes()

## Step 2 - Summarize interpretations

### Load data and prepare (small) dataset for experimenting

In [None]:
import pandas as pd
import pandasql as ps
from plotly import express as px
from datetime import datetime
from transformers import pipeline
from datasets import Dataset
from utils import  release_all_gpu_memory, save_df_as_pretty_html
from summarizer import load_causal_model, batch_generate_interpretations
import torch

In [None]:
# Instruction shared by every example; newlines are flattened so it is sent as one line.
prmt = """Given dream description, interpret the meaning of the dream. 
Provided also are the dream symbols that appear in the dream and their meanings. 
Use the dream symbols meanings to help you interpret the dream. """.replace("\n", " ")

# Only the first few extracted symbols of each dream are looked up.
MAX_SYMBOLS_PER_DREAM = 5

# Whitespace-normalised lookup column, computed once instead of on every iteration.
known_symbols = keywords_df["Dream Symbol"].str.strip()

records = []
for _, ex in dream_df.iterrows():
    raw_symbols = ex["Dream Symbol"]
    if isinstance(raw_symbols, str):
        # Strip each symbol so "snake, water" matches "water", and drop empty fragments.
        keys = [k.strip() for k in raw_symbols.split(",")[:MAX_SYMBOLS_PER_DREAM]]
        keys = [k for k in keys if k]
    else:
        # Missing value (NaN / None): no symbols for this dream instead of a crash on .split.
        keys = []

    syms = keywords_df[known_symbols.isin(keys)]

    # Build the lines with zip rather than DataFrame.apply(axis=1): on an empty
    # selection apply may hand back a DataFrame, and joining that would join the
    # column names instead of producing an empty string.
    descr = [
        f' - {symbol}:  {meaning}'
        for symbol, meaning in zip(syms["Dream Symbol"], syms["Interpretation"])
    ]

    records.append(
        {
            "prompt": prmt,
            "dream": ex["text_dream"],
            "symbols": "\n".join(descr),
        }
    )

# Explicit columns keep the schema stable even when dream_df has no rows.
dataset = pd.DataFrame(records, columns=["prompt", "dream", "symbols"])
dataset


### Summarize with flan-T5-large model

In [None]:
# Presumably frees GPU memory before the next model is loaded — confirm against utils.
# NOTE(review): called without arguments here but with a list of variable names later in the notebook — confirm what the default releases.
release_all_gpu_memory()

In [None]:
# Step 1: Load FLAN-T5 model and tokenizer
model_name = "google/flan-t5-large"
model_name_short = model_name.split("/")[-1]
device = 0 if torch.cuda.is_available() else -1
model, tokenizer = load_causal_model(model_name)

In [None]:
# Text-to-text generation pipeline wrapping the model and tokenizer loaded above.
text2text_generator = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=1024,           # NOTE(review): pipelines forward max_length to generation, so this likely caps output length rather than input length — confirm
        truncation=True,           # request truncation of over-long inputs
        device=device,
    )

In [None]:
# Hour-resolution timestamp (yy.mm.dd-HH) used to name this run's output files.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
# NOTE(review): the Mistral section calls batch_generate_interpretations with an extra positional `formatter` argument — confirm the current signature still accepts this call.
result_df = batch_generate_interpretations(dataset, text2text_generator, batch_size=100, max_length=250)


In [None]:
def postproc(out):
    """Return the stripped generated text for one entry of the interpretation column.

    Accepts either the raw pipeline output (a mapping with a "generated_text"
    key) or an already post-processed string, so re-running this cell does not
    fail on values that were converted by a previous run.
    """
    text = out if isinstance(out, str) else out["generated_text"]
    return text.strip()


result_df["interpretation"] = result_df["interpretation"].apply(postproc)


In [None]:
# Inspect the generated interpretations.
result_df

In [None]:
# List the available columns before selecting the ones to save.
result_df.columns

In [None]:
import os

# Keep only the columns that go into the report.
save_df = result_df[['prompt', 'symbols','dream', 'interpretation']]

# Both output files share one stem: <model short name>_<run timestamp>.
path = f"output/{model_name_short}_{tstp}"
# DataFrame.to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(path), exist_ok=True)

save_df_as_pretty_html(save_df, path + ".html")

save_df.to_csv(path + ".csv", index = False)

In [None]:
# Character length of every generated interpretation.
result_df["interpretation"].str.len()

### Summarize with Mistral model

In [None]:
from summarizer import load_mistral_4bit_model, find_max_batch_size, PromptFormatter

In [None]:
# Mistral checkpoint identifier; the part after the last "/" is reused in output file names.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_name_short = model_name.rsplit("/", 1)[-1]
model_family = "decoder"

print("Loading Mistral-7B-Instruct in 4-bit...")

# Prompt formatter configured for the chosen model family.
formatter = PromptFormatter(model_family)

# TODO: an earlier note mentioned max_new_tokens=256 as a generation setting — decide whether to use it.
model, tokenizer = load_mistral_4bit_model(model_name)


#### Prepare and analyze dataset

In [None]:
# Render the full model input for every example with the formatter.
dataset["input"] = dataset.apply(
    lambda row: formatter.format(row["prompt"], row["dream"], row["symbols"]),
    axis=1,
)
# Character length of each rendered input.
dataset["len"] = dataset["input"].str.len()
# Token sequence of each input, requested without truncation, and its length.
dataset["input_tokens"] = dataset["input"].map(
    lambda text: tokenizer.tokenize(text, truncation=False, max_length=1024)
)
dataset["input_tokens_len"] = dataset["input_tokens"].map(len)
dataset

In [None]:
# Distribution of input lengths measured in tokens.
px.histogram(dataset, x = "input_tokens_len")

In [None]:
# Distribution of input lengths measured in characters.
px.histogram(dataset, x = "len")

In [None]:
# Relationship between character length and token length of the inputs.
px.scatter(dataset, x = "len", y = "input_tokens_len")

In [None]:
# Keep only the examples whose tokenized input is shorter than 1024 tokens.
fits_context = dataset["input_tokens_len"] < 1024
dataset = dataset.loc[fits_context]
dataset

### Run the pipeline on the whole dataset

In [None]:
import os

# Cache of the prepared dataset. When the file is missing (e.g. a fresh checkout
# followed by Restart & Run All) it is written from the in-memory frame first,
# instead of failing with FileNotFoundError on the read below.
prepared_path = "datasets/prepared_dataset.csv"
if not os.path.exists(prepared_path):
    os.makedirs(os.path.dirname(prepared_path), exist_ok=True)
    dataset.to_csv(prepared_path, index=False)

# Always continue from the on-disk copy so later cells see the same data on every run.
dataset = pd.read_csv(prepared_path)

In [None]:
# Row whose rendered input is used to probe the largest batch that fits in memory.
SAMPLE_INDEX = 3248

prompts = dataset["input"]
if SAMPLE_INDEX < len(prompts):
    # Select the prompt text itself: dataset.iloc[i] alone is a whole row (a Series),
    # whose len() is the number of columns rather than the number of characters.
    sample_prompt = prompts.iloc[SAMPLE_INDEX]
else:
    # Dataset is smaller than expected: fall back to its longest prompt.
    sample_prompt = prompts.loc[prompts.str.len().idxmax()]

print(f"Sample prompt length (characters): {len(sample_prompt)}")
optimal_batch_size = find_max_batch_size(model, tokenizer, sample_prompt,   max_possible=2048,     max_length=1024)

First run on a 24 GB GPU:
```
Sample prompt length (characters): 2437  
Trying batch_size = 512... ❌ OOM  
Trying batch_size = 256... ❌ OOM  
Trying batch_size = 128... ❌ OOM  
Trying batch_size = 64... ❌ OOM  
Trying batch_size = 32... ✅ success  
Trying batch_size = 48... ✅ success  
Trying batch_size = 56... ✅ success  
Trying batch_size = 60... ❌ OOM  
Trying batch_size = 58... ❌ OOM  
Trying batch_size = 57... ✅ success  

✅ Optimal batch size: 57
```

Second run on a 24 GB GPU:
```
Sample prompt length (characters): 2437
Trying batch_size = 1024... ❌ OOM
Trying batch_size = 512... ❌ OOM
Trying batch_size = 256... ❌ OOM
Trying batch_size = 128... ❌ OOM
Trying batch_size = 64... ✅ success
Trying batch_size = 96... ✅ success
Trying batch_size = 112... ❌ OOM
Trying batch_size = 104... ❌ OOM
Trying batch_size = 100... ❌ OOM
Trying batch_size = 98... ✅ success
Trying batch_size = 99... ❌ OOM

✅ Optimal batch size: 98
```

In [None]:
# Presumably releases the GPU memory held by the named notebook variables — confirm against utils.
# NOTE(review): "model" is in the list, yet a later cell still passes `model` to pipeline(...) — confirm the helper does not delete it.
# NOTE(review): on a fresh kernel `model_pipeline` and `dataloader` are not defined yet at this point — confirm missing names are tolerated.
release_all_gpu_memory(["model","model_pipeline","dataloader"])

In [None]:
# Wrap the prepared frame in a `datasets.Dataset`; despite its name, `dataloader` is that Dataset object.
dataloader = Dataset.from_pandas(dataset)

In [None]:
# Hard-coded batch size; this overrides the value returned by find_max_batch_size in the earlier cell.
optimal_batch_size = 96

# Batched, greedy (do_sample=False) text-generation pipeline around the loaded model.
# NOTE(review): for text-generation, max_length presumably counts prompt plus generated tokens,
# so prompts close to 1024 tokens would leave little room for output — confirm, or consider max_new_tokens.
model_pipeline = pipeline(
        "text-generation",
        model=model,
        batch_size=optimal_batch_size,
        tokenizer=tokenizer,
        max_length = 1024,
        truncation=False,
        do_sample=False
    )


In [None]:
# Development aid: re-execute the summarizer module so edits made on disk take
# effect without restarting the kernel.
from importlib import reload
import summarizer

summarizer = reload(summarizer)

# A name bound earlier via `from summarizer import ...` still points at the old
# function object, so rebind it from the freshly reloaded module.
batch_generate_interpretations = summarizer.batch_generate_interpretations


In [None]:

print("\n🧠 Running interpretations...")
# Hour-resolution timestamp (yy.mm.dd-HH) used to name this run's output files.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")

# NOTE(review): this call passes `formatter` as a third positional argument, unlike the FLAN-T5 call earlier — confirm both match the current signature.
result_df = batch_generate_interpretations(dataloader, model_pipeline, formatter, batch_size=optimal_batch_size, max_length=1024)
#print(result_df[["dream", "interpretation"]])


In [None]:

import os

# Keep only the columns that go into the report.
save_df = result_df[['prompt', 'symbols','dream', 'interpretation']]

# Both output files share one stem: <model short name>_<run timestamp>.
path = f"output/{model_name_short}_{tstp}"
# DataFrame.to_csv does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(path), exist_ok=True)

save_df_as_pretty_html(save_df, path + ".html")

# index=False matches the FLAN-T5 export, so both CSV files have the same columns.
save_df.to_csv(path + ".csv", index=False)

# Memory investigation

In [None]:
from utils import globals_snapshot

# Snapshot of the notebook's global variables.
# NOTE(review): presumably a DataFrame with at least `type` and `var` columns, as the query below uses them — confirm.
tps = globals_snapshot()

# Count variables per type, most frequent type first.
q = """
select type, count(var) as cnt
from tps
group by type 
order by cnt desc 

"""
# The query refers to `tps` by name, so that variable must exist in this namespace when sqldf runs.
df = ps.sqldf(q)
df