## **Step 1 - Keyword Extraction**
***

We have two datasets, one with dream text descriptions:

In [None]:
from keyword_extractor import read_datasets, extract_and_save_keywords_from_dataframes
from yaml_parser import load_config
# Load the project configuration and read the two input tables from it.
config = load_config()
# dream_df holds the dream descriptions; keywords_df holds the dream symbols
# and their interpretations.
dream_df, keywords_df = read_datasets(config)
# Preview the first rows of the dream descriptions.
dream_df.head()

And another one with interpretations of dreams according to keywords:

In [None]:
# Display the table of dream symbols and their interpretations.
keywords_df

Now we will use pretrained LLMs to find which keywords from the keywords dataset appear in each dream description from the dream text dataset.

### **GPT2**
***

In [None]:
# Run the keyword extraction and keep the returned table; later cells read its
# "Dream Symbol" and "text_dream" columns.
# NOTE(review): called without arguments and rebinds the `dream_df` loaded
# above — presumably the helper reads the config/datasets itself; confirm.
dream_df = extract_and_save_keywords_from_dataframes()

## Step 2 - Summarize interpretations

### Load data and prepare (small) dataset for experimenting

In [16]:
import pandas as pd
import pandasql as ps
from plotly import express as px
from datetime import datetime
from transformers import pipeline
from datasets import Dataset
from utils import  release_all_gpu_memory, save_df_as_pretty_html
from summarizer import load_causal_model, batch_generate_interpretations
import torch
import hashlib

In [None]:
# Instruction text shared by every example; the newlines of the literal are
# replaced by spaces so the instruction is a single line.
prmt = """Given dream description, interpret the meaning of the dream. 
Provided also are the dream symbols that appear in the dream and their meanings. 
Use the dream symbols meanings to help you interpret the dream. """.replace("\n", " ")

# Upper bound on how many extracted symbols are looked up per dream.
MAX_SYMBOLS_PER_DREAM = 5

# Whitespace-normalised symbol names, used only for matching so that
# "a, b" style lists still find symbol "b".
symbol_names = keywords_df["Dream Symbol"].str.strip()

records = []
for _, ex in dream_df.iterrows():
    raw_symbols = ex["Dream Symbol"]
    if isinstance(raw_symbols, str):
        keys = [key.strip() for key in raw_symbols.split(",")[:MAX_SYMBOLS_PER_DREAM]]
    else:
        # Missing value (e.g. NaN): no symbols were extracted for this dream.
        keys = []

    syms = keywords_df[symbol_names.isin(keys)]

    # Build the bullet list with a plain comprehension: DataFrame.apply(axis=1)
    # on an empty selection returns an empty DataFrame, and joining that would
    # put the column names into the prompt instead of an empty string.
    descr = [
        f' - {symbol}:  {meaning}'
        for symbol, meaning in zip(syms["Dream Symbol"], syms["Interpretation"])
    ]

    records.append(
        {
            "prompt": prmt,
            "dream": ex["text_dream"],
            "symbols": "\n".join(descr),
        }
    )

# One row per dream with the columns: prompt, dream, symbols.
dataset = pd.DataFrame(records)
dataset


### Summarize with flan-T5-large model

In [None]:
# Release GPU memory (helper from utils) before loading the next model.
release_all_gpu_memory()

In [None]:
# Load the FLAN-T5 model and tokenizer
model_name = "google/flan-t5-large"
# Part after the last "/" — used later to name the output files.
model_name_short = model_name.split("/")[-1]
# Device index handed to the pipeline: GPU 0 when CUDA is available, else -1.
device = 0 if torch.cuda.is_available() else -1
# NOTE(review): the helper is named load_causal_model, while this checkpoint is
# used with the "text2text-generation" task below — confirm the helper picks a
# suitable model class for it.
model, tokenizer = load_causal_model(model_name)

In [None]:
# Build the text2text-generation pipeline around the model loaded above.
text2text_generator = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=1024,           # NOTE(review): likely forwarded to generate() as an output-length cap, not an input limit — confirm
        truncation=True,           # ask the tokenizer to truncate over-long inputs
        device=device,
    )

In [None]:
# Timestamp (two-digit year.month.day-hour) used in the output file names.
# It has hour resolution, so two exports within the same hour share a name.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
# Generate interpretations for the whole dataset in batches of 100; the
# returned frame carries an "interpretation" column that is post-processed below.
result_df = batch_generate_interpretations(dataset, text2text_generator, batch_size=100, max_length=250)


In [None]:
def postproc(out):
    """Return the stripped generated text for one value of the interpretation column.

    Accepts either the raw pipeline record (indexed by "generated_text") or an
    already post-processed string, so re-running this cell does not fail on
    values that were converted by a previous run.
    """
    text = out if isinstance(out, str) else out["generated_text"]
    return text.strip()


result_df["interpretation"] = result_df["interpretation"].apply(postproc)


In [None]:
# Inspect the generated interpretations.
result_df

In [None]:
# List the available columns before selecting the ones to export.
result_df.columns

In [None]:
# Export the prompt, its inputs and the generated interpretation as an HTML
# report and a CSV file; both share a stem built from the model short name and
# the run timestamp.
save_df = result_df.loc[:, ['prompt', 'symbols', 'dream', 'interpretation']]

path = f"output/{model_name_short}_{tstp}"

save_df_as_pretty_html(save_df, f"{path}.html")

save_df.to_csv(f"{path}.csv", index=False)

In [None]:
# Character length of every generated interpretation.
result_df.interpretation.str.len()

### Summarize with Mistral model

In [2]:
from summarizer import load_mistral_4bit_model, find_max_batch_size, PromptFormatter

In [3]:
# Load the Mistral instruct model (4-bit helper) and the matching prompt formatter.
print("Loading Mistral-7B-Instruct in 4-bit...")

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Part after the last "/" — used later to name the output files.
model_name_short = model_name.split("/")[-1]

model_family="decoder"
formatter = PromptFormatter(model_family)
# Leftover note: max_new_tokens=256 was considered as a generation setting but is not passed anywhere here.

# Rebinds `model` and `tokenizer`, replacing any previously loaded model.
model, tokenizer = load_mistral_4bit_model(model_name)


Loading Mistral-7B-Instruct in 4-bit...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

#### Prepare and analyze dataset

In [None]:
# Render the full model input for every example, then record its size both in
# characters ("len") and in tokenizer tokens ("input_tokens_len").
dataset["input"] = dataset.apply(
    lambda row: formatter.format(row['prompt'], row['dream'], row['symbols']),
    axis=1,
)
dataset["len"] = dataset["input"].str.len()
dataset["input_tokens"] = dataset["input"].map(
    lambda text: tokenizer.tokenize(text, truncation=False, max_length=1024)
)
dataset["input_tokens_len"] = dataset["input_tokens"].map(len)
dataset

In [None]:
# Distribution of prompt lengths measured in tokens.
px.histogram(dataset, x = "input_tokens_len")

In [None]:
# Distribution of prompt lengths measured in characters.
px.histogram(dataset, x = "len")

In [None]:
# Relationship between character length and token count of the prompts.
px.scatter(dataset, x = "len", y = "input_tokens_len")

In [None]:
# Keep only prompts with fewer than 1024 tokens (1024 is also the max_length
# used for the pipelines in this notebook).
dataset = dataset[dataset["input_tokens_len"] <1024]
dataset

### Run the pipeline on the whole dataset

In [4]:
# Reload the prepared dataset from disk instead of rebuilding it. The
# commented-out line is presumably how the file was written once.
# NOTE(review): after a CSV round trip the "input_tokens" column holds the text
# representation of each token list, not a list — confirm nothing downstream
# relies on it being a list.
#dataset.to_csv("datasets/prepared_dataset.csv", index=False)
dataset = pd.read_csv("datasets/prepared_dataset.csv")

In [10]:
# Pick one prompt to probe GPU memory limits with.
# NOTE(review): 11549 is a hard-coded row position; presumably chosen because
# it is one of the longest prompts — confirm, and note that it selects a
# different row once `dataset` has been filtered.
sample_prompt = dataset.iloc[11549]["input"]
sample_prompt


"### Instruction:\nGiven dream description, interpret the meaning of the dream.  Provided also are the dream symbols that appear in the dream and their meanings.  Use the dream symbols meanings to help you interpret the dream.\n\n### Dream:\nI had an off-the-wall dream last night, and I didn't remember it until I was in the middle of class today. Isn't that funny? I think somebody said something or did something and it just all came back to me. I dreamt that I was back at Westport High School taking PE with Mrs. Swenson again, and she was a real cool, relaxed, laid back kind of person. Well, both of them were -- Mr. Jameson was mean to everybody else but nice to me for some reason, maybe because I was nice to him from the start. I don't know. But I dreamt that I was back with Mrs. Swenson and there was a gate at the back of the tennis court leading to the track that was one of those crazy emergency exit doors with the stupid alarm things, except the alarm wasn't just at the door; it tr

In [13]:
# Show the rows ordered from the longest to the shortest prompt (in tokens).
dataset.sort_values("input_tokens_len",     ascending=False)

Unnamed: 0,prompt,dream,symbols,input,len,input_tokens,input_tokens_len
11334,"Given dream description, interpret the meaning...",We were going camping and we were in the car a...,- Camping: To dream that you are camping ind...,"### Instruction:\nGiven dream description, int...",4226,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",999
13201,"Given dream description, interpret the meaning...","A Potentially Round Table I'm with my mom, br...",- Canoe: To see a canoe in your dream repres...,"### Instruction:\nGiven dream description, int...",4213,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",999
1937,"Given dream description, interpret the meaning...",I am in a large house. High school friend Dora...,- Haunted House: To dream of a haunted house...,"### Instruction:\nGiven dream description, int...",3996,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",999
14830,"Given dream description, interpret the meaning...",I was in my apartment with a girl whom I know ...,- Accident: To dream that you are in an acci...,"### Instruction:\nGiven dream description, int...",4344,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",999
14541,"Given dream description, interpret the meaning...",We were driving down the road past churches wh...,- Deja Vu: To dream of Déjà Vu indicates som...,"### Instruction:\nGiven dream description, int...",4178,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",999
...,...,...,...,...,...,...,...
6530,"Given dream description, interpret the meaning...",I was doing something that kind of reminded me...,- Geometry: To dream about geometry refers t...,"### Instruction:\nGiven dream description, int...",1480,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",332
13402,"Given dream description, interpret the meaning...",I had a dream about being in a band that plays...,- Band: To see a band or play with a band in...,"### Instruction:\nGiven dream description, int...",1403,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",330
17906,"Given dream description, interpret the meaning...","In a room filled with people, they were awardi...",- Cabaret: To watch a cabaret in your dream ...,"### Instruction:\nGiven dream description, int...",1370,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",329
3100,"Given dream description, interpret the meaning...",A cat is sick and very tired. Then it dies. Th...,- Dachshund: To see a Dachshund in your drea...,"### Instruction:\nGiven dream description, int...",1338,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",324


In [5]:
# Keep only prompts shorter than 1000 tokens. The explicit copy makes the result
# an independent frame, so adding columns to it afterwards does not trigger
# pandas' SettingWithCopyWarning.
dataset = dataset[dataset["input_tokens_len"] <1000].copy()

In [17]:
def get_hash(text):
    """Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

# Fingerprint every prompt: one MD5 hex digest per row of the "input" column.
dataset["hash"] = dataset["input"].map(get_hash)



In [18]:
# Inspect the filtered dataset including the new "hash" column.
dataset

Unnamed: 0,prompt,dream,symbols,input,len,input_tokens,input_tokens_len,hash
0,"Given dream description, interpret the meaning...","The one at the Meads's house, where it's bigge...",- Haunted House: To dream of a haunted house...,"### Instruction:\nGiven dream description, int...",2246,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",540,86650c93e28bd9853d92fe96b4e5700b
1,"Given dream description, interpret the meaning...",I'm at a family reunion in a large fine house ...,- Fainting: To dream that you are fainting s...,"### Instruction:\nGiven dream description, int...",2702,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",644,0e2aab6205953556b287e2e0dd36eaf5
2,"Given dream description, interpret the meaning...",I watch a plane fly past and shortly realize i...,- Abortion: To dream that you have an aborti...,"### Instruction:\nGiven dream description, int...",3467,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",807,9a51bd5d4083fb5f70146f9772f3758a
5,"Given dream description, interpret the meaning...",Living next door to Loretta in an apartment - ...,- Backyard: To dream about your backyard rep...,"### Instruction:\nGiven dream description, int...",2998,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",719,e6a569c9a61ccf47697df9fdfdc20fd6
6,"Given dream description, interpret the meaning...",Kidnapped - I'm on my way somewhere else (by c...,- Abduction: To dream of being abducted indi...,"### Instruction:\nGiven dream description, int...",3005,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",716,b71a456a6af99d9f86aa8f77b72d9d9a
...,...,...,...,...,...,...,...,...
18943,"Given dream description, interpret the meaning...",The dream was about me and my boyfriend going ...,- Landslide: To see a landslide in your drea...,"### Instruction:\nGiven dream description, int...",2617,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",581,992e14d1251d590a36828c60cb96a6d1
18944,"Given dream description, interpret the meaning...",Two weeks ago this guy asked me to Senior Ball...,- Bait: To see bait in your dream suggests t...,"### Instruction:\nGiven dream description, int...",2208,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",531,6948c0d7a3389483cdca760a0fd37003
18945,"Given dream description, interpret the meaning...",My boyfriend just broke up with me so he was o...,- Deja Vu: To dream of Déjà Vu indicates som...,"### Instruction:\nGiven dream description, int...",2503,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",579,c251eae1fa38f7834e6dbfa2e40f5180
18946,"Given dream description, interpret the meaning...",I was in my backyard and I was flying. I would...,- Jumping: To dream that you are jumping ind...,"### Instruction:\nGiven dream description, int...",2174,"['▁###', '▁Inst', 'ruction', ':', '<0x0A>', 'G...",506,e6fe4e448b2104b701f44ad29b57a70e


In [11]:
# Search for the largest batch size that fits on the GPU for the sample prompt.
# Requires `model`, `tokenizer` and `sample_prompt` from the cells above.
print(f"Sample prompt length (characters): {len(sample_prompt)}")
# The helper's log below shows it trying successive batch sizes and reporting
# OOM or success for each before printing the chosen value.
optimal_batch_size = find_max_batch_size(model, tokenizer, sample_prompt, max_possible=2048, max_length=1024)

Device set to use cuda:0


Sample prompt length (characters): 4371
Trying batch_size = 1024... ❌ OOM


Device set to use cuda:0


Trying batch_size = 512... ❌ OOM


Device set to use cuda:0


Trying batch_size = 256... ❌ OOM


Device set to use cuda:0


Trying batch_size = 128... ❌ OOM


Device set to use cuda:0


Trying batch_size = 64... ❌ OOM


Device set to use cuda:0


Trying batch_size = 32... 

Device set to use cuda:0


✅ success
Trying batch_size = 48... 

Device set to use cuda:0


✅ success
Trying batch_size = 56... ❌ OOM


Device set to use cuda:0


Trying batch_size = 52... 

Device set to use cuda:0


✅ success
Trying batch_size = 54... ❌ OOM


Device set to use cuda:0


Trying batch_size = 53... ❌ OOM

✅ Optimal batch size: 52


First run on a 24 GB GPU:
```
Sample prompt length (characters): 2437  
Trying batch_size = 512... ❌ OOM  
Trying batch_size = 256... ❌ OOM  
Trying batch_size = 128... ❌ OOM  
Trying batch_size = 64... ❌ OOM  
Trying batch_size = 32... ✅ success  
Trying batch_size = 48... ✅ success  
Trying batch_size = 56... ✅ success  
Trying batch_size = 60... ❌ OOM  
Trying batch_size = 58... ❌ OOM  
Trying batch_size = 57... ✅ success  

✅ Optimal batch size: 57
```

Second run on a 24 GB GPU:
```
Sample prompt length (characters): 2437
Trying batch_size = 1024... ❌ OOM
Trying batch_size = 512... ❌ OOM
Trying batch_size = 256... ❌ OOM
Trying batch_size = 128... ❌ OOM
Trying batch_size = 64... ✅ success
Trying batch_size = 96... ✅ success
Trying batch_size = 112... ❌ OOM
Trying batch_size = 104... ❌ OOM
Trying batch_size = 100... ❌ OOM
Trying batch_size = 98... ✅ success
Trying batch_size = 99... ❌ OOM

✅ Optimal batch size: 98
```

In [12]:
# Release GPU memory, additionally naming these notebook variables for cleanup.
# NOTE(review): "model" is in the list, yet the pipeline cell further down
# passes `model` again — on a top-to-bottom run the model presumably has to be
# reloaded in between; confirm what the helper does with the named variables.
release_all_gpu_memory(["model","model_pipeline","dataloader"])

['model', 'tokenizer', 'text2text_generator', 'model', 'model_pipeline', 'dataloader']
['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__file__', '__cached__', '__builtins__', 'release_all_gpu_memory', 'globals_snapshot', 'save_df_as_pretty_html']
clearing cuda cache
clearing ipc cache
✅ All GPU memory cleared.


In [6]:
# Wrap the prepared frame in a Hugging Face `datasets.Dataset`.
# NOTE(review): the generation cell below passes `dataset`, not `dataloader` —
# confirm this object is still needed.
dataloader = Dataset.from_pandas(dataset)

In [20]:
# Manual override of the searched value (the search above reported 52).
optimal_batch_size = 50

# Text-generation pipeline around the Mistral model with greedy decoding
# (do_sample=False) and batched inference.
# NOTE(review): for text generation, max_length typically bounds prompt plus
# generated tokens together; with prompts of up to ~1000 tokens that may leave
# very little room for the answer — confirm, and consider max_new_tokens.
model_pipeline = pipeline(
        "text-generation",
        model=model,
        batch_size=optimal_batch_size,
        tokenizer=tokenizer,
        max_length = 1024,
        truncation=True,
        do_sample=False
    )


Device set to use cuda:0


In [11]:
# Development aid: re-import the local `summarizer` module so that edits made
# to it on disk are picked up without restarting the kernel.
from importlib import reload
import summarizer
reload(summarizer)

# Rebind the name imported earlier so it points at the reloaded implementation.
from summarizer import batch_generate_interpretations


In [None]:

print("\n🧠 Running interpretations...")
# Timestamp (two-digit year.month.day-hour) used in the output file names.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")

# Keep the return value: the export cell reads `result_df`, which would
# otherwise be undefined on a fresh kernel or still hold the FLAN-T5 results.
result_df = batch_generate_interpretations(
    dataset,
    model_pipeline,
    input_column="input",
    output_column="interpretation",
    batch_size=optimal_batch_size,
)
# NOTE(review): if the helper fills `dataset` in place and returns nothing,
# fall back to that frame — confirm the helper's return contract.
if result_df is None:
    result_df = dataset



🧠 Running interpretations...
✅ Already processed: 6 / 18776 entries


Generating batches:   0%|          | 0/376 [00:00<?, ?it/s]

Generating batches:   0%|          | 1/376 [08:04<50:27:05, 484.34s/it]

In [None]:

# Export the prompt, its inputs and the generated interpretation as an HTML
# report and a CSV file named after the model and the run timestamp.
save_df = result_df[['prompt', 'symbols','dream', 'interpretation']]

path = f"output/{model_name_short}_{tstp}"
save_df_as_pretty_html(save_df, path + ".html")

# index=False keeps the CSV layout identical to the FLAN-T5 export in this
# notebook and avoids an unnamed index column when the file is read back.
save_df.to_csv(path + ".csv", index = False)

# Memory investigation

In [None]:
from utils import globals_snapshot

# Take a snapshot of the notebook's global variables; the query below treats it
# as a table with (at least) the columns `type` and `var`.
tps = globals_snapshot()

# Count how many variables exist per type, most frequent type first.
q = """
select type, count(var) as cnt
from tps
group by type 
order by cnt desc 

"""
# pandasql runs the SQL against the in-scope table named in the query (`tps`).
df = ps.sqldf(q)
df