In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string
import lxml
import time
import os
import lxml.html as lhtml
from datetime import datetime
import pandas as pd
import re
from utils import save_df_as_pretty_html
from plotly import express as px
# NOTE(review): no base URL is defined at this point; `base_url` is assigned
# in the download cell further down.


# Module-level lists intended for dream symbols and meanings.
# NOTE(review): nothing visible in this notebook reads them
# (squeeze_meaning_by_symbol uses its own local `meanings`) - confirm and remove.
dreams = []
meanings = []

## Data Scraping from dreammoods.com

The website structure is fairly simple. Dream symbols are grouped into pages of their first letter:  
http://www.dreammoods.com/dreamdictionary/c_all.htm

In [None]:

def download_page(url, filename):
    """Download ``url`` into ``filename`` unless that file already exists.

    When the server does not answer with HTTP 200 and the URL contains
    ``_all``, the download is retried once with ``_all`` removed.

    Args:
        url: Address of the page to fetch.
        filename: Local path the response text is written to (UTF-8).

    Returns:
        'skip' if ``filename`` already exists (no request is made),
        'ok' if the page, or its alternative URL, was saved,
        'fail' if nothing could be retrieved.
    """
    if os.path.exists(filename):
        print (f"File {filename} already exists, skipping download.")
        return 'skip'

    # A timeout keeps a single unresponsive request from hanging the whole crawl.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:  # Check if the page was successfully fetched
        parent = os.path.dirname(filename)
        if parent:
            os.makedirs(parent, exist_ok=True)
        # Explicit encoding: fix_html_file reads these files back as UTF-8.
        with open(filename, "w", encoding="utf-8") as f:
            f.write(response.text)
        print(f"Downloaded {url} to {filename}")
        return 'ok'

    newurl = url.replace("_all","")
    if newurl != url:
        print(f"Failed to retrieve {url}, trying alternative")
        # Return the retry's status; without this the caller received None.
        return download_page(newurl, filename)

    print(f"Failed to retrieve {url} and no alternative available.")
    return 'fail'
        



In [None]:

base_url = "http://www.dreammoods.com/dreamdictionary/"
suffix = "{}_all.htm"

# Save the pages where the extraction cells below read them from.
dict_dir = "html/dreamdictionary"
os.makedirs(dict_dir, exist_ok=True)

for letter in string.ascii_lowercase:

    file = f"{dict_dir}/{letter}_all.html"

    url = base_url + suffix.format(letter)

    status = download_page(url, file)
    if status != 'skip':
        # Pause half a second after a real request to avoid overwhelming the
        # server; cached pages need no pause.
        time.sleep(.5)


### Custom scraper
Their HTML structure, however, is anything but simple. So we had to build a highly customized scraper.

In [None]:
# use this string to debug code near specific text in html
breakpointstring = "your anchor text"

def brpt_anchor(val):
    """Print a marker when ``val`` contains the current ``breakpointstring``.

    The comparison is case-insensitive and works on ``str(val)``. The
    function only prints; it returns nothing.
    """
    needle = breakpointstring.lower()
    haystack = str(val).lower()
    if needle not in haystack:
        return
    print("found anchor text: ", breakpointstring, ", value: ", val)

In [None]:

def break_attr(dic):
    """Remove the 'style', 'align' and 'face' keys from ``dic`` in place.

    Returns ``str(dic)`` of the mapping after the removal.
    """
    for key in ('style', 'align', 'face'):
        dic.pop(key, None)
    return str(dic)

def classify(t):
    """Assign one of the scraper's coarse classes to an lxml node.

    Returns:
        "trsh" for nodes to ignore (comments, div/script/iframe/img,
        ``#Top`` anchors), "sym" for nodes treated as symbol headings,
        "link" for anchors with any other href, and "mean" for everything
        else (the fallback).
    """
    brpt_anchor(t.text)

    # Comments carry no content for the scraper.
    if type(t) is lxml.html.HtmlComment:
        return "trsh"

    tag = t.tag

    if tag == "b":
        return "mean"

    if tag == "strong":
        return "sym"

    if tag == "font":
        # Only sizes "4" and "+1" mark a symbol; any other size, or no size
        # attribute at all, is meaning text.
        return "sym" if t.attrib.get('size') in ("4", "+1") else "mean"

    if tag == "a":
        if 'href' in t.attrib:
            return "trsh" if t.attrib['href'] == "#Top" else "link"
        if 'name' in t.attrib:
            return "sym"
        return "mean"

    if tag in ("div", "script", "iframe", "img"):
        return "trsh"

    return "mean"

def fetch_link(chld):
    """Return the first href found at or below ``chld``, wrapped in commas.

    Args:
        chld: Node exposing ``tag``, ``attrib`` and ``getchildren()``.

    Returns:
        ``",<href>,"`` for the first anchor with an ``href`` in document
        order, or ``""`` when there is none. The result is always a string.
    """
    if chld.tag == "a":
        # An anchor without href (e.g. a named anchor) used to fall through
        # and return None, which the recursive caller mistook for a hit.
        if 'href' in chld.attrib:
            return ',' + chld.attrib['href'] + ','
        return ""

    for c in chld.getchildren():
        href = fetch_link(c)
        if href:
            return href
    return ""


def collect(node, cl):
    """Return the text contribution of ``node`` for its class ``cl``.

    "trsh" nodes contribute ``""``; "link" nodes contribute the result of
    ``fetch_link``; other nodes contribute their whitespace-normalised
    text + tail, with a trailing space appended when ``cl`` is "mean".
    """
    brpt_anchor(node.text)

    if "trsh" in cl:
        return ""

    if "link" in cl:
        return fetch_link(node)

    # NOTE(review): a node without text returns "" here even when it has a
    # tail, so that tail is not collected - confirm this is intended.
    if node.text is None:
        return ""

    merged = "".join(str(part or '') for part in (node.text, node.tail))
    merged = merged.replace("\xa0"," ")
    for pattern, replacement in ((r"\n+", "\n"), (r'\s+', ' ')):
        merged = re.sub(pattern, replacement, merged)

    return merged + " " if cl == "mean" else merged


def breakdown(node):
    """Walk ``node`` depth-first and gather classes and text pieces.

    Returns:
        ``(["trsh"], "")`` when ``node`` itself is classified as trash (its
        children are not visited); otherwise ``(classes, contents)``, two
        lists starting with the node's own class/text and followed by what
        its children returned.
    """
    brpt_anchor(node.text_content())

    own_class = classify(node)
    own_text = collect(node, own_class)

    if "trsh" in own_class:
        return ["trsh"], ""

    classes, contents = [own_class], [own_text]
    for child in node.getchildren():
        child_classes, child_contents = breakdown(child)
        classes.extend(child_classes)
        # A trash child returns "" here, which extends the list by nothing.
        contents.extend(child_contents)

    return classes, contents


def extract_reference(cont):
    """Turn the text pieces of a link paragraph into a ``#Ref:<anchor>`` marker.

    Every piece is lower-cased. The first piece that is a dream-dictionary
    URL with a ``#`` fragment supplies the anchor. If any non-URL piece
    contains "lease" or "see" together with "also", the paragraph is treated
    as a "see also" note and ``None`` is returned.

    Returns:
        ``"#Ref:<anchor>"`` or ``None``.
    """
    first_ref = None
    for piece in cont:
        lowered = piece.lower()
        if "dreammoods.com/dreamdictionary" in lowered and "#" in lowered:
            if first_ref is None:
                first_ref = "#Ref:" + lowered.split("#")[-1].strip(",")
            continue
        mentions_see = "lease" in lowered or "see" in lowered
        if mentions_see and "also" in lowered:
            return None
    return first_ref



def collect_paragraphs(r):
    """Summarise every direct child of ``r`` as one record.

    Each record holds the raw class list ("clas"), the winning class
    ("final_class": "sym" beats "link" beats "mean"), the raw text pieces
    ("cont") and the final "text" (a reference marker for link paragraphs,
    whitespace-collapsed joined text otherwise).
    """
    def _summarise(child):
        clas, cont = breakdown(child)
        fclas = next((kind for kind in ("sym", "link") if kind in clas), "mean")
        if fclas == "link":
            text = extract_reference(cont)
        else:
            text = re.sub(r'\s+', ' ', "".join(cont).strip())
        return {"clas": clas, "final_class":fclas, "cont": cont, "text": text}

    return [_summarise(child) for child in r.getchildren()]



Test with a single file, e.g. c_all.html

In [None]:
breakpointstring = "door kno"

# Parse one downloaded letter page and run the paragraph collector on the
# table cell that holds the dictionary entries.
with open("html/dreamdictionary/d_all.html", "r") as f:
    html = f.read().replace("\xa0"," ")

xp = "/html/body/table[2]/tr/td/div/center/table/tr[4]/td[1]"
tree = lhtml.fromstring(html)
tabl = tree.xpath(xp)[0]

sd = collect_paragraphs(tabl)


In [None]:
# Rebind `sd` from the list of paragraph records to a DataFrame.
# NOTE(review): `sd` changes type in this cell; later cells rely on the DataFrame form.
sd = pd.DataFrame(sd)
sd

Extract links

In [None]:
# Paragraphs whose final class is a cross-reference link.
links = sd.loc[sd["final_class"] == "link"]
links


In [None]:
def squeeze_meaning_by_symbol(syms_data):
    """Group scraped paragraphs into one row per symbol.

    Rows are read in order. A row with ``final_class == "sym"`` starts a new
    symbol; the non-``None`` ``text`` of every following row is collected as
    one of its meanings.

    Args:
        syms_data: DataFrame with at least ``final_class`` and ``text`` columns.

    Returns:
        DataFrame with columns ``symbol``, ``meaning`` (list of strings) and
        ``n_meanings``. As before, text that precedes the first symbol is
        stored under the empty symbol ``""``.
    """
    rows = []
    sym = ""
    meanings = []

    for _, r in syms_data.iterrows():
        if r['final_class'] == "sym":
            rows.append({"symbol": sym, "meaning": meanings})
            meanings = []
            sym = r['text']
        elif r['text'] is not None:
            meanings.append(r['text'])

    # Flush the group still open when the rows run out; without this the last
    # symbol of every page was silently dropped.
    if sym or meanings:
        rows.append({"symbol": sym, "meaning": meanings})

    # Explicit columns keep the accessors below working for empty input.
    data2 = pd.DataFrame(rows, columns=["symbol", "meaning"])
    data2['n_meanings'] = data2.meaning.apply(len)
    return data2



In [None]:
breakpointstring = "animals being abused"

# One row per (symbol, meaning) pair with the meaning's length in characters;
# rows containing any missing value are dropped.
data2 = squeeze_meaning_by_symbol(sd)
data3 = (
    data2
    .explode('meaning')
    .assign(mean_len=lambda frame: frame.meaning.str.len())
    .dropna()
)
data3

In [None]:
# tc: total number of characters across all extracted meanings.
tc = int(data3['mean_len'].sum())
# tstp: current local time as yy.mm.dd-HH, used in output file names.
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
tc, tstp

total chars extracted from c_all.html:  
25.05.10 19:00 - 191953


In [None]:
# Make sure the output folder exists so the cell works on a fresh checkout.
os.makedirs("datasets", exist_ok=True)
# NOTE(review): the file name says "c_scraped" while the test cell above loads
# d_all.html - confirm which letter this snapshot is meant to hold.
data3.to_csv(f"datasets/c_scraped_{tstp}_{tc}.csv", index=False)

In the end, the above logic can be summarized in a function: 

In [None]:
def extract_letter(file, xp="/html/body/table[2]/tr/td/div/center/table/tr[4]/td[1]"):
    """Extract (symbol, meaning) rows from one downloaded letter page.

    Args:
        file: Path of the HTML file to parse.
        xp: XPath of the element whose children are the dictionary
            paragraphs. Defaults to the location used by the letter pages.

    Returns:
        DataFrame with one row per symbol/meaning pair and a ``mean_len``
        column (meaning length in characters); rows with missing values are
        dropped.

    Raises:
        IndexError: if ``xp`` matches nothing in the file.
    """
    with open(file, "r") as f:
        html = f.read()

    tree = lhtml.fromstring(html)

    matches = tree.xpath(xp)
    if not matches:
        # Same exception type as the bare [0] lookup, but says what went wrong.
        raise IndexError(f"XPath {xp!r} matched nothing in {file}")

    syms_data = pd.DataFrame(collect_paragraphs(matches[0]))
    data2 = squeeze_meaning_by_symbol(syms_data)
    data3 = data2.explode('meaning')
    data3['mean_len'] = data3.meaning.str.len()
    return data3.dropna()



Then we run it on every downloaded letter page to extract the full dataset.

In [None]:


dfs = []
failed = []  # pages that could not be parsed, kept for inspection
save_dir = 'html/dreamdictionary'
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
# sorted(): os.listdir order is arbitrary, sorting makes the dataset reproducible.
for f in sorted(os.listdir(save_dir)):
    if f.endswith(".html"):
        try:
            existing_df = extract_letter(os.path.join(save_dir, f))
        except Exception as exc:
            # Keep going, but report which page was lost instead of hiding it.
            print(f"Skipping {f}: {type(exc).__name__}: {exc}")
            failed.append(f)
            continue
        existing_df["filename"] = f
        dfs.append(existing_df)

dataset = pd.concat(dfs)
dataset = dataset[dataset['mean_len'] > 0]
dataset

In [None]:
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
fname = f"datasets/rescraped_{tstp}"

# Make sure the output folder exists so the cell works on a fresh checkout.
os.makedirs("datasets", exist_ok=True)

save_df_as_pretty_html(dataset, fname + ".html")
dataset.to_csv(fname + ".csv", index=False)

In [None]:
dataset #[dataset['mean_len'] ==0]

In [None]:
# Titled and labelled so the figure can be read on its own.
px.histogram(
    dataset,
    x="mean_len",
    title="Length of extracted meanings",
    labels={"mean_len": "meaning length (characters)"},
)

### Potential data augmentation
The dream interpretations generally have a rather consistent structure. This could perhaps be leveraged to extract more detailed dream symbols from the data without running it through an LLM. Left for further development.

In [None]:
# potential regex to split symbol and meaning
# (a bare string literal for now: it is neither compiled nor applied anywhere in this cell)

"(suggest|represent|symbolize|indicate|signif|mean|analogous|implies|denote|refers to)y?(ie)?s?ze? "

## Dreambank

On this godforsaken website there is a section called dreambank, which holds some dream descriptions together with their interpretations by the moderators.  
We'd like to use these as our test data, so we need to scrape them as well.  



### Scrape the list of dreams
First, we need to scrape the dreambank.html page, which holds the links to all the interpreted dreams.

In [None]:
file = "html/dreambank/dreambank.html"

url = "http://www.dreammoods.com/dreambank/"
# Make sure the target folder exists before the page is written.
os.makedirs(os.path.dirname(file), exist_ok=True)
download_page(url, file)

In [None]:
def extract_bank(file, xpath):
    """Parse ``file``, select the first match of ``xpath`` and break it down.

    FIXME(review): this version looks stale relative to the rest of the
    notebook and is redefined further down:
      * ``breakdown`` as defined in this notebook returns two values, so the
        three-name unpacking below raises ValueError;
      * the global ``syms_data`` is reset to ``[]`` and nothing visible here
        fills it - presumably an earlier ``breakdown`` appended to it.
    """
    with open(file, "r") as f:
        html = f.read()       

    tree = lhtml.fromstring(html)

    #tree.text_content()

    tabl = tree.xpath(xpath)[0]

    # Results are handed to the caller through the module-level `syms_data`.
    global syms_data
    syms_data = []
    r,c,t = breakdown(tabl)
    syms_data = pd.DataFrame(syms_data)
    data2 = squeeze_meaning_by_symbol(syms_data)
    data3 = data2.explode('meaning')
    data3['mean_len'] = data3.meaning.str.len()
    return data3

# The three table cells of the dreambank index page that are scanned for links.
xps = [ "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[1]" ,
        "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[2]",
        "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[3]"
]
sd = []
for xp in xps:
    # The return value is ignored; the global `syms_data` set by the call is collected instead.
    extract_bank(file, xp)
    sd.append(syms_data)

df = pd.concat(sd)


In [None]:
df

Next, we extract the actual links and their description text. 

In [None]:
# Keep only link rows. The copy makes the column assignment below operate on
# an independent frame instead of a slice of the previous `df`.
df = df[ df.clas == "link"].copy()
# `cont` is split on the first two commas into: leading piece, link, name.
df[['_', 'link', 'name']] = df['cont'].str.split(',', n=2, expand=True)
df

The links contain a search term. 
We'll use this search term as filename to store the downloaded page

In [None]:
# Take the text after the first 'search=' in each link and cut it at the next '&'.
# NOTE(review): the `[1]` column lookup presumably raises KeyError when no link contains 'search=' - verify.
df["srch"] = df.link.str.split('search=', n=2, expand=True)[1].str.split('&', n=2, expand=True)[0]

Some links do not conform to this structure, so we'll use their page name verbatim

In [None]:
# For rows without a search term, split the link on "/" and use the piece at
# index 4 as `srch` instead.
wat = df[df.srch.isna()].link.str.split("/", n=5, expand=True)
df.loc[df.srch.isna(),'srch'] = wat.loc[:,4]
df

The majority of those links do not work. By analyzing the URL structure, we came up with an educated guess as to how they might be fixed. Some of the repaired links worked.

In [None]:
# Replace every "dreambank" in a link with that row's search term, whose last
# character is swapped for "s".
# NOTE(review): the cell rewrites `df.link` in place, so re-running it works on the already rewritten links.
df.link =  df.apply(lambda x: x.link.replace("dreambank", x.srch[:-1] + "s") , axis=1)

In [None]:
df

Then we download whichever pages are available.

In [None]:
pref = "html/dreambank/"

# Try every repaired link and record the status reported by download_page.
results = []
for i, r in df.iterrows():
    res = download_page(r['link'], pref + r['srch'] + r['name']  + ".html")
    results.append(res)
    if res != 'skip':
        # Pause 0.2 s after a real request to avoid overwhelming the server;
        # pages already on disk need no pause.
        time.sleep(.2)

df['res'] = results

In [None]:
df[[ 'res', 'link', 'name']]

In [None]:
# Order the rows by download status, then by link, and save the status table.
df.sort_values(by=['res','link'], ascending=True, inplace=True)
df[[ 'res', 'link', 'name']].to_csv("dreamdic.csv", index=False)

### Fixing html

Being the ancient dinosaur poop that it is, the website is a horrendous case of HTML tag soup.  
Even our highly customized extractor fails to parse the pages properly.  
We'll summon modern web browser technology, in the form of the html5lib library, to repair these pages.

In [None]:


def fix_html_file(input_filepath, output_filepath):
    """Repair a broken HTML file by re-serialising it through html5lib.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html5lib``
    parser (browser-like error recovery) and written back, prettified, to
    ``output_filepath``. The output directory is created when missing.
    Problems are printed rather than raised.

    Args:
        input_filepath: Path of the HTML file to repair.
        output_filepath: Path the repaired HTML is written to.

    Returns:
        True when the repaired file was written, False otherwise.
    """
    try:
        with open(input_filepath, 'r', encoding='utf-8') as f:
            broken_html_content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_filepath}'")
        return False
    except Exception as e:
        print(f"Error reading file '{input_filepath}': {e}")
        return False

    # Parsing with html5lib is where the "fixing" happens.
    soup = BeautifulSoup(broken_html_content, 'html5lib')

    # .prettify() adds indentation; str(soup) would give a less formatted output.
    corrected_html_content = soup.prettify()

    try:
        out_dir = os.path.dirname(output_filepath)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(corrected_html_content)
    except Exception as e:
        print(f"Error writing file '{output_filepath}': {e}")
        return False

    print(f"Successfully fixed HTML and saved to '{output_filepath}'")
    return True


In [None]:
broken_dir = 'html/dreambank'
fixed_dir = 'html/dreambank/fixed'

# Without the target folder every write in fix_html_file fails.
os.makedirs(fixed_dir, exist_ok=True)

for f in os.listdir(broken_dir):
    if f.endswith(".html"):
        input_file = os.path.join(broken_dir, f)
        output_file = os.path.join(fixed_dir, f)
        fix_html_file(input_file, output_file)



### Extracting from Dream Bank
Once the pages are fixed, our scraper is again able to parse them 

In [None]:
def extract_bank(file, xpath):
    """Parse ``file``, select the first match of ``xpath`` and return ``breakdown``'s result.

    FIXME(review): this redefines the ``extract_bank`` from the earlier cell,
    and ``breakdown`` as defined in this notebook returns two values, so the
    four-name unpacking below raises ValueError. Presumably written against
    a different ``breakdown`` - confirm the intended return shape.
    """
    
    with open(file, "r") as f:
        html = f.read()
        

    tree = lhtml.fromstring(html)
    #tree.text_content()

    # Raises IndexError when the XPath matches nothing.
    tabl = tree.xpath(xpath)[0]
    
    r,sd,c,t = breakdown(tabl)

    
    return r,sd,c,t
    # syms_data = pd.DataFrame([c,t])
    # # data2 = squeeze_meaning_by_symbol(syms_data)
    # # data3 = data2.explode('meaning')
    # # data3['mean_len'] = data3.meaning.str.len()
    # return pd.DataFrame(), syms_data


In [None]:
# Run extract_bank over every repaired page; `dfs` collects the first returned
# value of each call, `sds` the second one wrapped in a DataFrame.
dfs = []
sds = []
save_dir = 'html/dreambank/fixed'

# The active XPath contains explicit tbody steps that the commented-out one lacks
# (presumably because the repaired pages contain tbody elements - confirm).
#xpath = "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[1]"
xpath = "/html/body/table[2]/tbody/tr/td/div/center/table/tbody/tr[3]/td[1]"

breakpointstring = "I keep having this recurring"

for f in os.listdir(save_dir):
    if f.endswith(".html"):
        path = os.path.join(save_dir, f)
        #print(path)
        # NOTE(review): depends on extract_bank returning four values; see the FIXME there.
        r,sd,c,t = extract_bank(path,xpath)
        # data3["filename"] = f
        dfs.append(r)
        sd = pd.DataFrame(sd)
        #sd = squeeze_meaning_by_symbol(syms_data=sd)
        sds.append(sd)
#sds = sds)
#sds

In [None]:
sds

In [None]:
save_df_as_pretty_html(sds[0], 'example.html')

In [None]:
# Dump `r` (first value returned by the last extract_bank call in the loop
# above) to a scratch file for manual inspection.
# NOTE(review): f.write() needs a str - confirm what type `r` actually has.
r
with open("sandbox.txt", "w") as f:
    f.write(r)

# original code

In [None]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    """Zip ``path`` and display a download link for the archive.

    The working directory is changed to ``/kaggle/working/`` and the archive
    is written to ``/kaggle/working/<download_file_name>.zip`` by the
    external ``zip`` program (recursively).

    Args:
        path: File or directory to archive.
        download_file_name: Archive name without the ``.zip`` extension.

    Returns:
        None. Failures to run ``zip`` are printed, not raised.
    """
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    # Argument list instead of a shell string: names with spaces or shell
    # metacharacters reach zip verbatim and cannot inject commands.
    command = ["zip", "-r", zip_name, path]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        # Without a shell, a missing zip binary surfaces as an exception.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
download_file("/kaggle/working/dreams_interpretations.csv", "download")