In [None]:
# Additional packages needed only for scraping

! pip install lxml BeautifulSoup4


In [218]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import string
import lxml
import time
import os
import lxml.html as lhtml
from datetime import datetime
import pandas as pd
import re
from utils import save_df_as_pretty_html
from plotly import express as px
# Base URL pattern for dream moods dictionary


# List to store dream symbols and meanings
dreams = []
meanings = []

## Data Scraping from dreammoods.com

The website structure is fairly simple. Dream symbols are grouped into pages of their first letter:  
http://www.dreammoods.com/dreamdictionary/c_all.htm

Thus we can download the whole raw data simply with few http requests :

In [219]:

def download_page(url, filename ):
    if os.path.exists(filename):
        print (f"File {filename} already exists, skipping download.")
        return 'skip'
    
    response = requests.get(url)

    if response.status_code == 200:  # Check if the page was successfully fetched
        with open(filename, "w") as f:
            f.write(response.text)
        print(f"Downloaded {url} to {filename}")
        return 'ok'
    
    else:        
        newurl = url.replace("_all","")
        if newurl != url:
            print(f"Failed to retrieve {url}, trying alternative")
            download_page(newurl, filename)
        else:
            print(f"Failed to retrieve {url} and no alternative available.")
            return 'fail'
        



In [220]:
dirs_to_create = [ \
"html/dreamdictionary/",
"datasets",
"output"] 

for path in dirs_to_create:
    os.makedirs(path, exist_ok=True)

In [222]:

base_url = "http://www.dreammoods.com/dreamdictionary/"
suffix = "{}_all.htm"

for letter in string.ascii_lowercase:

    file = f"html/dreamdictionary/{letter}_all.html"

    url = base_url + suffix.format(letter)

    download_page(url, file)
    time.sleep(.5)  # Sleep for 1 second to avoid overwhelming the server


File html/dreamdictionary/a_all.html already exists, skipping download.
File html/dreamdictionary/b_all.html already exists, skipping download.
File html/dreamdictionary/c_all.html already exists, skipping download.


KeyboardInterrupt: 

### Custom scraper
Their HTML structure, however, is anything but simple. So we had to build a highly customized scraper.  
By manually inspecting the (highly inconsistent) html structure, we surmized that the required data is stored in a series of `<p>` paragraph tags under a single `<td>` tag, which can be accessed with a constant xpath.  
We came up with a recursive approach to scan the html tree and infer the type of content in a paragraph:  

 - A dream symbol (represented by a bold header)  
or  
 - A sentence containing description and interpretation of one of the contexts  
or  
 - A reference to a synonym of the symbol  
or  
 - Trash content, that can be discarded

In [245]:
# use this string to debug code near specific text in html
breakpointstring = "your anchor text"
debugging = False

def brpt_anchor(val):
    if not debugging:
        return
    #if re.search(breakpointstring, tc): 
    if breakpointstring.lower() in str(val).lower():
        print("found anchor text: ", breakpointstring, ", value: ", val)

In [224]:

def break_attr(dic):
    drop = ['style','align','face']
    for dr in drop:
        if dr in dic:
            dic.pop(dr)
    return str(dic)

def classify(t):
    
    brpt_anchor(t.text)
        
    
    if type(t) is lxml.html.HtmlComment:
        return "trsh"

    if t.tag == "b":
        return "mean" #"bold"
    
    trashtags = ["div","script","iframe","img"]

    if t.tag == "strong":
        return "sym" #"bold"    

    elif t.tag == "font":

        if 'size' in t.attrib:
            sz = t.attrib['size']
            if sz == "4" or sz == "+1":
                return "sym" #"size4"
            elif sz == "3":
                return "mean" #"size3"
        else:
            return "mean" # no size defined
            
    elif t.tag == "a":
        if 'href' in t.attrib:
            if t.attrib['href'] == "#Top":
                return "trsh"
            return "link"
        elif 'name' in t.attrib:
            return "sym" #"header"
    
    elif t.tag in trashtags:
        return "trsh"
    
    return "mean" # "desc"

def fetch_link(chld):

    if chld.tag == "a":
        if 'href' in chld.attrib:
            return ',' + chld.attrib['href'] + ','
            
    else:
        for c in chld.getchildren():
            href = fetch_link(c)
            if href != "":
                return href
        return ""


def collect(node, cl):

    brpt_anchor(node.text)

    if "trsh" in cl:
        return ""    
    
    if "link" in cl:
        res = fetch_link(node)
        return res
    
    if node.text == None:
        return ""

    else:
        #res = node.text_content()
        res = str(node.text  or '')+  str(node.tail or'')
        res = res.replace("\xa0"," ")
        res = re.sub(r"\n+","\n",res)
        res = re.sub(r'\s+', ' ', res)
        # if cl == "mean":
        #     res = res + " "
        return res


def breakdown(node):
    
    brpt_anchor(node.text_content())

    clas = [classify(node)]
    cont = [collect(node, clas[0])]

    if "trsh" in clas[-1]:
        return ["trsh"], ""
    
    for c in node.getchildren():
        
        cl, co = breakdown(c)

        clas += cl
        cont += co

    return clas, cont


def extract_reference(cont):
    syms = []
    for c in cont:
        c= c.lower()
        if "dreammoods.com/dreamdictionary" in c and "#" in c:
            syms.append("#Ref:" + c.split("#")[-1].strip(","))
        elif "lease" in c or "see" in c:
            if "also" in c:
                return None
            #syms.append(c)
    return syms[0] if len(syms)>0 else None



def collect_paragraphs(r):
    syms_data = []
    for c in r.getchildren():
        clas, cont = breakdown(c)
        fclas = "sym" if "sym" in clas else "link" if "link" in clas else "mean"
        
        if fclas == "link":
            text = extract_reference(cont)
        else:
            text = "".join(cont).strip()
            text = re.sub(r'\s+', ' ', text)
        syms_data.append({"clas": clas, "final_class":fclas, "cont": cont, "text": text})

    return syms_data



Test with a single file, e.g. c_all.html

In [225]:
breakpointstring = "door kno"
letter = "b"

with open(f"html/dreamdictionary/{letter}_all.html", "r") as f:
    html = f.read()

html = html.replace("\xa0"," ")

xp = "/html/body/table[2]/tr/td/div/center/table/tr[4]/td[1]"

tree = lhtml.fromstring(html)

tabl = tree.xpath(xp)[0]

sd = collect_paragraphs(tabl)


In [226]:
sd = pd.DataFrame(sd)
sd

Unnamed: 0,clas,final_class,cont,text
0,"[mean, sym, trsh]",sym,"[, ]",
1,"[mean, mean, sym]",sym,"[, , B]",B
2,"[mean, mean, mean, mean, mean, trsh]",mean,"[, To see the letter B in your dream is a pun ...",To see the letter B in your dream is a pun on ...
3,"[mean, mean, sym]",sym,"[, , Baboon]",Baboon
4,"[mean, mean]",mean,"[, To see a baboon in your dream suggests that...",To see a baboon in your dream suggests that yo...
...,...,...,...,...
1314,"[mean, mean, sym]",sym,"[, , Buzzard]",Buzzard
1315,"[mean, mean, mean, mean, mean, mean, trsh]",mean,"[, To see a buzzard in your dream symbolizes d...",To see a buzzard in your dream symbolizes deat...
1316,[mean],mean,[ ],
1317,[mean],mean,[ ],


At this point we've collected the contents of all the `<p>` tags under the aforementioned xpath in the order of their appearance and classified them to either a symbol, a meaning or a link.  
Next, we squash these contents into a dataset of pairs `symbol`, `multiple meanings` 

In [227]:
def squeeze_meaning_by_symbol(syms_data):
    # convert scraped data to symbol: meaning data 
    data2 =[]
    sym = ""
    meanings = []

    for i, r in syms_data.iterrows():
        if r['final_class'] == "sym":
            
            data2.append({"symbol": sym, "meaning": meanings})
            meanings = []
            sym = r['text']
        else:
            if r['text'] is not None:
                meanings.append(r['text'])
        

    data2 = pd.DataFrame(data2)
    data2['n_meanings'] = data2.meaning.apply(len)
    return data2



Then explode the dataset into pairs of `symbol` and `meaning`  
Some meanings are just **#Ref**erences to synonyms

In [228]:
breakpointstring = "animals being abused"

data2 = squeeze_meaning_by_symbol(sd)
data3 = data2.explode('meaning')
data3['mean_len'] = data3.meaning.str.len()
data3.dropna(inplace=True)
data3

Unnamed: 0,symbol,meaning,n_meanings,mean_len
2,B,To see the letter B in your dream is a pun on ...,1,103.0
3,Baboon,To see a baboon in your dream suggests that yo...,2,244.0
3,Baboon,"In particular, dreaming of a white baboon mean...",2,91.0
4,Baby,To see a baby in your dream signifies innocenc...,12,987.0
4,Baby,If you dream that you are on your way to the h...,12,391.0
...,...,...,...,...
498,Buttons,To dream that you are unbuttoning your clothes...,3,143.0
498,Buttons,To lose a button in your dream signifies your ...,3,124.0
499,Buying,To dream that you or someone is buying somethi...,2,292.0
499,Buying,To buy a car in your dream refers to your comm...,2,99.0


In [231]:
tc = int(data3['mean_len'].sum())
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
print(f"total chars extracted from c_all.html: {tc} ")

total chars extracted from c_all.html: 153666 


In [232]:
data3.to_csv(f"datasets/{letter}_scraped_{tstp}_{tc}.csv", index=False)

Once the correctness of the logic is validated on one letter Html, the above logic can be summarized in a function: 

In [233]:
def extract_letter(file):
    with open(file, "r") as f:
        html = f.read()
        
    xp = "/html/body/table[2]/tr/td/div/center/table/tr[4]/td[1]"

    tree = lhtml.fromstring(html)

    #tree.text_content()

    tabl = tree.xpath(xp)[0]

    syms_data = collect_paragraphs(tabl)
    syms_data = pd.DataFrame(syms_data)
    data2 = squeeze_meaning_by_symbol(syms_data)
    data3 = data2.explode('meaning')
    data3['mean_len'] = data3.meaning.str.len()
    data3.dropna(inplace=True)
    return data3



And executed on all the letters HTMLs to extract the data 

In [246]:


dfs = []
save_dir = 'html/dreamdictionary'
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
for f in os.listdir(save_dir):
    if f.endswith(".html"):
        try:
            existing_df = extract_letter(os.path.join(save_dir, f))
            existing_df["filename"] = f
            dfs.append(existing_df)
        except Exception:
            continue

dataset = pd.concat(dfs)
dataset = dataset[dataset['mean_len'] > 0]
dataset

Unnamed: 0,symbol,meaning,n_meanings,mean_len,filename
2,A,"To see the letter ""A"" in your dream represents...",2,247.0,a_all.html
3,Aardvark,To see an aardvark in your dream indicates tha...,1,110.0,a_all.html
4,Abacus,To see or use an abacus in your dream refers t...,1,123.0,a_all.html
5,Abalone,To see or eat abalone in your dream indicates ...,1,158.0,a_all.html
6,Abandonment,To dream that you are abandoned suggests that ...,2,778.0,a_all.html
...,...,...,...,...,...
365,Mutilate,To dream that you are being mutilated indicate...,2,431.0,m_all.html
365,Mutilate,To dream that someone or something is mutilate...,2,99.0,m_all.html
366,Muzzle,To see or use a muzzle in your dream suggests ...,1,203.0,m_all.html
367,Myrrh,To see myrrh in your dream signifies punishmen...,1,113.0,m_all.html


In [None]:
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
fname = f"datasets/rescraped_{tstp}"

save_df_as_pretty_html(dataset, fname + ".html")
dataset.to_csv(fname + ".csv", index=False)

In [235]:
px.histogram(dataset.mean_len)

### Data augmentation
The dream interpretations generally have a rather consistent structure:

` To dream that <something> <means/indicates/is a ...> <interpretation> ` 

 This can be leveraged to extract a more detailed dream symbol from the data, without running it through LLM.

In [236]:
#dataset = pd.read_csv("datasets/rescraped_25.05.13-15.csv")

In [None]:
dataset

In [237]:
dataset.columns = ['symbol', 'interp', 'n_meanings', 'mean_len', 'filename']

We employ a relatively simple regex to separate the symbol in context from the interpretation

In [238]:
# regexes to split context from meaning

expr = "(\s(?:suggest|represent|symbolize|indicate|mean|denote|forewarn|highlight|reflect|foretell|parallel|impl|signif)(?:y|ies|s|ze)?\s)"

expr2 = """
(\s(?:(?:analogous|refer|point|related|relate)s? to)

|(?:is (?:a|a sign|symbolic|indicative|representative) (?:of|that))

|(?:is a (?:sign|reminder|reflection|pun|way|warning))

|then (?:you|it|the)

|may (?:just )?be
|Alternatively,
\s)""".strip("\n").replace("\n\n","").replace("\n","")

print(expr, "  ")
print(expr2)


(\s(?:suggest|represent|symbolize|indicate|mean|denote|forewarn|highlight|reflect|foretell|parallel|impl|signif)(?:y|ies|s|ze)?\s)   


In [240]:

def custom_split(x):
    incoming = x["interp"]
    
    if "#Ref" in incoming:
        return incoming, incoming
    res1 = re.split(expr, incoming, maxsplit=1)
    
    res2 = re.split(expr2, incoming, maxsplit=1)

    # of the two, take the one where splitting keyword appears earliest in the sentence
    res = res1 if len(res1[0]) < len(res2[0]) else res2

    if(len(res) == 1 or len(res[0]) > 250):
        return None, incoming

    return res[0], "".join(res[1:]) 


dataset[["context","meaning"]] = dataset.apply(custom_split, result_type='expand', axis=1) 
dataset["context_len"] = dataset.context.str.len()
dataset[["interp","context","meaning"]]

Unnamed: 0,interp,context,meaning
2,"To see the letter ""A"" in your dream represents...","To see the letter ""A"" in your dream",represents the beginning of a new stage. You ...
3,To see an aardvark in your dream indicates tha...,To see an aardvark in your dream,indicates that you are being very secretive a...
4,To see or use an abacus in your dream refers t...,To see or use an abacus in your dream,refers to your outdated views. You have an ol...
5,To see or eat abalone in your dream indicates ...,To see or eat abalone in your dream,indicates a transitional period in your life....
6,To dream that you are abandoned suggests that ...,To dream that you are abandoned,suggests that it is time to leave behind past...
...,...,...,...
365,To dream that you are being mutilated indicate...,To dream that you are being mutilated,indicates that there is a waking situation th...
365,To dream that someone or something is mutilate...,To dream that someone or something is mutilated,indicates that your integrity is put into que...
366,To see or use a muzzle in your dream suggests ...,To see or use a muzzle in your dream,suggests that you need to show better restrai...
367,To see myrrh in your dream signifies punishmen...,To see myrrh in your dream,signifies punishment or suffering. You are un...


#### Iterative development
Use this ever shrinking dataset to discover what interps were not picked up by regex.
Modify regexes above to catch them and re-run the code.

In [241]:
smol = dataset[["symbol", "context","meaning"]][dataset.context.isna()] 
smol.to_csv("datasets/smol.csv", index=False)
smol

Unnamed: 0,symbol,context,meaning
184,Anger,,Being angry in your dream may have been carrie...
185,Angling,,To dream that you are angling is a good omen o...
186,Animal,,A dream about a baby animal is often symbolic ...
213,Antlers,,To see antlers in your dream are representativ...
352,Awakening,,You may also be on the verge of lucid dreaming.
71,Sauce,,"The dream may also be a pun on being ""saucy"". ..."
440,Soda,,"If the soda is bitter, sour or off-tasting, th..."
500,Spell,,Dreaming of a protection spell written in red ...
626,Stones,,"For various cultures, stones have spiritual si..."
276,Places,,To dream of a certain place in your dream is t...


Once the amount of contexts that were not picked up by regex is small enough, we can replace these with just the dream symbol. 

In [None]:

dataset.loc[dataset.context.isna(),"context"] = dataset.loc[dataset.context.isna(),"symbol"]
dataset["context_len"] = dataset.context.str.len()
dataset

Unnamed: 0,symbol,interp,n_meanings,mean_len,filename,context,meaning,context_len
2,A,"To see the letter ""A"" in your dream represents...",2,247.0,a_all.html,"To see the letter ""A"" in your dream",represents the beginning of a new stage. You ...,35
3,Aardvark,To see an aardvark in your dream indicates tha...,1,110.0,a_all.html,To see an aardvark in your dream,indicates that you are being very secretive a...,32
4,Abacus,To see or use an abacus in your dream refers t...,1,123.0,a_all.html,To see or use an abacus in your dream,refers to your outdated views. You have an ol...,37
5,Abalone,To see or eat abalone in your dream indicates ...,1,158.0,a_all.html,To see or eat abalone in your dream,indicates a transitional period in your life....,35
6,Abandonment,To dream that you are abandoned suggests that ...,2,778.0,a_all.html,To dream that you are abandoned,suggests that it is time to leave behind past...,31
...,...,...,...,...,...,...,...,...
365,Mutilate,To dream that you are being mutilated indicate...,2,431.0,m_all.html,To dream that you are being mutilated,indicates that there is a waking situation th...,37
365,Mutilate,To dream that someone or something is mutilate...,2,99.0,m_all.html,To dream that someone or something is mutilated,indicates that your integrity is put into que...,47
366,Muzzle,To see or use a muzzle in your dream suggests ...,1,203.0,m_all.html,To see or use a muzzle in your dream,suggests that you need to show better restrai...,36
367,Myrrh,To see myrrh in your dream signifies punishmen...,1,113.0,m_all.html,To see myrrh in your dream,signifies punishment or suffering. You are un...,26


Save the dataset with current timestamp. Then use some diff tool to compare between the versions of the dataset. This is the best way to validate that your changes in the scraping/augmenting logic bear the desired effect on the dataset.

In [None]:
tstp = datetime.now().strftime(r"%y.%m.%d-%H")
fname = f"datasets/augmented_{tstp}"
outcols = ["symbol", "context","meaning","context_len","n_meanings"]
#subset = dataset.loc[list(range(0,len(dataset),10)),outcols]

subset = dataset[outcols]

save_df_as_pretty_html(subset, fname + ".html")
subset.to_csv(fname + ".csv", index=False)

### Analyze the results of scraping an augmenting

Check out the distribution of the lengths of context. The bump on the left are the #Ref to synonyms 

In [243]:
px.histogram(dataset.context.str.len(), title="Context Lengths")

Check out the rows with longest context to probably stumble upon more separators examples

In [244]:
subset.sort_values("context_len", ascending=False)

Unnamed: 0,symbol,context,meaning,context_len,n_meanings
4220,Coffee,If you dream that you are trying to make coffe...,then it suggests that you are trying too hard ...,172.0,5
4519,Crush,To dream that you have a crush on somebody is ...,represents your current infatuation with her ...,150.0,6
4039,Choking,Choking dreams are often a fearful experience ...,suggests that you may find some advice/remark...,145.0,3
3041,Bathroom,To dream that you are in a public restroom wit...,signifies your frustrations about getting eno...,139.0,5
6604,Tofu,To see or eat tofu in your dream presents your...,suggests that you need to start adapting a mo...,132.0,1
...,...,...,...,...,...
6567,Tingling,,To feel a tingling sensation in your dream may...,,1
6638,Tools,,The tool in your dream may also be a pun on ho...,,2
6685,Tower,,To dream that a tower is falling or crumbling ...,,4
7565,Miscarriage,,"If you are currently pregnant, then dreams of ...",,2


## Dreambank

On this godforsaken website there is a section called dreambank, which has some dreams descriptions and interpretations of them by the moderators.  
We'd like to use this as our test data, so we need to scrape them as well  



### Scrape the list of dreams
First, We need top scrape the dreambank.html page, which holds the links to all the interpreted dream. 

In [None]:
file = f"html/dreambank/dreambank.html"

url = "http://www.dreammoods.com/dreambank/"
download_page(url, file)

In [None]:
def extract_bank(file, xpath):
    with open(file, "r") as f:
        html = f.read()       

    tree = lhtml.fromstring(html)

    #tree.text_content()

    tabl = tree.xpath(xpath)[0]

    global syms_data
    syms_data = []
    r,c,t = breakdown(tabl)
    syms_data = pd.DataFrame(syms_data)
    data2 = squeeze_meaning_by_symbol(syms_data)
    data3 = data2.explode('meaning')
    data3['mean_len'] = data3.meaning.str.len()
    return data3

xps = [ "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[1]" ,
        "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[2]",
        "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[3]"
]
sd = []
for xp in xps:
    extract_bank(file, xp)
    sd.append(syms_data)

df = pd.concat(sd)


In [None]:
df

Next, we extract the actual links and their description text. 

In [None]:
df = df[ df.clas == "link"]
df[['_', 'link', 'name']] = df['cont'].str.split(',', n=2, expand=True)
df

The links contain a search term. 
We'll use this search term as filename to store the downloaded page

In [None]:
df["srch"] = df.link.str.split('search=', n=2, expand=True)[1].str.split('&', n=2, expand=True)[0]

Some links do not conform to this structure, so we'll use their page name verbatim

In [None]:
wat = df[df.srch.isna()].link.str.split("/", n=5, expand=True)
df.loc[df.srch.isna(),'srch'] = wat.loc[:,4]
df

Majority of those links do not work. By analyzing the url structure, we came up with an educated guess as to how they might be fixed. Some of it succeeded.

In [None]:
df.link =  df.apply(lambda x: x.link.replace("dreambank", x.srch[:-1] + "s") , axis=1)

In [None]:
df

Then downloaded any pages that were available. 

In [None]:
pref = "html/dreambank/"

results = []
for i, r in df.iterrows():
    res = download_page(r['link'], pref + r['srch'] + r['name']  + ".html")
    results.append(res)
    time.sleep(.2)  # Sleep for 1 second to avoid overwhelming the server

df['res'] = results

In [None]:
df[[ 'res', 'link', 'name']]

In [None]:
df.sort_values(by=['res','link'], ascending=True, inplace=True)
df[[ 'res', 'link', 'name']].to_csv("dreamdic.csv", index=False)

### Fixing html

Being an ancient dinosaur poop that it is, the website is a horrendous case of html tag soup.  
Even our highly customized extractor fails to properly parse the pages.  
We'll summon modern web browser technology in the form of html5lib library to repair these pages.

In [None]:


def fix_html_file(input_filepath, output_filepath):
    """
    Reads a broken HTML file, parses it using html5lib (like a browser),
    and writes the corrected HTML to a new file.
    """
    try:
        with open(input_filepath, 'r', encoding='utf-8') as f:
            broken_html_content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_filepath}'")
        return
    except Exception as e:
        print(f"Error reading file '{input_filepath}': {e}")
        return

    # Parse the HTML using html5lib (browser-like parsing)
    # This is where the "fixing" happens internally
    soup = BeautifulSoup(broken_html_content, 'html5lib')

    # Get the corrected HTML. .prettify() adds nice indentation.
    # You can also use str(soup) for a less formatted output.
    corrected_html_content = soup.prettify()

    try:
        with open(output_filepath, 'w', encoding='utf-8') as f:
            f.write(corrected_html_content)
        print(f"Successfully fixed HTML and saved to '{output_filepath}'")
    except Exception as e:
        print(f"Error writing file '{output_filepath}': {e}")


In [None]:
broken_dir = 'html/dreambank'
fixed_dir = 'html/dreambank/fixed'

for f in os.listdir(broken_dir):
    if f.endswith(".html"):
        input_file = os.path.join(broken_dir, f)
        output_file = os.path.join(fixed_dir, f)
        fix_html_file(input_file, output_file)



### Extracting from Dream Bank
Once the pages are fixed, our scraper is again able to parse them 

In [None]:
def extract_bank(file, xpath):
    
    with open(file, "r") as f:
        html = f.read()
        

    tree = lhtml.fromstring(html)
    #tree.text_content()

    tabl = tree.xpath(xpath)[0]
    
    r,sd,c,t = breakdown(tabl)

    
    return r,sd,c,t
    # syms_data = pd.DataFrame([c,t])
    # # data2 = squeeze_meaning_by_symbol(syms_data)
    # # data3 = data2.explode('meaning')
    # # data3['mean_len'] = data3.meaning.str.len()
    # return pd.DataFrame(), syms_data


In [None]:
dfs = []
sds = []
save_dir = 'html/dreambank/fixed'

#xpath = "/html/body/table[2]/tr/td/div/center/table/tr[3]/td[1]"
xpath = "/html/body/table[2]/tbody/tr/td/div/center/table/tbody/tr[3]/td[1]"

breakpointstring = "I keep having this recurring"

for f in os.listdir(save_dir):
    if f.endswith(".html"):
        path = os.path.join(save_dir, f)
        #print(path)
        r,sd,c,t = extract_bank(path,xpath)
        # data3["filename"] = f
        dfs.append(r)
        sd = pd.DataFrame(sd)
        #sd = squeeze_meaning_by_symbol(syms_data=sd)
        sds.append(sd)
#sds = sds)
#sds

In [None]:
sds

In [None]:
save_df_as_pretty_html(sds[0], 'example.html')

In [None]:
r
with open("sandbox.txt", "w") as f:
    f.write(r)