In [76]:
from make_data import get_all_data
import pandas as pd

In [77]:
def add_dicts(dict1, dict2):
    """Return a new dict holding the key-wise sum of ``dict1`` and ``dict2``.

    Neither argument is reassigned. Keys that appear in only one of the
    dicts are carried over with their value; keys that appear in both get
    ``dict2[key]`` combined with ``dict1[key]`` via ``+=``. The result lists
    the keys of ``dict2`` first, followed by the keys only ``dict1`` has.
    """
    merged = dict(dict2)
    for name, amount in dict1.items():
        if name not in merged:
            merged[name] = amount
            continue
        merged[name] += amount

    return merged

def add_to_dict(dict, key, value=1):
    """Add ``value`` to the entry stored under ``key``, in place.

    A missing ``key`` is created with ``value`` as its initial content.
    The mapping is modified directly and nothing is returned.
    """
    if key not in dict:
        dict[key] = value
        return
    dict[key] += value

def unpack_nested_dict(d, sep="_"):
    """Flatten one level of nesting in ``d`` and return the result as a new dict.

    Values that are themselves dicts are replaced by their items, each stored
    under ``outer_key + sep + inner_key``. All other values are copied over
    unchanged. Only the first nesting level is expanded; dicts found deeper
    down are kept as values.
    """
    flat = {}
    for outer_key, entry in d.items():
        if not isinstance(entry, dict):
            flat[outer_key] = entry
            continue
        flat.update(
            {outer_key + sep + inner_key: leaf for inner_key, leaf in entry.items()}
        )

    return flat

def propergate_embeddings(soup, depth=0, parents=None, y_pos=0):
    """Attach a flat feature dict to ``soup`` and to every named node below it.

    The tree is walked recursively. For each visited node the features are
    collected in a nested dict, flattened with ``unpack_nested_dict`` and
    stored on the node as ``soup.embeddings``.

    Args:
        soup: Node to process. It must provide ``name``, ``children``,
            ``text`` and ``find_all`` (presumably a BeautifulSoup document
            or tag -- confirm against ``get_all_data``).
        depth: Nesting level of ``soup``; children are processed with
            ``depth + 1``.
        parents: Mapping of ancestor name to the number of ancestors with
            that name. ``None`` (the default) means "no ancestors".
        y_pos: Position counter stored as the ``y_pos`` feature of ``soup``.
            The i-th named child of a node is processed with the node's own
            ``y_pos`` plus i; increments made inside a child's subtree are
            not carried back to its siblings.

    Returns:
        A dict mapping node name to the number of nodes with that name found
        anywhere below ``soup``.
    """
    # Use a fresh dict per call instead of a mutable default argument that
    # would be shared between all top-level calls.
    if parents is None:
        parents = {}

    all_children_counts = {}
    direct_children_counts = {}

    # NOTE: the insertion order of the keys below is kept stable on purpose,
    # it determines the column order of a DataFrame built from the result.
    embeddings = {}

    #y_pos describes the y-position of an element on the page
    embeddings["y_pos"] = y_pos

    # Every child has the same ancestors, so the counts are built only once.
    child_parents = add_dicts(parents, {soup.name: 1})

    for child in soup.children or ():
        # Children without a name (presumably plain text nodes) carry no
        # features of their own and are skipped.
        if not child.name:
            continue

        add_to_dict(direct_children_counts, child.name)
        add_to_dict(all_children_counts, child.name)

        # The recursive call returns the counts of everything below the child.
        descendant_counts = propergate_embeddings(
            child, depth + 1, parents=child_parents, y_pos=y_pos
        )
        all_children_counts = add_dicts(all_children_counts, descendant_counts)
        y_pos += 1

    #The number of words that are directly in the element.
    #All direct text nodes are counted; looking only at the first one misses
    #text that follows a child tag and yields 0 when the first node is just
    #whitespace. (`text` is the legacy spelling of the `string` filter.)
    direct_strings = soup.find_all(text=True, recursive=False)
    embeddings["direct_word_count"] = sum(len(piece.split()) for piece in direct_strings)

    #The number of words from all elements
    embeddings["all_word_count"] = len(soup.text.split())

    #A dict with the numbers of elements by "name"
    embeddings["child_counts"] = all_children_counts

    #A dict with the numbers of elements by "name" of direct children
    embeddings["direct_child_counts"] = direct_children_counts

    #A dict with the key and value of the current element
    embeddings["class"] = {soup.name: 1}

    #The depth of an element
    embeddings["depth"] = depth

    #A dict of parents "name" with counts of the element
    embeddings["parents"] = parents

    # Nested dicts become flat "<feature>_<name>" entries.
    soup.embeddings = unpack_nested_dict(embeddings)

    #For recursive execution, the child_counts are returned
    return all_children_counts

In [78]:
#Load the labelled soups. The following cells call propergate_embeddings on
#each of them and read a `label` attribute from every element found inside.
soups = get_all_data()

In [79]:
#Every element of every soup, in document order, across all soups
labeled_and_classified = []

for soup in soups:
    #Compute the features first so that each element carries `.embeddings`
    propergate_embeddings(soup)
    labeled_and_classified.extend(soup.find_all())

#One row per element; a feature that an element does not have becomes 0
df = pd.DataFrame([tag.embeddings for tag in labeled_and_classified]).fillna(0)

#The label column is added after fillna so it is not affected by it
df["label"] = [tag.label for tag in labeled_and_classified]

In [80]:
#Balance the classes: the rows are divided into "article" and everything else,
#and the same number of rows is drawn at random from both groups
pos = df[df["label"] == "article"]
neg = df[df["label"] != "article"]

#The smaller group decides how many rows each side contributes
n = min(len(pos), len(neg))

#pos is sampled before neg, so the order of the random draws stays the same
data = pd.concat([pos.sample(n=n), neg.sample(n=n)])

#Collapse all non-article labels into one negative class
data["label"] = data["label"].map(
    lambda lbl: "not_article" if lbl != "article" else "article"
)