In [28]:
import os
import lxml.html as lx

import DefaultFeatures
from FeaturesExtractor import *

def print_csv_line(features, other=[]):
    f = []
    for k, v in sorted(other, key=lambda pair: pair[0]):
        f.append(v)
        
    for name in sorted(features.keys()):
        f.append(str(features[name]))
        
    print("\t".join(f))

def print_csv_header_from_features(features, other=[]):
    header = []
    for k in sorted(other):
        header.append(k)
    
    header = header + sorted(features.keys())
    print("\t".join(header))

In [36]:
dataset_path = os.sep.join(["..", "camera"])
#dataset_path = "/run/media/nan/agiw/camera/"


# table_xpath_ground_truth --> dict[domain] -> [xpath]
ground_xpath_file = "data/table_xpath_ground_truth.txt"
domain2xpath = {}
with open(ground_xpath_file, "r") as gf:
    for line in gf.readlines():
        domain = line.split("\t")[0]
        xpath = line.split("\t")[1]
        
        if domain not in domain2xpath:
            domain2xpath[domain] = []
            
        domain2xpath[domain].append(xpath.strip())

        
# feature extraction
ft = FeaturesExtractor()

print_csv_header_from_features(DefaultFeatures.table, other=['file', 'relevant'])

#for domain in domain2xpath:
for domain in ["www.submarino.com.br"]:   
    for page in os.listdir(os.sep.join([dataset_path, domain])):
        if not page.endswith("html"):
            continue
        
        file_name = os.sep.join([domain, page])
        full_file_path = os.sep.join([dataset_path, file_name])

        dom = lx.parse(full_file_path)
        
        # apply any known xpath and save features of any item
        relevants_items = set()
        for xpath in domain2xpath[domain]:
            items = dom.xpath(xpath)
            
            for item in items:
                relevants_items.add(item)
                features = ft.extract(item, DefaultFeatures.table)
                print_csv_line(features, other=[('file', file_name),
                                                ('relevant', "1")])
        relevants_items = set()
        # save features of any other table
        for item in dom.xpath("//table"):
            if item not in relevants_items:
                features = ft.extract(item, DefaultFeatures.table)
                print_csv_line(features, other=[('file', file_name),
                                                 ('relevant', "0")])


file	relevant	depth	number_links	number_relevants	number_td	number_th	number_tr
www.submarino.com.br/95.html	1	3	0	53	77	0	39
www.submarino.com.br/95.html	0	3	0	53	77	0	39
www.submarino.com.br/375.html	1	4	1	45	79	0	40
www.submarino.com.br/375.html	0	4	1	45	79	0	40
www.submarino.com.br/374.html	1	4	1	48	73	0	37
www.submarino.com.br/374.html	0	4	1	48	73	0	37
www.submarino.com.br/250.html	1	4	1	57	75	0	38
www.submarino.com.br/250.html	0	4	1	57	75	0	38
www.submarino.com.br/315.html	1	4	1	46	67	0	34
www.submarino.com.br/315.html	0	4	1	46	67	0	34
www.submarino.com.br/105.html	1	4	1	42	67	0	34
www.submarino.com.br/105.html	0	4	1	42	67	0	34
www.submarino.com.br/115.html	1	4	1	50	71	0	36
www.submarino.com.br/115.html	0	4	1	50	71	0	36
www.submarino.com.br/133.html	1	4	1	48	69	0	35
www.submarino.com.br/133.html	0	4	1	48	69	0	35
www.submarino.com.br/32.html	1	4	1	34	71	0	36
www.submarino.com.br/32.html	0	4	1	34	71	0	36
www.submarino.com.br/237.html	1	4	1	41	69	0	35
www.submarino.com.br/237.html	0

www.submarino.com.br/233.html	0	3	0	40	69	0	35


## note
- ho notato che nel footer di amazon (una grande tabella), ci sono molti termini di dominio (81):
  (11, 0, 83, 0, 40, 81)
  il che suggerisce che il vocabolario non è molto verticale. Sono presenti molti link (83) che dovrebbero
  suggerire al classificatore di scartare questa tabella. Ad ogni modo va perfezionato il vocabolario