In [3]:
import os
import lxml.html as lx

import DefaultFeatures
import DomUtils 
from FeaturesExtractor import *

def csv_line(features, other=()):
    """Build one tab-separated data row.

    The row starts with the values of the ``other`` pairs, ordered by pair
    name, followed by the feature values ordered by feature name. This is the
    same column order that ``csv_header_from_features`` produces for the
    matching names.

    Args:
        features: mapping of feature name -> feature value.
        other: iterable of ``(name, value)`` pairs for the extra leading
            columns (e.g. file name and label).

    Returns:
        The row as a single string, without a trailing newline.
    """
    # Every cell is converted with str() so that non-string extra values
    # (e.g. an integer label) do not make the join fail.
    cells = [str(value) for _, value in sorted(other, key=lambda pair: pair[0])]
    cells.extend(str(features[name]) for name in sorted(features.keys()))
    return "\t".join(cells)

def csv_header_from_features(features, other=[]):
    """Return the tab-separated header row.

    Column names are the sorted ``other`` names followed by the sorted
    feature names taken from the keys of ``features``.
    """
    column_names = sorted(other) + sorted(features.keys())
    return "\t".join(column_names)
    
def read_xpath_ground_truth(path):
    """Read the ground-truth file mapping each domain to its xpaths.

    Each non-blank line is split on tabs: the first field is the domain and
    the second field is an xpath. Any further fields are ignored.

    Args:
        path: path of the tab-separated ground-truth file.

    Returns:
        dict mapping domain -> list of xpath strings (whitespace-stripped),
        in the order they appear in the file.

    Raises:
        IndexError: if a non-blank line contains no tab separator.
    """
    domain2xpath = {}
    with open(path, "r") as gf:
        # Iterate the file lazily instead of loading every line up front.
        for line in gf:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no data; skip them instead of failing on the split.
            if not line.strip():
                continue

            fields = line.split("\t")
            domain = fields[0]
            xpath = fields[1]

            domain2xpath.setdefault(domain, []).append(xpath.strip())
    return domain2xpath

In [5]:
# Root directory of the dataset; it is expected to contain one sub-directory
# per domain listed in the ground-truth file. The location can be overridden
# with the CAMERA_DATASET_PATH environment variable so the notebook does not
# depend on one machine's mount point.
#dataset_path = os.sep.join(["..", "camera"])
dataset_path = os.environ.get("CAMERA_DATASET_PATH", "/run/media/nan/agiw/camera/")

# domain -> list of xpaths that select the relevant nodes
domain2xpath = read_xpath_ground_truth("data/table_xpath_ground_truth.txt")

# Extract default features
features = DefaultFeatures.table
list(features.keys())

['number_tr',
 'number_th',
 'number_td',
 'number_links',
 'depth',
 'number_relevants',
 'relevants_ratio',
 'number_bold',
 'number_p',
 'number_br',
 'number_img',
 'number_li',
 'number_div']

In [6]:
# ...and/or extend with new features

In [7]:
# Build the labelled feature file: one row per node, tab-separated.
# Nodes selected by a ground-truth xpath are labelled "1"; every other
# <table> node of the same page is labelled "0".
with open("table_features.txt", "w") as tf:
    ft = FeaturesExtractor()

    # write header
    header = csv_header_from_features(features, other=['file', 'relevant'])
    tf.write("{}\n".format(header))

    # analyze each domain
    #for domain in ["www.submarino.com.br"]:
    for domain in domain2xpath:

        relevant_count = 0
        not_relevant_count = 0

        domain_dir = os.path.join(dataset_path, domain)
        # sorted() makes the row order independent of the file system's
        # listing order, so re-runs produce the same file.
        for page in sorted(os.listdir(domain_dir)):
            if not page.endswith("html"):
                continue

            # file_name is what gets written in the 'file' column
            file_name = os.sep.join([domain, page])
            dom = lx.parse(os.path.join(domain_dir, page))

            # Collect (node, label) pairs: relevant nodes first, then the rest.
            relevants_items = set()
            labelled_nodes = []
            for xpath in domain2xpath[domain]:
                for node in dom.xpath(xpath):
                    # A node matched by more than one xpath must be emitted
                    # (and counted) only once.
                    if node in relevants_items:
                        continue
                    relevants_items.add(node)
                    labelled_nodes.append((node, "1"))

            # any other table of the page is a not-relevant example
            for node in dom.xpath("//table"):
                if node not in relevants_items:
                    labelled_nodes.append((node, "0"))

            for node, label in labelled_nodes:
                extracted = ft.extract(node, selected=list(features.keys()),
                                       features_descriptor=features)

                line = csv_line(extracted, other=[('file', file_name),
                                                  ('relevant', label)])
                tf.write("{}\n".format(line))

            relevant_count += len(relevants_items)
            not_relevant_count += len(labelled_nodes) - len(relevants_items)
        print("{}, relevant: {}, not_relevant: {}".format(domain, relevant_count, not_relevant_count))

amazon.com, relevant: 2658, not_relevant: 9345
walmart.com, relevant: 199, not_relevant: 0
www.amazon.fr, relevant: 26, not_relevant: 168
www.amazon.in, relevant: 396, not_relevant: 1795
www.amazon.co.uk, relevant: 28, not_relevant: 241
www.ebay.ie, relevant: 273, not_relevant: 10011
www.submarino.com.br, relevant: 90, not_relevant: 0
www.darty.com, relevant: 14, not_relevant: 0
www.govgroup.com, relevant: 22, not_relevant: 84
www.highpointscientific.com, relevant: 5, not_relevant: 0
www.techdna.co.uk, relevant: 20, not_relevant: 0
www.tigerdirect.ca, relevant: 14, not_relevant: 125
www.futureshop.ca, relevant: 3, not_relevant: 0
www.gmcamera.com, relevant: 28, not_relevant: 56
www.happii.dk, relevant: 34, not_relevant: 510
www.harrisoncameras.co.uk, relevant: 30, not_relevant: 0
www.rakuten.com, relevant: 7, not_relevant: 9


## note
- ho notato che nel footer di amazon (una grande tabella), ci sono molti termini di dominio (81):
  (11, 0, 83, 0, 40, 81)
  il che suggerisce che il vocabolario non è molto verticale. Sono presenti molti link (83) che dovrebbero
  suggerire al classificatore di scartare questa tabella. Ad ogni modo va perfezionato il vocabolario

In [8]:
import pandas

In [9]:
# Load the tab-separated table into a DataFrame.
# NOTE(review): this reads "table.txt", whereas the extraction cell above
# writes "table_features.txt" — confirm which file is intended.
d = pandas.read_csv("table.txt", sep="\t")

In [10]:
# Preview the first rows of the loaded table.
d.head()

Unnamed: 0,relevant,depth,number_bold,number_br,number_div,number_img,number_li,number_links,number_p,number_relevants,number_td,number_th,number_tr,relevants_ratio
0,0,10,0,0,0,0,6,3,0,10,5,0,2,0.37
1,0,10,0,0,0,0,6,3,0,10,5,0,2,0.38
2,0,10,0,0,0,0,6,3,0,11,5,0,2,0.4
3,0,10,1,0,5,3,0,0,0,11,4,0,2,0.35
4,0,10,1,0,5,3,0,0,0,11,4,0,2,0.36


In [46]:
# Every column of the table except "relevants_ratio".
all_keys = d.columns.tolist()
all_keys.remove("relevants_ratio")

In [64]:
import random

# Fixed seed so that re-running the cell picks the same ratio for each group.
RATIO_SAMPLE_SEED = 42
rng = random.Random(RATIO_SAMPLE_SEED)

# Group the rows that agree on every column except relevants_ratio, then
# write one row per group, keeping one of the group's ratios chosen at random.
k = d.groupby(all_keys)
with open("table_with_ratio.txt", "w") as tf:
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for a, b in k["relevants_ratio"].apply(list).items():
        # a: tuple of group-key values, b: list of the ratios in that group
        r = rng.choice(b)
        values = [str(v) for v in list(a) + [r]]

        tf.write("\t".join(values) + "\n")

In [39]:

# Debugging aid: prints the sub-frame of every group, which produces a very
# long output. NOTE(review): consider removing before sharing the notebook.
k.apply(print)

     relevant  depth  number_bold  number_br  number_div  number_img  \
87          0      3            0          0           0           0   
88          0      3            0          0           0           0   
89          0      3            0          0           0           0   
90          0      3            0          0           0           0   
91          0      3            0          0           0           0   
92          0      3            0          0           0           0   
93          0      3            0          0           0           0   
94          0      3            0          0           0           0   
95          0      3            0          0           0           0   
96          0      3            0          0           0           0   
97          0      3            0          0           0           0   
98          0      3            0          0           0           0   
99          0      3            0          0           0        

      relevant  depth  number_bold  number_br  number_div  number_img  \
1462         1      2            0          0           0           0   
1463         1      2            0          0           0           0   
1464         1      2            0          0           0           0   
1465         1      2            0          0           0           0   
1466         1      2            0          0           0           0   
1467         1      2            0          0           0           0   
1468         1      2            0          0           0           0   
1469         1      2            0          0           0           0   
1470         1      2            0          0           0           0   
1471         1      2            0          0           0           0   
1472         1      2            0          0           0           0   
1473         1      2            0          0           0           0   
1474         1      2            0          0      