In [65]:
import os
import lxml.html as lx

import DefaultFeatures
import DomUtils 
from FeaturesExtractor import *

def csv_line(features, other=[]):
    f = []
    for k, v in sorted(other, key=lambda pair: pair[0]):
        f.append(v)
        
    for name in sorted(features.keys()):
        f.append(str(features[name]))
        
    return "\t".join(f)

def csv_header_from_features(features, other=[]):
    header = []
    for k in sorted(other):
        header.append(k)
    
    header = header + sorted(features.keys())
    return "\t".join(header)
    
def read_xpath_ground_truth(path):
    domain2xpath = {}
    with open(path, "r") as gf:
        for line in gf.readlines():
            domain = line.split("\t")[0].strip()
            xpath = line.split("\t")[1]
            
            if domain not in domain2xpath:
                domain2xpath[domain] = []
            domain2xpath[domain].append(xpath.strip()) 
    return domain2xpath

In [66]:
#dataset_path = os.sep.join(["..", "camera"])
dataset_path = "/run/media/nan/agiw/camera/"

domain2xpath = read_xpath_ground_truth("data/list_xpath_ground_truth.txt")

# Extract default features
features = DefaultFeatures.list
list(features.keys())

['number_row',
 'number_relevants',
 'relevants_ratio',
 'number_links',
 'number_bold',
 'number_img',
 'number_div',
 'number_p',
 'number_br',
 'depth',
 'avg_tag_in_li']

In [67]:
# ...and/or extend with new features

In [68]:
with open("list_features.txt", "w") as tf:
    ft = FeaturesExtractor()
    
    # write header
    header = csv_header_from_features(features, other=['file', 'relevant'])
    tf.write("{}\n".format(header))
    
    #analyze each domain
    #for domain in ["walmart.com"]:
    for domain in domain2xpath:
        if (domain == "walmart.com" or domain == "ebay.com" or domain == "amazon.com" or
            domain == "www.amazon.fr" or domain == "www.amazon.in" or domain == "www.amazon.co.uk"):
            continue
        
        domain = domain.strip()
        relevant_count = 0
        not_relevant_count = 0
    
        for page in os.listdir(os.sep.join([dataset_path, domain])):
            if not page.endswith("html"):
                continue
            
            file_name = os.sep.join([domain, page])
            full_file_path = os.sep.join([dataset_path, file_name])

            dom = lx.parse(full_file_path)
        
            # apply any known xpath and save features of any item
            relevants_items = set()
            for xpath in domain2xpath[domain]:
                
                for node in dom.xpath(xpath):
                    relevant_count += 1
                    relevants_items.add(node)
                    
                    extracted = ft.extract(node, selected=list(features.keys()), 
                                                 features_descriptor=features)
                    
                    line = csv_line(extracted, other=[('file', file_name),
                                                      ('relevant', "1")])
                    tf.write("{}\n".format(line))
                    
            # save features of any other not relevant node
            #nodes = dom.xpath("//table")
            
            nodes = set(dom.xpath("//ol"))
            nodes = nodes.union(set(dom.xpath("//ul")))
            nodes = nodes.union(set(dom.xpath("//dl")))
            
            for node in nodes:
                if node not in relevants_items:
                    not_relevant_count += 1
                    
                    extracted = ft.extract(node, selected=list(features.keys()), 
                                                 features_descriptor=features)
                    
                    line = csv_line(extracted, other=[('file', file_name),
                                                     ('relevant', "0")])
                    tf.write("{}\n".format(line))
        print("{}, relevant: {}, not_relevant: {}".format(domain, relevant_count, not_relevant_count))

www.crazool.com, relevant: 0, not_relevant: 1028
www.pontofrio.com.br, relevant: 4794, not_relevant: 18770
www.futureshop.ca, relevant: 9, not_relevant: 398
www.gmcamera.com, relevant: 28, not_relevant: 1669
www.rakuten.com, relevant: 25, not_relevant: 21
digitalcamerabug.com, relevant: 146, not_relevant: 441
hbh-woolacotts.co.uk, relevant: 23, not_relevant: 1350
currys.co.uk, relevant: 2144, not_relevant: 6530
www.cdw.com, relevant: 294, not_relevant: 1860
www.uniquephoto.com, relevant: 306, not_relevant: 3374


In [69]:
import pandas

In [70]:
d = pandas.read_csv("list_features.txt", sep="\t")

In [71]:
d.head()

Unnamed: 0,file,relevant,avg_tag_in_li,depth,number_bold,number_br,number_div,number_img,number_links,number_p,number_relevants,number_row,relevants_ratio
0,www.crazool.com/1.html,0,0.0,1,0,0,0,0,0,0,5,4,0.294118
1,www.crazool.com/2.html,0,0.0,1,0,0,0,0,0,0,22,5,0.44
2,www.crazool.com/3.html,0,0.0,1,0,0,0,0,0,0,10,3,0.357143
3,www.crazool.com/4.html,0,0.0,1,0,0,0,0,0,0,36,8,0.521739
4,www.crazool.com/5.html,0,0.0,1,0,0,0,0,0,0,9,3,0.6


In [72]:
d = d.drop('file', 1)

group_by_keys = list(d.keys())
group_by_keys.remove("relevants_ratio")

d.head()

Unnamed: 0,relevant,avg_tag_in_li,depth,number_bold,number_br,number_div,number_img,number_links,number_p,number_relevants,number_row,relevants_ratio
0,0,0.0,1,0,0,0,0,0,0,5,4,0.294118
1,0,0.0,1,0,0,0,0,0,0,22,5,0.44
2,0,0.0,1,0,0,0,0,0,0,10,3,0.357143
3,0,0.0,1,0,0,0,0,0,0,36,8,0.521739
4,0,0.0,1,0,0,0,0,0,0,9,3,0.6


In [73]:
import random

k = d.groupby(group_by_keys)
with open("list.txt", "w") as tf:
    
    tf.write("\t".join(list(d.keys())) + "\n") 
    
    l = list(k["relevants_ratio"].apply(list).iteritems())
    random.shuffle(l)
    
    for values, relenvats_ratios in l:
        
        chosen_ratio = random.choice(relenvats_ratios)
        
        values = list(values) + [chosen_ratio]
        values = [str(round(v, 2)) for v in values]
        
        line = "\t".join(values)
        
        tf.write(line + "\n")