In [1]:
import pandas as pd
import re
import random
from tqdm import tqdm
import csv

# Generating random names, addresses, etc.
# http://pbpython.com/barnum.html
from barnum import gen_data

# create html
# http://www.yattag.org/
from yattag import Doc
# display html
from IPython.core.display import display, HTML
# save html to png
import imgkit

In [2]:
def line_break(times):
    s = []
    for i in range(times):
        s.append("<br />")
    return s

In [3]:
def append_func(arr):
    l = []
    for item in arr:
        if(type(item) == list):
            for itm in item:
                l.append(itm)
        else:
            l.append(item)
    return l

In [4]:
def generate_invoice_str():
    # random data lists
    domain_endings = [".com", ".org", ".net", ".us", ".co", ".ca", ".cn", ".fr", ".ch", ".at", ".au",
                      ".in", ".de", ".jp", ".nl", ".uk", ".mx", ".no", ".ru", ".br", ".se", ".es"]

    domain_beginnings = ["http://", "https://", "http://www.", "https://www."]

    customer_address_titles = ["address"] # , "Adresse", "indirizzo", "direcció", "dirección", "Adresses"]

    headers = ["Invoice", "Bill"]
    #            , "Rechnung", "faturamento", "fatturazione", "laskutus", "naplate",
    #           "facturation", "facturación"]

    customer_number_titles = ["Customer no.", "Customer Number", "Customer #", "Customer  Id.", "Customer ID."]
    #                           ,"Kundennummer", "Kundennr.", "Kunden Nr.", "numéro de client", "num. de client",
    #                           "número de cliente", "broj kupca", "zákaznické číslo",
    #                           "asiakasnumero", "numero cliente","número de cliente"]

    invoice_number_titles = ["Invoice no.", "Invoice Number", "Invoice #", "Invoice  Id.", "Invoice ID"]
    #                          ,"Rechnungsnummer", "Rechnungsnr.", "Rechnungs Nr.", "faturamento número",
    #                          "fatturazione numero", "laskutus numero", "naplate broj",
    #                          "facturation nombre", "facturación número",]

    currencies = ["EUR", "€", "USD", "$"] # , "JPY", "¥", "GBP", "£", "AUD", "A$", "CAD", "C$", "CHF", "Fr"
    #              "CNY", "元", "SEK", "kr",
    #              "NZD", "NZ$", "MXN", "$", "SGD", "S$", "HKD", "HK$", "NOK", "kr", "KRW", "₩", "TRY", "₺", "RUB",
    #              "₽", "INR", "₹", "BRL", "R$", "ZAR", "R"]

    price_titles = ["Tot.", "price", "Total EPS", "Balance Due"]
    # , "Preis", "Ges.", "Gesamt", "Total", "A payer", "A pagar", "Per pagar", "Per pagare"
    
    sub_total_titles = ["Sub Tot.", "Sub t.", " "]
    
    
    # Random Styles
    alignments = ['left', 'center', 'right']

    font_families = ['Arial', 'Helvetica', 'Times New Roman',
                     'Times', 'Courier New', 'Courier ']

    
    #
    #
    #### generate random entities ####
    spacings = [line_break(1), line_break(2), line_break(3), line_break(4)]

    company_font = 'font-family: "%s"; font-size: %i;' % (
        random.choice(font_families), random.randint(12, 18))
    company_block_align = 'text-align: %s;' % random.choice(alignments)

    company_name = {'content': gen_data.create_company_name(),
                    'desc': 'company_name', 'style': '%s %s' % (company_font, company_block_align)}

    company_address = "%s %s %s, %s %s" % (gen_data.create_name()[0],
                                           gen_data.create_name()[1],
                                           gen_data.create_street(),
                                           gen_data.create_city_state_zip()[0],
                                           gen_data.create_city_state_zip()[1])
    company_address = {'content': company_address, 'desc': 'company_address',
                       'style': '%s %s' % (company_font, company_block_align)}

    company_email = {'content': gen_data.create_email(), 'desc': 'company_email',
                     'style': '%s %s' % (company_font, company_block_align)}

    company_website = "%s%s%s" % (random.choice(domain_beginnings),
                                  re.sub(
                                      ' ', '_', company_name['content'].lower()),
                                  random.choice(domain_endings))
    company_website = {'content': company_website, 'desc': 'company_website',
                       'style': '%s %s' % (company_font, company_block_align)}

    customer_address_title = random.choice(customer_address_titles)

    customer_address = "%s %s %s, %s %s" % (gen_data.create_name()[0],
                                            gen_data.create_name()[1],
                                            gen_data.create_street(),
                                            gen_data.create_city_state_zip()[0],
                                            gen_data.create_city_state_zip()[1])

    header = {'content': random.choice(headers), 'desc': 'header',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(18, 30)))}

    invoice_date = random.choice([
        "%s.%s.%s" % (gen_data.create_date().day,
                      gen_data.create_date().month, gen_data.create_date().year),
        "%s-%s-%s" % (gen_data.create_date().year, gen_data.create_date().day, gen_data.create_date().month)])
    invoice_date = {'content': invoice_date, 'desc': 'invoice_date', 'label': invoice_date,
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    customer_number_title = random.choice(customer_number_titles)
    customer_number = str(random.randint(1, 10 ** random.randint(7, 12)))
    invoice_number_title = random.choice(invoice_number_titles)
    invoice_number = str(random.randint(1, 10 ** random.randint(7, 12)))

    price_title = random.choice(price_titles)
    sub_total_title = random.choice(sub_total_titles)
    price_currency = random.choice(currencies)
    total_price = random.randint(1, 10 ** random.randint(2, 6))/100
    price = str(total_price)

    num_products = random.randint(1,5)

    product_price_percentage = []
    for item in range(num_products):
        product_price_percentage.append(random.randint(1,100))

    sum_percentage_tmp = sum(product_price_percentage)
    for i in range(num_products):
        product_price_percentage[i] = round(product_price_percentage[i]/sum_percentage_tmp,2)

    #
    #
    #### create block segments ####
    spacing = random.choice(spacings)

    company_block = append_func([company_name, company_address, company_email, company_website])

    customer_address_block = "%s : %s" % (customer_address_title, customer_address)
    customer_address_block = {'content': customer_address_block, 'desc': 'customer_address_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    customer_num_block = []
    customer_num_block.append(customer_number_title)
    if(random.random() < 0.5):
        customer_num_block.append(" :")
    customer_num_block.append(" ")
    customer_num_block.append(customer_number)
    customer_num_block = "".join(customer_num_block)
    customer_num_block = {'content': customer_num_block, 'desc': 'customer_num_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    invoice_numb_block = []
    if(random.random() < 0.8):
        invoice_numb_block.append(invoice_number_title)
        if(random.random() < 0.5):
            invoice_numb_block.append(" :")
    invoice_numb_block.append(" ")
    invoice_numb_block.append(invoice_number)
    invoice_numb_block = "".join(invoice_numb_block)
    invoice_numb_block = {'content': invoice_numb_block, 'desc': 'invoice_numb_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    id_block = append_func([customer_num_block, invoice_numb_block])

    price_block = "%s %s %s" % (price_title, price_currency, price)
    price_block = {'content': price_block, 'desc': 'price_block', 'label': price,
                   'style': 'text-align: %s; font-size: %s;' %
                   (random.choice(['left', 'center', 'right']), str(random.randint(14, 20)))} # , 'style':'background-color: red;'

    sub_total_block = "%s %s %s" % (sub_total_title, price_currency, price)
    sub_total_block = {'content': sub_total_block, 'desc': 'sub_total_block',
                   'style': 'text-align: %s; font-size: %s;' %
                   (random.choice(['left', 'center', 'right']), str(random.randint(14, 20)))} # , 'style':'background-color: red;'

    product_block = []
    tmp_style = 'text-align: %s; font-size: %s;' % (
        random.choice(['left', 'center']), str(random.randint(12, 20)))

    for item in product_price_percentage:
        tmp_str = gen_data.create_nouns(random.randint(1,6))
        if(random.random() < 0.5):
            tmp_str += ":"
        tmp_str += " "
        tmp_str += str(round(total_price*item,2)) 

        product_block.append({
            'content': tmp_str,
            'desc': 'product',
            'style': tmp_style
        })
    
    #
    #
    #### create structure ####
    spacing_num = random.randint(0,2)
#     spacing_num = 0

    # add upper section
    upper_section_tmp = [company_block, customer_address_block, header, invoice_date]
    random.shuffle(upper_section_tmp)

    upper_section = []
    for item in upper_section_tmp:
        upper_section.append(item)
        upper_section.append(line_break(spacing_num))
    upper_section.append(line_break(1))

    s = append_func(upper_section)

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # introduce randomness
    if(random.random() < 0.333):
        s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'})
        s = append_func([s, line_break(1)])

    # add middle section
    middle_section_tmp = [id_block, {'content': gen_data.create_sentence(), 'desc': 'random_sentence'}]
    random.shuffle(middle_section_tmp)

    middle_section = []
    for item in middle_section_tmp:
        middle_section.append(item)
        middle_section.append(line_break(spacing_num))

    middle_section = append_func(middle_section)
    s = append_func([s, middle_section])

    # introduce horizontal bars
    if(random.random() < 0.4):
        s.append("SPECCHAR#BAR#")
        
    # add price block
    s = append_func([s, product_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        if(random.random() < 0.6):
            s.append("SPECCHAR#BAR#")

        # add price block
        s = append_func([s, sub_total_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # add price block
    s = append_func([s, price_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # introduce randomness
    if(random.random() < 0.666):
        s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'})
        s = append_func([s, line_break(1)])

    # add ending
    s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'}) 
    
    
    return s, spacing_num

In [5]:
def generate_html_and_save_invoice(s, out_name, spacing_num, doc_size_rand, label=False):
    doc, tag, text = Doc().tagtext()

    if (label):
        body_noise = "background-image : url(http://api.thumbr.it/whitenoise-361x370.png?background=ffffffff&noise=5c5c5c&density=0&opacity=0);"
    else:
        body_noise = "background-image : url(http://api.thumbr.it/whitenoise-361x370.png?background=ffffffff&noise=5c5c5c&density=%s&opacity=%s);" % (
        random.randint(0,40), random.randint(10,90))
    
    with tag('html'):
        with tag('body', style = body_noise):
            with tag('div', style="padding: 15"):
                for item in s:
                    if (item == "<br />"):
                        with tag('br'):
                            text("")
                    elif (item == "SPECCHAR#BAR#"):
                        with tag('hr'): text("")
                    else:
                        if 'style' in item.keys():
                                curr_style = item['style']
                        else:
                            curr_style = ""
                            
                        if(label):
                            curr_style += "color: white;"

                        with tag('p', id = 'main', style="margin: 2; padding: 0; %s" % curr_style):
                            if(item['desc'] == 'price_block'):
                                text(item['content'].replace(item['label'], ""))
                                with tag('span', style= "background-color: red; color: red;" if label else ""):
                                    text('%s' % item['label'])
                            elif(item['desc'] == 'invoice_date'):
                                text(item['content'].replace(item['label'], ""))
                                with tag('span', style= "background-color: green; color: green;" if label else ""):
                                    text('%s' % item['label'])
                            else:
                                text(item['content'])

    result = doc.getvalue()
#     display(HTML(result))

    # document format
    IMG_QUALITY = 10

    if(spacing_num == 0):
        if(doc_size_rand < 0.5):
            options = {'width': 300, 'quality': IMG_QUALITY}
        else:
            options = {'width': 350, 'quality': IMG_QUALITY}
    elif(spacing_num == 1):
        if(doc_size_rand < 0.5):
            options = {'width': 450, 'quality': IMG_QUALITY}
        else:
            options = {'width': 500, 'quality': IMG_QUALITY}
    else:
        options = {'width': 900, 'height': 1250, 'quality': IMG_QUALITY}

#     print(spacing_num, options)
    imgkit.from_string(result, '%s.png' % out_name, options=options)

    return None

In [6]:
# test
i = 0
s, spacing_num = generate_invoice_str()
doc_size_rand = random.random()
generate_html_and_save_invoice(s, "test/out_%i_label" % i, spacing_num, doc_size_rand, True)
generate_html_and_save_invoice(s, "test/out_%i" % i, spacing_num, doc_size_rand, False)

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


In [7]:
out_folder = "invoice_data"

# create data folder
try:
    os.mkdir(out_folder)
except:
    print("%s folder already created!" % out_folder)

# Open/Create a file to append data
csvFile = open('%s/gen_invoices_labels.csv' % out_folder, 'w')
# Use csv Writer
csvWriter = csv.writer(csvFile)
csvWriter.writerow(['file_name', 'label_file_name', 'date', 'price'])
    
for i in tqdm(range(1000)):
    s, spacing_num = generate_invoice_str()
    
    doc_size_rand = random.random()
    file_name = "%s/out_%i" % (out_folder, i)
    label_file_name = "%s/out_%i_label" % (out_folder, i)
    generate_html_and_save_invoice(s, file_name, spacing_num, doc_size_rand, True)
    generate_html_and_save_invoice(s, label_file_name, spacing_num, doc_size_rand, False)
    
    # TODO: save as dict because order might get fucked up
    row = []
    row.append(file_name)
    row.append(label_file_name)
    for item in s:
        if (type(item) == dict):
            if 'label' in item.keys():
                row.append(item['label'])
    print(row)
    csvWriter.writerow(row)

invoice_data folder already created!


FileNotFoundError: [Errno 2] No such file or directory: 'invoice_data/gen_invoices_labels.csv'