In [10]:
import pandas as pd
import re
import random
from tqdm import tqdm
import csv
import os

# Generating random names, addresses, etc.
# http://pbpython.com/barnum.html
from barnum import gen_data

# create html
# http://www.yattag.org/
from yattag import Doc
# display html
from IPython.core.display import display, HTML
# save html to png
import imgkit

In [11]:
def line_break(times):
    s = []
    for i in range(times):
        s.append("<br />")
    return s

In [12]:
def append_func(arr):
    l = []
    for item in arr:
        if(type(item) == list):
            for itm in item:
                l.append(itm)
        else:
            l.append(item)
    return l

In [13]:
def generate_invoice_str():
    # random data lists
    domain_endings = [".com", ".org", ".net", ".us", ".co", ".ca", ".cn", ".fr", ".ch", ".at", ".au",
                      ".in", ".de", ".jp", ".nl", ".uk", ".mx", ".no", ".ru", ".br", ".se", ".es"]

    domain_beginnings = ["http://", "https://", "http://www.", "https://www."]

    customer_address_titles = ["address"] # , "Adresse", "indirizzo", "direcció", "dirección", "Adresses"]

    headers = ["Invoice", "Bill"]
    #            , "Rechnung", "faturamento", "fatturazione", "laskutus", "naplate",
    #           "facturation", "facturación"]

    customer_number_titles = ["Customer no.", "Customer Number", "Customer #", "Customer  Id.", "Customer ID."]
    #                           ,"Kundennummer", "Kundennr.", "Kunden Nr.", "numéro de client", "num. de client",
    #                           "número de cliente", "broj kupca", "zákaznické číslo",
    #                           "asiakasnumero", "numero cliente","número de cliente"]

    invoice_number_titles = ["Invoice no.", "Invoice Number", "Invoice #", "Invoice  Id.", "Invoice ID"]
    #                          ,"Rechnungsnummer", "Rechnungsnr.", "Rechnungs Nr.", "faturamento número",
    #                          "fatturazione numero", "laskutus numero", "naplate broj",
    #                          "facturation nombre", "facturación número",]

    currencies = ["EUR", "€", "USD", "$"] # , "JPY", "¥", "GBP", "£", "AUD", "A$", "CAD", "C$", "CHF", "Fr"
    #              "CNY", "元", "SEK", "kr",
    #              "NZD", "NZ$", "MXN", "$", "SGD", "S$", "HKD", "HK$", "NOK", "kr", "KRW", "₩", "TRY", "₺", "RUB",
    #              "₽", "INR", "₹", "BRL", "R$", "ZAR", "R"]

    price_titles = ["Tot.", "price", "Total EPS", "Balance Due"]
    # , "Preis", "Ges.", "Gesamt", "Total", "A payer", "A pagar", "Per pagar", "Per pagare"
    
    sub_total_titles = ["Sub Tot.", "Sub t.", " "]
    
    
    # Random Styles
    alignments = ['left', 'center', 'right']

    font_families = ['Arial', 'Helvetica', 'Times New Roman',
                     'Times', 'Courier New', 'Courier ']

    
    #
    #
    #### generate random entities ####
    spacings = [line_break(1), line_break(2), line_break(3), line_break(4)]

    company_font = 'font-family: "%s"; font-size: %i;' % (
        random.choice(font_families), random.randint(12, 18))
    company_block_align = 'text-align: %s;' % random.choice(alignments)

    company_name = {'content': gen_data.create_company_name(),
                    'desc': 'company_name', 'style': '%s %s' % (company_font, company_block_align)}

    company_address = "%s %s %s, %s %s" % (gen_data.create_name()[0],
                                           gen_data.create_name()[1],
                                           gen_data.create_street(),
                                           gen_data.create_city_state_zip()[0],
                                           gen_data.create_city_state_zip()[1])
    company_address = {'content': company_address, 'desc': 'company_address',
                       'style': '%s %s' % (company_font, company_block_align)}

    company_email = {'content': gen_data.create_email(), 'desc': 'company_email',
                     'style': '%s %s' % (company_font, company_block_align)}

    company_website = "%s%s%s" % (random.choice(domain_beginnings),
                                  re.sub(
                                      ' ', '_', company_name['content'].lower()),
                                  random.choice(domain_endings))
    company_website = {'content': company_website, 'desc': 'company_website',
                       'style': '%s %s' % (company_font, company_block_align)}

    customer_address_title = random.choice(customer_address_titles)

    customer_address = "%s %s %s, %s %s" % (gen_data.create_name()[0],
                                            gen_data.create_name()[1],
                                            gen_data.create_street(),
                                            gen_data.create_city_state_zip()[0],
                                            gen_data.create_city_state_zip()[1])

    header = {'content': random.choice(headers), 'desc': 'header',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(18, 30)))}

    invoice_date = random.choice([
        "%s.%s.%s" % (gen_data.create_date().day,
                      gen_data.create_date().month, gen_data.create_date().year),
        "%s-%s-%s" % (gen_data.create_date().year, gen_data.create_date().day, gen_data.create_date().month)])
    invoice_date = {'content': invoice_date, 'desc': 'invoice_date', 'label': invoice_date,
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    customer_number_title = random.choice(customer_number_titles)
    customer_number = str(random.randint(1, 10 ** random.randint(7, 12)))
    invoice_number_title = random.choice(invoice_number_titles)
    invoice_number = str(random.randint(1, 10 ** random.randint(7, 12)))

    price_title = random.choice(price_titles)
    sub_total_title = random.choice(sub_total_titles)
    price_currency = random.choice(currencies)
    total_price = random.randint(1, 10 ** random.randint(2, 6))/100
    price = str(total_price)

    num_products = random.randint(1,5)

    product_price_percentage = []
    for item in range(num_products):
        product_price_percentage.append(random.randint(1,100))

    sum_percentage_tmp = sum(product_price_percentage)
    for i in range(num_products):
        product_price_percentage[i] = round(product_price_percentage[i]/sum_percentage_tmp,2)

    #
    #
    #### create block segments ####
    spacing = random.choice(spacings)

    company_block = append_func([company_name, company_address, company_email, company_website])

    customer_address_block = "%s : %s" % (customer_address_title, customer_address)
    customer_address_block = {'content': customer_address_block, 'desc': 'customer_address_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    customer_num_block = []
    customer_num_block.append(customer_number_title)
    if(random.random() < 0.5):
        customer_num_block.append(" :")
    customer_num_block.append(" ")
    customer_num_block.append(customer_number)
    customer_num_block = "".join(customer_num_block)
    customer_num_block = {'content': customer_num_block, 'desc': 'customer_num_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    invoice_numb_block = []
    if(random.random() < 0.8):
        invoice_numb_block.append(invoice_number_title)
        if(random.random() < 0.5):
            invoice_numb_block.append(" :")
    invoice_numb_block.append(" ")
    invoice_numb_block.append(invoice_number)
    invoice_numb_block = "".join(invoice_numb_block)
    invoice_numb_block = {'content': invoice_numb_block, 'desc': 'invoice_numb_block',
              'style': 'text-align: %s; font-size: %s;' %
              (random.choice(['left', 'center']), str(random.randint(12, 20)))}

    id_block = append_func([customer_num_block, invoice_numb_block])

    price_block = "%s %s %s" % (price_title, price_currency, price)
    price_block = {'content': price_block, 'desc': 'price_block', 'label': price,
                   'style': 'text-align: %s; font-size: %s;' %
                   (random.choice(['left', 'center', 'right']), str(random.randint(14, 20)))} # , 'style':'background-color: red;'

    sub_total_block = "%s %s %s" % (sub_total_title, price_currency, price)
    sub_total_block = {'content': sub_total_block, 'desc': 'sub_total_block',
                   'style': 'text-align: %s; font-size: %s;' %
                   (random.choice(['left', 'center', 'right']), str(random.randint(14, 20)))} # , 'style':'background-color: red;'

    product_block = []
    tmp_style = 'text-align: %s; font-size: %s;' % (
        random.choice(['left', 'center']), str(random.randint(12, 20)))

    for item in product_price_percentage:
        tmp_str = gen_data.create_nouns(random.randint(1,6))
        if(random.random() < 0.5):
            tmp_str += ":"
        tmp_str += " "
        tmp_str += str(round(total_price*item,2)) 

        product_block.append({
            'content': tmp_str,
            'desc': 'product',
            'style': tmp_style
        })
    
    #
    #
    #### create structure ####
    spacing_num = random.randint(0,2)
#     spacing_num = 0

    # add upper section
    upper_section_tmp = [company_block, customer_address_block, header, invoice_date]
    random.shuffle(upper_section_tmp)

    upper_section = []
    for item in upper_section_tmp:
        upper_section.append(item)
        upper_section.append(line_break(spacing_num))
    upper_section.append(line_break(1))

    s = append_func(upper_section)

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # introduce randomness
    if(random.random() < 0.333):
        s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'})
        s = append_func([s, line_break(1)])

    # add middle section
    middle_section_tmp = [id_block, {'content': gen_data.create_sentence(), 'desc': 'random_sentence'}]
    random.shuffle(middle_section_tmp)

    middle_section = []
    for item in middle_section_tmp:
        middle_section.append(item)
        middle_section.append(line_break(spacing_num))

    middle_section = append_func(middle_section)
    s = append_func([s, middle_section])

    # introduce horizontal bars
    if(random.random() < 0.4):
        s.append("SPECCHAR#BAR#")
        
    # add price block
    s = append_func([s, product_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        if(random.random() < 0.6):
            s.append("SPECCHAR#BAR#")

        # add price block
        s = append_func([s, sub_total_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # add price block
    s = append_func([s, price_block, line_break(1)])

    # introduce horizontal bars
    if(random.random() < 0.9):
        s.append("SPECCHAR#BAR#")

    # introduce randomness
    if(random.random() < 0.666):
        s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'})
        s = append_func([s, line_break(1)])

    # add ending
    s.append({'content': gen_data.create_sentence(), 'desc': 'random_sentence'}) 
    
    
    return s, spacing_num

In [33]:
def generate_html_and_save_invoice(s, out_name, spacing_num, doc_size_rand, label=False):
    doc, tag, text = Doc().tagtext()

    if (label):
        body_noise = "background-image : url(http://api.thumbr.it/whitenoise-361x370.png?background=ffffffff&noise=5c5c5c&density=0&opacity=0);"
    else:
        body_noise = "background-image : url(http://api.thumbr.it/whitenoise-361x370.png?background=ffffffff&noise=5c5c5c&density=%s&opacity=%s);" % (
        random.randint(0,100), random.randint(0,30))
    
    with tag('html'):
        with tag('body', style = body_noise):
            with tag('div', style="padding: 15"):
                for item in s:
                    if (item == "<br />"):
                        with tag('br'):
                            text("")
                    elif (item == "SPECCHAR#BAR#"):
                        with tag('hr'): text("")
                    else:
                        if 'style' in item.keys():
                                curr_style = item['style']
                        else:
                            curr_style = ""
                            
                        if(label):
                            curr_style += "color: white;"

                        with tag('p', id = 'main', style="margin: 2; padding: 0; %s" % curr_style):
                            if(item['desc'] == 'price_block'):
                                text(item['content'].replace(item['label'], ""))
                                with tag('span', style= "background-color: red; color: red;" if label else ""):
                                    text('%s' % item['label'])
                            elif(item['desc'] == 'invoice_date'):
                                text(item['content'].replace(item['label'], ""))
                                with tag('span', style= "background-color: green; color: green;" if label else ""):
                                    text('%s' % item['label'])
                            else:
                                text(item['content'])

    result = doc.getvalue()
#     display(HTML(result))

    # document format
    IMG_QUALITY = 10

    if(spacing_num == 0):
        if(doc_size_rand < 0.5):
            options = {'width': 300, 'quality': IMG_QUALITY}
        else:
            options = {'width': 350, 'quality': IMG_QUALITY}
    elif(spacing_num == 1):
        if(doc_size_rand < 0.5):
            options = {'width': 450, 'quality': IMG_QUALITY}
        else:
            options = {'width': 500, 'quality': IMG_QUALITY}
    else:
        options = {'width': 900, 'height': 1250, 'quality': IMG_QUALITY}

#     print(spacing_num, options)
    imgkit.from_string(result, '%s.png' % out_name, options=options)

    return None

In [34]:
def generate_invoice_loop(out_folder, num_invoices, start_idx = 0, nlp = True):
    if (start_idx == 0):
        csvFile = open('%s/gen_invoices_labels.csv' % out_folder, 'w', encoding='utf-8')
        # Use csv Writer
        csvWriter = csv.writer(csvFile)
        if (nlp):
            csvWriter.writerow(['date', 'price', 'str_input', 'str_label'])
        else:
            csvWriter.writerow(['file_name', 'label_file_name', 'date', 'price', 'str_input', 'str_label'])
    else:
        num_invoices = num_invoices + start_idx
        csvFile = open('%s/gen_invoices_labels.csv' % out_folder, 'a', encoding='utf-8')
        csvWriter = csv.writer(csvFile)
        
    new_line_char = "\\n"

    for i in tqdm(range(start_idx+1, num_invoices+1)):
        s, spacing_num = generate_invoice_str()

        doc_size_rand = random.random()
        file_name = "%s/out_%i" % (out_folder, i)
        label_file_name = "%s/out_%i_label" % (out_folder, i)
        
        if (nlp == False):
            generate_html_and_save_invoice(s, file_name, spacing_num, doc_size_rand, False)
            generate_html_and_save_invoice(s, label_file_name, spacing_num, doc_size_rand, True)

        # TODO: save as dict because order might get fucked up
        row = []
        
        if (nlp == False):
            row.append(file_name)
            row.append(label_file_name)
            
        str_input = ""
        str_label = ""
        
        for item in s:
            if (type(item) == dict):
                str_input += item['content'] + new_line_char
                if 'label' in item.keys():
                    row.append(item['label'])

                    if(item['desc'] == 'price_block'):
                        str_label += "0" * len(item['content'].replace(item['label'], ""))
                        str_label += "1" * len(item['label']) + new_line_char
                    elif(item['desc'] == 'invoice_date'):
                        str_label += "2" * len(item['label']) + new_line_char
                    else:
                        None
                else:
                    str_label += "0" * len(item['content']) + new_line_char

        row.append(str_input)
        row.append(str_label)
    #     print(row)
        csvWriter.writerow(row)

    return None

# Generate IMG Data

In [None]:
img_out_folder = "invoice_img_data"

# create data folder
try:
    os.mkdir(img_out_folder)
except FileExistsError:
    print("%s folder already created!" % img_out_folder)

img_list = [x for x in os.listdir(img_out_folder) if ".png" in x and "label" not in x]
if (len(img_list) > 0):
    start_idx = max([int(re.sub('\D+', '', x)) for x in img_list])
else:
    start_idx = 0

generate_invoice_loop(img_out_folder, 100, start_idx, False)

  0%|          | 0/100 [00:00<?, ?it/s]

invoice_img_data folder already created!
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  1%|          | 1/100 [00:06<10:25,  6.32s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  2%|▏         | 2/100 [00:10<09:26,  5.78s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  3%|▎         | 3/100 [00:16<09:11,  5.69s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  4%|▍         | 4/100 [00:20<08:23,  5.25s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  5%|▌         | 5/100 [00:26<08:30,  5.37s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  6%|▌         | 6/100 [00:31<08:23,  5.36s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  7%|▋         | 7/100 [00:36<08:17,  5.35s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  8%|▊         | 8/100 [00:42<08:32,  5.57s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


  9%|▉         | 9/100 [00:47<08:01,  5.29s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 10%|█         | 10/100 [00:53<08:11,  5.46s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 11%|█         | 11/100 [00:59<08:21,  5.63s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 12%|█▏        | 12/100 [01:07<09:06,  6.21s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 13%|█▎        | 13/100 [01:13<08:57,  6.18s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 14%|█▍        | 14/100 [01:18<08:36,  6.01s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 15%|█▌        | 15/100 [01:23<08:00,  5.65s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 16%|█▌        | 16/100 [01:28<07:27,  5.32s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 17%|█▋        | 17/100 [01:33<07:11,  5.20s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 18%|█▊        | 18/100 [01:37<07:00,  5.13s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 19%|█▉        | 19/100 [01:42<06:47,  5.03s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 20%|██        | 20/100 [01:48<07:02,  5.29s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 21%|██        | 21/100 [01:57<08:14,  6.26s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 22%|██▏       | 22/100 [02:01<07:21,  5.66s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 23%|██▎       | 23/100 [02:06<07:12,  5.61s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 24%|██▍       | 24/100 [02:13<07:17,  5.76s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 25%|██▌       | 25/100 [02:18<06:59,  5.60s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 26%|██▌       | 26/100 [02:24<06:59,  5.67s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 27%|██▋       | 27/100 [02:31<07:28,  6.15s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 28%|██▊       | 28/100 [02:39<08:13,  6.86s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 29%|██▉       | 29/100 [02:46<08:02,  6.79s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 30%|███       | 30/100 [02:50<06:51,  5.88s/it]

Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               
Loading page (1/2)
Rendering (2/2)                                                    
Done                                                               


 31%|███       | 31/100 [02:58<07:28,  6.50s/it]

In [32]:
# test
pd.read_csv("%s/gen_invoices_labels.csv" % img_out_folder)#.head()

Unnamed: 0,file_name,label_file_name,date,price,str_input,str_label
0,invoice_img_data/out_0,invoice_img_data/out_0_label,3.2.2019,1.05,3.2.2019\nBill\naddress : Guadalupe Cobb 8737 ...,22222222\n0000\n000000000000000000000000000000...
1,invoice_img_data/out_1,invoice_img_data/out_1_label,15.12.2021,6309.23,Invoice\naddress : Ike Lance 4460 Bubbling Bro...,0000000\n0000000000000000000000000000000000000...
2,invoice_img_data/out_2,invoice_img_data/out_2_label,17.1.2023,0.3,Max Net East\nDaniel Royster 2696 Summer Field...,000000000000\n00000000000000000000000000000000...
3,invoice_img_data/out_3,invoice_img_data/out_3_label,23.8.2027,0.9,Invoice\naddress : Dalia Harr 8238 Fair Harbor...,0000000\n0000000000000000000000000000000000000...
4,invoice_img_data/out_4,invoice_img_data/out_4_label,17.11.2020,60.73,Bill\nContract Advanced Studio\nPatrice Ostran...,0000\n000000000000000000000000\n00000000000000...
5,invoice_img_data/out_5,invoice_img_data/out_5_label,2023-9-2,272.36,2023-9-2\nBill\nElectronic People Data\nMillar...,22222222\n0000\n0000000000000000000000\n000000...
6,invoice_img_data/out_6,invoice_img_data/out_6_label,29.10.2020,1.47,Building Atlantic Direct\nZachary Bachman 4155...,000000000000000000000000\n00000000000000000000...
7,invoice_img_data/out_7,invoice_img_data/out_7_label,2021-26-9,1847.02,2021-26-9\nBill\naddress : Danelle Dulaney 500...,222222222\n0000\n00000000000000000000000000000...
8,invoice_img_data/out_8,invoice_img_data/out_8_label,12.2.2020,757.4,12.2.2020\naddress : Madeline Hickman 4069 Wil...,222222222\n00000000000000000000000000000000000...
9,invoice_img_data/out_9,invoice_img_data/out_9_label,2025-10-5,41.89,Bill\nAtlantic Max Provider\nMirna Dionne 7565...,0000\n000000000000000000000\n00000000000000000...


# Generate NLP Data

In [18]:
nlp_out_folder = "invoice_nlp_data"

# create data folder
try:
    os.mkdir(nlp_out_folder)
except FileExistsError:
    print("%s folder already created!" % nlp_out_folder)


# to do write start idx by checking existing csv
generate_invoice_loop(nlp_out_folder, 300, 0, True)



  0%|          | 0/10000 [00:00<?, ?it/s][A[A

  0%|          | 21/10000 [00:00<00:49, 203.55it/s][A[A

invoice_nlp_data folder already created!




  0%|          | 39/10000 [00:00<00:51, 192.79it/s][A[A

  1%|          | 56/10000 [00:00<00:54, 183.31it/s][A[A

  1%|          | 78/10000 [00:00<00:51, 192.17it/s][A[A

  1%|          | 93/10000 [00:00<01:00, 164.63it/s][A[A

  1%|          | 109/10000 [00:00<01:01, 161.20it/s][A[A

  1%|          | 124/10000 [00:00<01:08, 144.48it/s][A[A

  1%|▏         | 138/10000 [00:00<01:21, 120.57it/s][A[A

  2%|▏         | 160/10000 [00:01<01:10, 138.82it/s][A[A

  2%|▏         | 182/10000 [00:01<01:02, 155.84it/s][A[A

  2%|▏         | 199/10000 [00:01<01:06, 147.05it/s][A[A

  2%|▏         | 220/10000 [00:01<01:00, 160.97it/s][A[A

  2%|▏         | 241/10000 [00:01<00:56, 172.57it/s][A[A

  3%|▎         | 260/10000 [00:01<00:57, 170.12it/s][A[A

  3%|▎         | 279/10000 [00:01<00:56, 172.97it/s][A[A

  3%|▎         | 297/10000 [00:01<01:03, 151.85it/s][A[A

  3%|▎         | 314/10000 [00:01<01:05, 148.52it/s][A[A

  3%|▎         | 330/10000 [00:02<01:06, 1

 68%|██████▊   | 6840/10000 [00:28<00:11, 269.53it/s][A[A

 69%|██████▊   | 6867/10000 [00:28<00:11, 267.32it/s][A[A

 69%|██████▉   | 6894/10000 [00:28<00:11, 267.23it/s][A[A

 69%|██████▉   | 6921/10000 [00:28<00:11, 266.84it/s][A[A

 70%|██████▉   | 6950/10000 [00:28<00:11, 271.80it/s][A[A

 70%|██████▉   | 6978/10000 [00:29<00:11, 273.76it/s][A[A

 70%|███████   | 7006/10000 [00:29<00:11, 269.33it/s][A[A

 70%|███████   | 7033/10000 [00:29<00:11, 269.02it/s][A[A

 71%|███████   | 7061/10000 [00:29<00:10, 271.22it/s][A[A

 71%|███████   | 7089/10000 [00:29<00:11, 257.10it/s][A[A

 71%|███████   | 7117/10000 [00:29<00:10, 263.15it/s][A[A

 71%|███████▏  | 7147/10000 [00:29<00:10, 271.44it/s][A[A

 72%|███████▏  | 7176/10000 [00:29<00:10, 275.15it/s][A[A

 72%|███████▏  | 7205/10000 [00:29<00:10, 277.11it/s][A[A

 72%|███████▏  | 7234/10000 [00:30<00:09, 278.89it/s][A[A

 73%|███████▎  | 7264/10000 [00:30<00:09, 283.99it/s][A[A

 73%|███████▎  | 7293/10

In [15]:
print(str_input)

Hardware Vision Telecom\nJung Hills 4207 Effingham Avenue, 29581 Jefferson City\nJenna.Roth@aliquamquis.tv\nhttp://www.hardware_vision_telecom.de\n21.5.2027\naddress : Patsy Kimble 1086 Cordova Trail, 52307 Caldwell\nBill\nEsse lorem blandit dolor duis suscipit et eufeugiat facilisis nisl nibh erat et etaccumsan.\nCustomer Number 35080723484\nInvoice ID 569169530878\nnest tomato anatomy health: 0.05\nbrandy Patricia plywood freeze bait fine 0.31\nSub Tot. € 0.36\nprice € 0.36\nConsectetuer iusto exerci accumsan vel nonummy ad in et exerci.\nEt volutpat duis iusto facilisi facilisi qui vel nonummy dolore duis accumsan in laoreet.\n


In [16]:
print(str_label)

00000000 000000 0000000\n0000 00000 0000 000000000 0000000 00000 000000000 0000\n0000000000000000000000000\n0000000000000000000000000000000000000\n000000000\n0000000 0 00000 000000 0000 0000000 000000 00000 00000000\n0000\n0000 00000 0000000 00000 0000 00000000 00 000000000 000000000 0000 0000 0000 00 00000000000\n00000000 000000 00000000000\n0000000 00 000000000000\n0000 000000 0000000 0000000 0000\n000000 00000000 0000000 000000 0000 0000 0000\n000 0000 0 0000\n00000 0 0000\n000000000000 00000 000000 00000000 000 0000000 00 00 00 0000000\n00 00000000 0000 00000 00000000 00000000 000 000 0000000 000000 0000 00000000 00 00000000\n


In [None]:
# test
pd.read_csv("%s/gen_invoices_labels.csv" % nlp_out_folder).head()