In [93]:
from keras.preprocessing.text import Tokenizer
import numpy as np

def load_tokenizers():
  with open("tokens.txt", "r", encoding="utf-8") as file:
      file_content = file.read()
  tokens = file_content.encode("ascii", "ignore").decode("utf-8")
  tokenizer = Tokenizer(filters='\n')
  tokenizer.fit_on_texts([tokens])

  return tokenizer

tokenizer = load_tokenizers()

print(len(tokenizer.word_index))
tokenizer.word_index

546


{'!stop!': 1,
 '!': 2,
 '#': 3,
 '$': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 ',': 11,
 '.': 12,
 '/': 13,
 '?': 14,
 '@': 15,
 '[': 16,
 '\\': 17,
 ']': 18,
 '^': 19,
 '_': 20,
 '`': 21,
 ':': 22,
 ';': 23,
 '{': 24,
 '|': 25,
 '}': 26,
 '~': 27,
 '00spaces': 28,
 '02spaces': 29,
 '04spaces': 30,
 '06spaces': 31,
 '08spaces': 32,
 '10spaces': 33,
 '12spaces': 34,
 '14spaces': 35,
 '16spaces': 36,
 '18spaces': 37,
 '20spaces': 38,
 '22spaces': 39,
 '24spaces': 40,
 '26spaces': 41,
 '28spaces': 42,
 '30spaces': 43,
 '32spaces': 44,
 '34spaces': 45,
 '36spaces': 46,
 '38spaces': 47,
 '40spaces': 48,
 '>': 49,
 '<': 50,
 '<!doctype': 51,
 '</a': 52,
 '</abbr': 53,
 '</address': 54,
 '</area': 55,
 '</article': 56,
 '</aside': 57,
 '</audio': 58,
 '</b': 59,
 '</base': 60,
 '</bdi': 61,
 '</bdo': 62,
 '</blockquote': 63,
 '</body': 64,
 '</br': 65,
 '</button': 66,
 '</canvas': 67,
 '</caption': 68,
 '</cite': 69,
 '</code': 70,
 '</col': 71,
 '</colgroup': 72,
 '</data'

In [94]:
import re

def find_identifiers(string):
    pattern = r"\bobject\b\s+(\w+)"
    object_words = re.findall(pattern, string, flags=re.IGNORECASE)
    unique_words = list(set(object_words)) #shuould be not necessary
    object_dict = {f"id{i+1}": word for i, word in enumerate(unique_words)}
    #object_dict = {k: v for k, v in sorted(object_dict.items(), key=lambda item: len(item[1]), reverse=True)}

    return object_dict

In [95]:
import re

def split_PlantUML(string):
    string = string.replace("@startuml", "")
    string = string.replace("@enduml", "")
    substrings = string.split("'SPLIT")

    return substrings[0].strip(), substrings[1].strip(), substrings[2].strip()

In [96]:
import re
import numpy as np

def prepair_uml_input(string, identifiers):
    string = string.replace("--> ", "")
    string = string.replace(">", "")
    string = string.replace(": ", "")
    string = string.replace("\n", " ")

    for key, value in identifiers.items():
      pattern = r'\b{}\b'.format(re.escape(value))
      string = re.sub(pattern, key, string)

    #tokens = plant_uml_tokenizer.texts_to_sequences([string])[0]
    #tokens = np.array(tokens)
    #tokens = np.reshape(tokens, (-1, 2))

    return string

In [97]:
def plant_uml_preprocessing(plant_uml_string):
  plant_uml_tokenizer = load_tokenizers()
  identifiers = find_identifiers(plant_uml_string)
  stuff_for_post_processing, element_type, element_order = split_PlantUML(plant_uml_string)
  element_type_tokens = prepair_uml_input(element_type, identifiers)
  element_order_tokens = prepair_uml_input(element_order, identifiers)

  return stuff_for_post_processing, identifiers, element_type_tokens, element_order_tokens

---

**1.** we need to remove stuff that is inserted after generating (or placeholders):
- [X] **text**
- [X] **indents**
- [X] **comments**
- [X] href = " "
- [X] rel = " "
- [X] type = " "
- [X] method = " "
- [X] alt = " "
- [X] names = " "
- [X] for = " "
- [X] action = " "
- [X] placeholder = ""
- [X] src = ""
- [X] value = ""
- [X] script = ""
- [X] target = ""

we need placeholder id's:
- [X] id = " "

**2.** we use spaces to seperate the sting at certain places for a better tokenizer

**3.** we count the number of indentations and replace them with "00SPACES"

**4.** we are adding a \<STOP> token to the html

---

In [98]:
def cut_body(string):
    start_tag = '<body'
    end_tag = '/body>'

    start_index = string.find(start_tag)
    if start_index == -1:
        return ""  # Return an empty string if start tag not found

    end_index = string.find(end_tag, start_index + len(start_tag))
    if end_index == -1:
        return ""  # Return an empty string if end tag not found

    return string[start_index:end_index + len(end_tag)].strip()

In [99]:
import re

def remove_html_comments(string):
    pattern = r"<!--(.*?)-->"
    return re.sub(pattern, "", string)

In [100]:
def add_stop_token(string):
  return string + "\n!STOP!"

In [101]:
import re
def whitespace_seperator(string):

  string = string.replace("<", " <")
  string = string.replace("  <", " <")

  string = string.replace(">", " >  ")
  string = string.replace(">  ", "> ")

  string = string.replace("-", " - ")

  string = string.replace("=", "  =  ")
  string = string.replace("  =", " =")
  string = string.replace("=  ", "= ")

  string = string.replace("\"", " \" ")
  string = string.replace("  \"", " \"")
  string = string.replace("\"  ", "\" ")

  string = string.replace("\'", " \' ")
  string = string.replace("  \'", " \'")
  string = string.replace("\'  ", "\' ")

  string = string.replace("\\n", " \\n ")

  return string



In [102]:
import re

def remove_content(string):
    string = string.replace("\n", "<\n")
    pattern = r'>[^<]*<'
    string = re.sub(pattern, '> <', string)
    string = string.replace("<\n", "\n")
    return string

In [103]:
import re

def remove_attribute(attribute, string):
    pattern = r'{}\s*=\s*"[^"]*"'.format(re.escape(attribute))
    return re.sub(pattern, '', string)

In [104]:
import re

def id_placeholders(string, identifiers):
  for key, value in identifiers.items():
    string = string.replace("id = \" " + value + " \"", key)

  return string

In [105]:
def remove_script(string):
  string = string.replace("</script >", "")
  string = string.replace("</script>", "")
  string = string.replace("<script  >", "")
  string = string.replace("<script >", "")
  string = string.replace("<script>", "")
  string = string.replace("script", "")
  string = string.replace("<>", "")
  string = string.replace("< >", "")
  return string

In [106]:
import re

def indent_tokenize(string):

  # replace \t
  string = string.replace("\t", "    ")

  # replace double breaks
  string = string.replace("\n\n", "\n")
  string = string.replace("\n \n", "\n")

  # stack indents
  lines = string.split("\n")
  new_lines = []
  for i in range(len(lines)):
      line = lines[i]
      if line.strip() == '':
        continue
      indent_count = len(line) - len(line.lstrip(' '))
      if indent_count % 2 != 0:
        indent_count -= 1
      indent_spaces = f"{indent_count:02d}SPACES "
      new_lines.append(line.replace(" " * indent_count, indent_spaces, 1))
  string = "\n".join(new_lines)

  # remove remaining multiple instances of spaces
  string = re.sub(r' +', ' ', string.replace('\n', ' \n')).strip()

  return string

In [107]:
def html_preprocessing(string, identifiers): # identifiers from PlantUML
  string = cut_body(string)
  string = remove_html_comments(string)
  string = add_stop_token(string)
  string = whitespace_seperator(string)
  string = remove_content(string)
  string = remove_attribute("href", string)
  string = remove_attribute("rel", string)
  string = remove_attribute("type", string)
  string = remove_attribute("method", string)
  string = remove_attribute("alt", string)
  string = remove_attribute("name", string)
  string = remove_attribute("for", string)
  string = remove_attribute("action", string)
  string = remove_attribute("placeholder", string)
  string = remove_attribute("src", string)
  string = remove_attribute("value", string)
  string = remove_attribute("style", string)
  string = remove_attribute("script", string)
  string = remove_attribute("target", string)
  string = remove_script(string)
  string = indent_tokenize(string)
  string = id_placeholders(string, identifiers) # identifiers from PlantUML
  string = remove_attribute("id", string)

  return string

In [108]:
number = 2

with open(f"training_pairs/{number}.txt", "r", encoding="windows-1255") as file:
    file_content = file.read()
input = file_content.encode("ascii", "ignore").decode("utf-8")

with open(f"training_pairs/{number}.html", "r", encoding="utf-8") as file:
    file_content = file.read()
output = file_content.encode("ascii", "ignore").decode("utf-8")

stuff_for_post_processing, identifiers, element_type, element_order = plant_uml_preprocessing(input)
stript_html = html_preprocessing(output, identifiers)

TypeError: 'list' object is not callable

In [None]:
print(element_type)

id18 <body id4 <header id9 <main id8 <p id16 <p id27 <footer id7 <li id20 <li id24 <li id5 <ul id14 <nav id21 <section id15 <link id3 <form id1 <img id2 <img id12 <Button id10 <input id22 <input id25 <select id6 <options id28 <options id19 <options id17 <link id26 <link id23 <link id13 <link id11 <link


In [None]:
print(element_order)

id18 id4 id4 id1 id4 id14 id18 id9 id9 id21 id21 id8 id21 id16 id8 id2 id16 id15 id8 id5 id5 id7 id5 id20 id5 id24 id16 id3 id3 id10 id3 id22 id3 id12 id18 id27 id27 id25 id25 id6 id25 id28 id25 id19 id14 id17 id14 id26 id14 id23 id14 id13 id14 id11


In [None]:
print(identifiers)

{'id1': 'Logo', 'id2': 'Image', 'id3': 'Form', 'id4': 'Header', 'id5': 'List', 'id6': 'Option1', 'id7': 'ListItem1', 'id8': 'Paragraph1', 'id9': 'Main', 'id10': 'InputName', 'id11': 'LinkTeams', 'id12': 'SubmitButton', 'id13': 'LinkPlayers', 'id14': 'Navigation', 'id15': 'Link', 'id16': 'Paragraph2', 'id17': 'LinkHOME', 'id18': 'Body', 'id19': 'Option3', 'id20': 'ListItem2', 'id21': 'Section1', 'id22': 'InputEmail', 'id23': 'LinkMatches', 'id24': 'ListItem3', 'id25': 'Select', 'id26': 'LinkNews', 'id27': 'Footer', 'id28': 'Option2'}


In [None]:
print(stuff_for_post_processing)

object Body

object Header

object Logo {
+ href = "logo.png"
}

object Navigation

object Main

object Section1

object Paragraph1{
- Welcome to the World of Football
}

object Paragraph2{
- Explore the latest football news, match highlights, and more!
}

object Image {
+ href = "football.jpg"
+ alt = "Football Image"
- Goaaaaal!
}

object Link {
+ href = "youtube.com/footballvideos"
+ rel =  "bla"
- Link to Videos
}

object List

object ListItem1{
- Premier League
}

object ListItem2{
- UEFA Champions League
}

object ListItem3{
- FIFA World Cup
}

object Form {
+ action = "submit.php"
+ method = "POST"
}

object InputName {
+ type = "text"
+ name = "name"
+ placeholder = "name"
}

object InputEmail {
+ type = "email"
+ name = "email"
+ placeholder = "email"
}

object SubmitButton {
+ type = "submit"
+ value = "Submit"
- Submit
}

object Select {
+ name = "dropdown"
}

object Option1 {
+ value = "option1"
- Option 1
}

object Option2 {
+ value = "option2"
- Option 2
}

object Option3

In [None]:
import tensorflow as tf
def test_plant_ulm_preprocessing(stript_html, identifiers, type_input, order_input):
        # tokenizing
    # tokenizer = load_tokenizers()
    type_input  = np.array(tokenizer.texts_to_sequences([type_input] ))[0]
    order_input = np.array(tokenizer.texts_to_sequences([order_input]))[0]
    html_output = np.array(tokenizer.texts_to_sequences([stript_html]       ))[0]

    # DATA SIZE: Get the number of tokens in html - token sequence
    data_size = len(html_output)

    # Transform type_input into a two-dimensional numpy array
    # Soft Copy
    type_input = np.tile(type_input, (1, 1))
    type_input = tf.repeat(type_input, repeats=data_size, axis=0)

    # Transform order_input into a two-dimensional numpy array
    # Soft Copy
    order_input = np.tile(order_input, (1, 1))
    order_input = tf.repeat(order_input, repeats=data_size, axis=0)

    # Transform html_input into a two-dimensional numpy array
    # Hard Copy
    html_input = np.tile(html_output, (data_size, 1))

    # html_input Padding
    for i in range(data_size):
      html_input[i, i:] = 0

    return [type_input, order_input, html_input]

In [None]:
[type_input, order_input, html_input] = test_plant_ulm_preprocessing(stript_html,identifiers, element_type, element_order)

In [None]:
print(type_input)

tf.Tensor(
[[393 174 379 ... 219 386 219]
 [393 174 379 ... 219 386 219]
 [393 174 379 ... 219 386 219]
 ...
 [393 174 379 ... 219 386 219]
 [393 174 379 ... 219 386 219]
 [393 174 379 ... 219 386 219]], shape=(376, 53), dtype=int32)


In [None]:
import os
def createNewFile(file_name, folder_path,fileInput):
    # Specify the folder path and file name
    # Create the full file path
    file_path = os.path.join(folder_path, file_name)

    # Check if the folder exists, if not create it
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # Create the new file
    with open(file_path, 'w') as file:
        # Perform any necessary operations on the file
        file.write(fileInput)

    # Print a confirmation message
    print(f"New file '{ file_name }' created in '{folder_path}'.")

In [None]:
createNewFile("1.html", "Test",stript_html)
createNewFile("1.stuff_for_post_processing", "Test",stuff_for_post_processing)

New file '1.html' created in 'Test'.
New file '1.stuff_for_post_processing' created in 'Test'.


In [None]:
print(identifiers)

{'id1': 'Logo', 'id2': 'Image', 'id3': 'Form', 'id4': 'Header', 'id5': 'List', 'id6': 'Option1', 'id7': 'ListItem1', 'id8': 'Paragraph1', 'id9': 'Main', 'id10': 'InputName', 'id11': 'LinkTeams', 'id12': 'SubmitButton', 'id13': 'LinkPlayers', 'id14': 'Navigation', 'id15': 'Link', 'id16': 'Paragraph2', 'id17': 'LinkHOME', 'id18': 'Body', 'id19': 'Option3', 'id20': 'ListItem2', 'id21': 'Section1', 'id22': 'InputEmail', 'id23': 'LinkMatches', 'id24': 'ListItem3', 'id25': 'Select', 'id26': 'LinkNews', 'id27': 'Footer', 'id28': 'Option2'}


In [None]:
print(element_type)

id18 <body id4 <header id9 <main id8 <p id16 <p id27 <footer id7 <li id20 <li id24 <li id5 <ul id14 <nav id21 <section id15 <link id3 <form id1 <img id2 <img id12 <Button id10 <input id22 <input id25 <select id6 <options id28 <options id19 <options id17 <link id26 <link id23 <link id13 <link id11 <link


In [None]:
print(element_order)

id18 id4 id4 id1 id4 id14 id18 id9 id9 id21 id21 id8 id21 id16 id8 id2 id16 id15 id8 id5 id5 id7 id5 id20 id5 id24 id16 id3 id3 id10 id3 id22 id3 id12 id18 id27 id27 id25 id25 id6 id25 id28 id25 id19 id14 id17 id14 id26 id14 id23 id14 id13 id14 id11


## TODO
### 00SPACES !STOP! wegmachen
### Einrücken
### <!DOCTYPE html>  etc dran klatschen....
### Head reinmachen
### 

In [None]:
identifiers

{'id1': 'Logo',
 'id2': 'Image',
 'id3': 'Form',
 'id4': 'Header',
 'id5': 'List',
 'id6': 'Option1',
 'id7': 'ListItem1',
 'id8': 'Paragraph1',
 'id9': 'Main',
 'id10': 'InputName',
 'id11': 'LinkTeams',
 'id12': 'SubmitButton',
 'id13': 'LinkPlayers',
 'id14': 'Navigation',
 'id15': 'Link',
 'id16': 'Paragraph2',
 'id17': 'LinkHOME',
 'id18': 'Body',
 'id19': 'Option3',
 'id20': 'ListItem2',
 'id21': 'Section1',
 'id22': 'InputEmail',
 'id23': 'LinkMatches',
 'id24': 'ListItem3',
 'id25': 'Select',
 'id26': 'LinkNews',
 'id27': 'Footer',
 'id28': 'Option2'}

In [None]:
stript_html

'00SPACES <body > \n04SPACES <header id4 class = " bg - dark " > \n08SPACES <div class = " container " > \n12SPACES <img id1 > \n12SPACES <nav id14 > \n16SPACES <a id17 class = " btn btn - primary " > </a > <br > \n16SPACES <a id26 class = " btn btn - primary " > </a > <br > \n16SPACES <a id23 class = " btn btn - primary " > </a > <br > \n16SPACES <a id13 class = " btn btn - primary " > </a > <br > \n16SPACES <a id11 class = " btn btn - primary " > </a > <br > \n12SPACES </nav > \n08SPACES </div > \n04SPACES </header > \n04SPACES <main id9 class = " bg - light " > \n08SPACES <section id21 > \n12SPACES <div class = " container " > \n16SPACES <div class = " card bg - light " > \n20SPACES <div class = " card - body " > \n24SPACES <h1 class = " card - title " > </h1 > \n24SPACES <p id16 > </p > \n24SPACES <img id2 class = " img - fluid " > <br > \n24SPACES <a id15 class = " btn btn - primary " > </a > <br > \n24SPACES <ul id5 > \n28SPACES <li id7 > </li > \n28SPACES <li id20 > </li > \n28S

In [None]:
#  00SPACES !STOP! wegmachen

In [None]:
# Ziel aus stuff_for_processing mit Logo den shit dazu rausfiltern.

In [None]:
class Object:
    def __init__(self, id):
        self.id = id
        self.elements = None
        self.text = None

    def get_string(self):
        res = "id = " + "\"" + self.id + "\""
        if self.elements == None:
            return res
        return res + self.elements   


In [None]:
def get_object_dict(stuff_for_post_processing):
    object_dict = {}
    all_lines = stuff_for_post_processing.splitlines()
    for i in range(len(all_lines)):
        line = all_lines[i]
        if "object" in line:
            if ('{') in line:
                # abfangen von object objectname{ wenn nicht object name
                id = line.split('{')[0].split()[1]
                new_object = Object(id)
                #print(id)
                for i in range(i+1,len(all_lines)):
                    #print(i)
                    if '}' in all_lines[i]:
                        object_dict[id] = new_object
                        break
                    if '+' in all_lines[i]:
                        if new_object.elements == None:
                            new_object.elements = ""
                        new_object.elements = new_object.elements + " " + all_lines[i].split("+")[1]
                    if '-' in all_lines[i]:
                        new_object.text =   all_lines[i].split("-")[1]      
            else:
                id = line.split()[1]
                new_object = Object(id)
                object_dict[id] = new_object
    return object_dict
    
#object_dict = get_object_dict(stuff_for_post_processing)

#for id in object_dict:
    #print(id + "..." + object_dict[id].get_string()) 

In [None]:
#TODO hier rein bei dem plus abfangen Probleme 

In [None]:
list = [x for x in identifiers.keys()]
print(list)

['id1', 'id2', 'id3', 'id4', 'id5', 'id6', 'id7', 'id8', 'id9', 'id10', 'id11', 'id12', 'id13', 'id14', 'id15', 'id16', 'id17', 'id18', 'id19', 'id20', 'id21', 'id22', 'id23', 'id24', 'id25', 'id26', 'id27', 'id28']


In [None]:
def start_post_processing(stuff_for_post_processing, identifiers, element_type, element_order,stript_html):
    object_dict = get_object_dict(stuff_for_post_processing)
    for element in identifiers.keys():
        if (element + " " not in stript_html):
            continue
        id = identifiers[element]
        #print(element)
        fill_text = object_dict[id].get_string()
        #print(element)
        if object_dict[id].text is not None:
            cuttetElemement = stript_html.split(element + " ")
            #print(cuttetElemement[1])
            only_inside = cuttetElemement[1].split("<")#
            #print(element + " " + only_inside[0])
            #print(fill_text + " " + only_inside[0] + object_dict[id].text)
            stript_html = stript_html.replace(element + " " + only_inside[0], fill_text + " " + only_inside[0] + object_dict[id].text)
        else:
            stript_html = stript_html.replace(element + " ", fill_text + " ")
    return stript_html    

In [None]:
def remove_all_spaces_tokens(filled_HTML):
    filled_HTML = filled_HTML.replace("!STOP!","")
    filled_HTML = filled_HTML.replace("00SPACES ","")
    spaces = ""
    for i in range(10):
        filled_HTML = filled_HTML.replace("0"+ str(i) +  "SPACES ",spaces).replace(" - ", "-")
        spaces = spaces + " "    
    for i in range(10,64):
        filled_HTML = filled_HTML.replace(str(i) +  "SPACES ",spaces).replace(" - ", "-")
        spaces = spaces + " "    
    return filled_HTML

In [None]:
filled_HTML = start_post_processing(stuff_for_post_processing, identifiers, element_type, element_order,stript_html)
filled_HTML = remove_all_spaces_tokens(filled_HTML)
createNewFile("post_processed1.html", "Test",filled_HTML)