In [2]:
import pandas
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf
import matplotlib.pyplot as plt

In [3]:
def list_2D(r, c):
    """Return a numpy array built from r rows, each holding c integer zeros.

    Every row is a separate list before conversion, so the rows of the
    result do not alias each other.
    """
    return np.array([[0] * c for _ in range(r)])

In [4]:
#Convert labels from a list of names into one hot encodings
#Each label is terminated by a single 'null' character ('@'); rows after it stay all zero
def labelsToOneHotEncodings(arr):
    """One-hot encode each label, terminated by a single null ('@') row.

    Column layout of every row:
        0-25   'a'-'z'
        26-51  'A'-'Z'
        52-61  '0'-'9'
        62 ' ', 63 ',', 64 '-', 65 "'", 66 '/'
        67     any other character ("garbage")
        68     '@', the null/terminator character

    Rows after the terminator are left all zero. A label that does not fit
    in MAX_NUM_CHARS rows together with its terminator is truncated to its
    first MAX_NUM_CHARS characters (the terminator is lost) and a
    UserWarning is issued.

    Parameters
    ----------
    arr : iterable of str
        Labels to encode.

    Returns
    -------
    tuple
        (encodings, MAX_NUM_CHARS, SPECIAL_CHARS), where encodings is
        np.array over the per-label (MAX_NUM_CHARS, SPECIAL_CHARS) matrices.
    """
    import warnings  # local import: only used for the truncation notice

    MAX_NUM_CHARS = 32 #Maximum number of characters a label can be
    SPECIAL_CHARS = 69 #Number of columns in the one hot encoding
    GARBAGE_INDEX = 67 #Column shared by every unrecognised character
    NULL_CHAR = "@"    #Terminator appended to each label

    #Position in this string == column index of the character
    symbols = ("abcdefghijklmnopqrstuvwxyz"
               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "0123456789"
               " ,-'/")
    column_of = {symbol: column for column, symbol in enumerate(symbols)}
    column_of[NULL_CHAR] = SPECIAL_CHARS - 1

    #Array to contain each label's one hot encoding, 2-D arrays
    encodings = []

    for name in arr:
        label = name + NULL_CHAR

        #Only the row index can overflow (columns are always 0..68), so
        #handle that case explicitly instead of swallowing an IndexError
        if len(label) > MAX_NUM_CHARS:
            warnings.warn(
                "label %r does not fit in %d characters with its terminator; truncating"
                % (name, MAX_NUM_CHARS),
                stacklevel=2,
            )
            label = label[:MAX_NUM_CHARS]

        labelMat = np.zeros((MAX_NUM_CHARS, SPECIAL_CHARS), dtype=int)
        for row, character in enumerate(label):
            labelMat[row][column_of.get(character, GARBAGE_INDEX)] = 1

        #Add the one hot encoded matrix of the label into the list of labels' one hot encodings
        encodings.append(labelMat)

    #Return the encoded labels, the max number of characters, and the number of columns
    return np.array(encodings), MAX_NUM_CHARS, SPECIAL_CHARS

In [5]:
#Convert labels from a list of names into one hot encodings
#Every row past the end of the label is set to the 'null' one hot encoding
def labelsToOneHotEncodings_2(arr):
    """One-hot encode each label and pad the unused rows with the null column.

    Column layout of every row:
        0-25   'a'-'z'
        26-51  'A'-'Z'
        52-61  '0'-'9'
        62 ' ', 63 ',', 64 '-', 65 "'", 66 '/'
        67     any other character ("garbage")
        68     '@', the null character (also used for padding)

    Rows from len(label) up to MAX_NUM_CHARS get the null column set, so
    every row of the result has exactly one 1. A label longer than
    MAX_NUM_CHARS is truncated to its first MAX_NUM_CHARS characters and a
    UserWarning is issued.

    Parameters
    ----------
    arr : iterable of str
        Labels to encode.

    Returns
    -------
    tuple
        (encodings, MAX_NUM_CHARS, SPECIAL_CHARS), where encodings is
        np.array over the per-label (MAX_NUM_CHARS, SPECIAL_CHARS) matrices.
    """
    import warnings  # local import: only used for the truncation notice

    MAX_NUM_CHARS = 32 #Maximum number of characters a label can be
    SPECIAL_CHARS = 69 #Number of columns in the one hot encoding
    GARBAGE_INDEX = 67 #Column shared by every unrecognised character
    NULL_INDEX = SPECIAL_CHARS - 1 #Column of the '@' null character

    #Position in this string == column index of the character
    symbols = ("abcdefghijklmnopqrstuvwxyz"
               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "0123456789"
               " ,-'/")
    column_of = {symbol: column for column, symbol in enumerate(symbols)}
    column_of["@"] = NULL_INDEX

    #Array to contain each label's one hot encoding, 2-D arrays
    encodings = []

    for label in arr:
        #Only the row index can overflow (columns are always 0..68), so
        #handle that case explicitly instead of swallowing an IndexError
        if len(label) > MAX_NUM_CHARS:
            warnings.warn(
                "label %r is longer than %d characters; truncating"
                % (label, MAX_NUM_CHARS),
                stacklevel=2,
            )
            label = label[:MAX_NUM_CHARS]

        labelMat = np.zeros((MAX_NUM_CHARS, SPECIAL_CHARS), dtype=int)
        for row, character in enumerate(label):
            labelMat[row][column_of.get(character, GARBAGE_INDEX)] = 1

        #Fill the rows after the last character with the 'null' one hot encoding
        labelMat[len(label):, NULL_INDEX] = 1

        #Add the one hot encoded matrix of the label into the list of labels' one hot encodings
        encodings.append(labelMat)

    #Return the encoded labels, the max number of characters, and the number of columns
    return np.array(encodings), MAX_NUM_CHARS, SPECIAL_CHARS

In [6]:
def linearsearch(arr, x):
    """Return the first index i with arr[i] == x, or -1 when there is none."""
    hits = (position for position in range(len(arr)) if arr[position] == x)
    return next(hits, -1)

In [7]:
def oneHotEncodingToWord(encoding):
    """Decode a sequence of one-hot rows back into a string.

    The column of each row is taken with np.argmax. Columns 0-25 decode to
    'a'-'z', 26-51 to 'A'-'Z', 52-61 to '0'-'9', and 62-66 to ' ', ',',
    '-', "'" and '/'. Decoding stops at the first row whose column is 67
    (garbage) or 68 (null); any column above 68 contributes nothing.
    """
    #Position in this string == column index of the character
    symbols = ("abcdefghijklmnopqrstuvwxyz"
               "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "0123456789"
               " ,-'/")

    decoded = []
    for row in range(len(encoding)):
        column = np.argmax(encoding[row], -1)

        #Garbage or null marks the end of the word
        if column == 67 or column == 68:
            break

        #Columns outside the table are silently skipped
        if 0 <= column < len(symbols):
            decoded.append(symbols[column])

    return "".join(decoded)

In [8]:
# Sample labels used to exercise the encoders and the decoder
x = [
    "Magical Hacker",
    "Asmoranomardicadaistinaculdacar",
    "Jar Jar Binks",
    "Darth Vader",
    "Struggle // Survive",
    "abcdefghijklmnopqrstuvwxyz",
    "ABCDEF123456789",
]

In [9]:
# Encode the sample labels with both schemes: test1 uses a single '@' terminator row,
# test2 pads every unused row with the null column. rows/cols are assigned by both calls.
test1, rows, cols = labelsToOneHotEncodings(x)
test2, rows, cols = labelsToOneHotEncodings_2(x)

In [10]:
# Round-trip check: decode the null-padded encoding of the fifth sample label
oneHotEncodingToWord(test2[4])

'Struggle // Survive'

In [11]:
# Original fifth sample label, for comparison with the decoded string
x[4]

'Struggle // Survive'

In [12]:
# Inspect the null-padded encoding: the trailing rows have the last (null) column set
test2[4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [14]:
# Inspect the single-terminator encoding: the trailing rows are left all zero
test1[4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])