In [81]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import requests

In [112]:
fEmbeddings = open("../embeddings/glove.6B.100d.txt", encoding="utf-8")

def addCharInfo(Sentence):
    for i, data in enumerate(Sentence):
        chars = [c for c in data]
        Sentence[i] = [data, chars]
    return Sentence

def createMatrices(sentence, word2Idx, case2Idx, char2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']

    dataset = []

    wordCount = 0
    unknownWordCount = 0


    wordIndices = []
    caseIndices = []
    charIndices = []
    for word, char in sentence:
        wordCount += 1
        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            wordIdx = unknownIdx
            unknownWordCount += 1
        charIdx = []
        for x in char:
            charIdx.append(char2Idx[x])
        # Get the label and map to int
        wordIndices.append(wordIdx)
        caseIndices.append(getCasing(word, case2Idx))
        charIndices.append(charIdx)
    
    return [wordIndices, caseIndices, charIndices]

# 0-pads all words
def padding(Sentence):    
    maxlen = 52
    char = Sentence[2]
    for x in char:
        maxlen = max(maxlen, len(x))
    Sentence[0][2] = pad_sequences(Sentence[0][2], 52, padding='post')
    return Sentence

# define casing s.t. NN can use case information to learn patterns
def getCasing(word, caseLookup):
    casing = 'other'

    numDigits = 0
    for char in word:
        if char.isdigit():
            numDigits += 1

    digitFraction = numDigits / float(len(word))

    if word.isdigit():  # Is a digit
        casing = 'numeric'
    elif digitFraction > 0.5:
        casing = 'mainly_numeric'
    elif word.islower():  # All lower case
        casing = 'allLower'
    elif word.isupper():  # All upper case
        casing = 'allUpper'
    elif word[0].isupper():  # is a title, initial char upper, then all lower
        casing = 'initialUpper'
    elif numDigits > 0:
        casing = 'contains_digit'

    return caseLookup[casing]

In [113]:
class DDLT(object):
    def embed(self, Sentence):

        words = {}
        for token, char in Sentence:
            words[token.lower()] = True

        # Map token cases
        # PADDING_TOKEN: pad sentences to make them the same length
        case2Idx = {'numeric': 0, 'allLower': 1, 'allUpper': 2, 'initialUpper': 3, 'other': 4, 'mainly_numeric': 5,
                    'contains_digit': 6, 'PADDING_TOKEN': 7}
        self.caseEmbeddings = np.identity(len(case2Idx), dtype='float32')  # identity matrix used 

        word2Idx = {}
        self.wordEmbeddings = []
        # Loop through words in embeddings
        for line in fEmbeddings:
            split = line.strip().split(" ")
            word = split[0]  # embedding word entry

            if len(word2Idx) == 0:  # add padding+unknown
                word2Idx["PADDING_TOKEN"] = len(word2Idx)
                vector = np.zeros(len(split) - 1)  # zero vector for 'PADDING' word
                self.wordEmbeddings.append(vector)

                word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
                vector = np.random.uniform(-0.25, 0.25, len(split) - 1)
                self.wordEmbeddings.append(vector)

            if split[0].lower() in words:
                vector = np.array([float(num) for num in split[1:]])
                self.wordEmbeddings.append(vector)  # word embedding vector
                word2Idx[split[0]] = len(word2Idx)  # corresponding word dict

        self.wordEmbeddings = np.array(self.wordEmbeddings)

        # Create string with possible characters
        chars = " 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.,-_()[]{}!?:;#'\"/\\%$`&=*+@^~|<>"

        # Create dictionary of possible characters
        self.char2Idx = {"PADDING": 0, "UNKNOWN": 1}
        for c in chars:
            self.char2Idx[c] = len(self.char2Idx)
        
        #self.fuckFuck = padding(createMatrices(Sentence, word2Idx, case2Idx, self.char2Idx))
        self.embedded = createMatrices(Sentence, word2Idx, case2Idx, self.char2Idx)
        flat = []
        for sublist in self.embedded[2]:
            for item in sublist:
                flat.append(item)
        self.embedded[2] = flat

In [114]:
text = 'I am from London'  # Python 3
text = text.split()
text = addCharInfo(text)

text

[['I', ['I']],
 ['am', ['a', 'm']],
 ['from', ['f', 'r', 'o', 'm']],
 ['London', ['L', 'o', 'n', 'd', 'o', 'n']]]

In [115]:
ddlt = DDLT()

In [116]:
ddlt.embed(text)

In [132]:
words = ddlt.embedded[0]
while(len(words) < 52):
    words.append(0)
casings = ddlt.embedded[1]
chars = ddlt.embedded[2]
while(len(chars) < 52):
    chars.append(0)

In [133]:
print(str(chars))

[47, 13, 25, 18, 30, 27, 25, 50, 27, 26, 16, 27, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [134]:
url = 'http://10.0.75.2:8501/v1/models/my_model:predict'
data = '''{{
    "instances": [ 
        {{ 
            "words" : {words},
            "casing" : {casings},
            "characters" : {chars} 
        }} 
    ] 
}}'''.format(words=str(words), casings=str(casings), chars=str(chars))
response = requests.post(url, data=data)
#print(response)
#print(response.text)

In [135]:
response.text

'{ "error": "Input to reshape is a tensor with 1560 values, but the requested shape requires a multiple of 81120\\n\\t [[{{node Character_embedding_21/Reshape_1}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _output_shapes=[[?,?,52,30]], _device=\\"/job:localhost/replica:0/task:0/device:CPU:0\\"](Character_embedding_21/embedding_lookup, Character_embedding_21/Reshape_1/shape)]]" }'

In [136]:
data

'{\n    "instances": [ \n        { \n            "words" : [3, 5, 2, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n            "casing" : [2, 1, 1, 3],\n            "characters" : [47, 13, 25, 18, 30, 27, 25, 50, 27, 26, 16, 27, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] \n        } \n    ] \n}'