# Imports

In [1]:
import pandas as pd
import numpy as np
import sklearn

import tensorflow_hub as hub
import tensorflow as tf
import bert
import math

from tensorflow.keras.models import Model
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from bert.tokenization import bert_tokenization

FullTokenizer = bert_tokenization.FullTokenizer

In [2]:
np.set_printoptions(threshold=np.inf)

In [3]:
sampleDF = pd.read_csv('/Your/Path/to/hd2021.csv', encoding='unicode_escape')

# BERT Setup

In [51]:
#Model Setup
max_seq_length = 256  # Your choice here.

input_word_ids = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "input_word_ids")
input_mask = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "input_mask")
segment_ids = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "segment_ids")

bert_layer = hub.KerasLayer("/Your/BERT/Model/Path/Here", trainable = True)

outputs = bert_layer({'input_mask': input_mask, 'input_type_ids': segment_ids, 'input_word_ids': input_word_ids})
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = [pooled_output, sequence_output])

In [5]:
#Manual processing of tokens
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [6]:
#Create tokenizer that will create tokens
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [9]:
def documentEmbed(d):
    dtokens = tokenizer.tokenize(d)
    dtokens = ["[CLS]"] + dtokens + ["[SEP]"]

    input_ids_d = get_ids(dtokens, tokenizer, max_seq_length)
    input_masks_d = get_masks(dtokens, max_seq_length)
    input_segments_d = get_segments(dtokens, max_seq_length)

    input_ids_d=np.array(input_ids_d)
    input_masks_d = np.array(input_masks_d)
    input_segments_d = np.array(input_segments_d)

    pool_embs_d, all_embs_d = model.predict([input_ids_d.reshape(1,max_seq_length), np.zeros((1,max_seq_length)), np.zeros((1,max_seq_length))])

    return pool_embs_d, all_embs_d

In [10]:
#Example
s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]

input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

input_ids=np.array(input_ids)
input_masks = np.array(input_masks)
input_segments = np.array(input_segments)
pool_embs, all_embs = model.predict([input_ids.reshape(1,max_seq_length),input_masks.reshape(1,max_seq_length),input_segments.reshape(1,max_seq_length)])



# hd2021 Setup

In [11]:
#Dropping numeric Columns
sampleDF = sampleDF[['INSTNM', 'IALIAS', 'ADDR', 'CITY', 'STABBR', 'CHFNM', 'CHFTITLE', 'COUNTYNM']]
sampleDF = sampleDF.fillna("")

In [12]:
#Removing special characters
alphabetFilterString = ' abcdefghijklmnopqrstuvwxyz'
sampleDF = sampleDF.applymap(lambda x: ''.join([y for y in x.lower() if y in alphabetFilterString]))

# Embedding Generation

In [13]:
#Convert column segments of length 8 into sentence embeddings + labels
x = np.empty((0,768))

y = np.zeros((8,1000))
y[0] = 1
for num in range(1000):
    nextX = sampleDF.INSTNM.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
y = np.concatenate((y,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = ADDR.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.CITY.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.STABBR.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFTITLE.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = COUNTYNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


In [40]:
xWord = np.empty((0,768))

yWord = np.zeros((8,1000))
yWord[0] = 1
sampleDF.INSTNM = sampleDF.INSTNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.INSTNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.ADDR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CITY = sampleDF.CITY.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CITY.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.STABBR = sampleDF.STABBR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.STABBR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFTITLE.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.COUNTYNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


# Training Model

### Context Window

In [33]:
input_shape = x.shape

model= models.Sequential()
model.add(layers.Dense(1000, input_shape = input_shape[1:], activation='relu'))
model.add(layers.Dense(1000, activation='relu'))
model.add(layers.Dense(8, activation = 'softmax'))

In [34]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y.transpose(), test_size=0.45, random_state=42)

In [35]:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
model.fit(xTrain, yTrain, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.src.callbacks.History at 0x285159d50>

In [37]:
test_loss, test_acc = model.evaluate(xTest, yTest, verbose=2)
predictionSeries = np.argmax(model.predict(xTest), axis=-1)
print(predictionSeries.shape)

113/113 - 0s - loss: 0.0424 - accuracy: 0.9361 - 153ms/epoch - 1ms/step
(3600,)


### Single Word

In [52]:
input_shape = xWord.shape

modelWord= models.Sequential()
modelWord.add(layers.Dense(1000, input_shape = input_shape[1:], activation='relu'))
modelWord.add(layers.Dense(1000, activation='relu'))
modelWord.add(layers.Dense(8, activation = 'softmax'))

In [53]:
xTrainWord, xTestWord, yTrainWord, yTestWord = train_test_split(xWord, yWord.transpose(), test_size=0.45, random_state=42)

In [54]:
modelWord.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelWord.fit(xTrainWord, yTrainWord, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3e15ca610>