# Imports

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn

import tensorflow_hub as hub
import math
import bert

from tensorflow.keras.models import Model
from tensorflow.keras import datasets, layers, models
from sklearn.model_selection import train_test_split
from bert.tokenization import bert_tokenization

FullTokenizer = bert_tokenization.FullTokenizer 

In [2]:
np.set_printoptions(threshold=np.inf)

In [3]:
#Replace with personal paths
sampleDF = pd.read_csv('/Your/Path/to/hd2021.csv', encoding='unicode_escape')
sampleDF2 = pd.read_csv('/Your/Path/to/CCD2004.csv', encoding='unicode_escape', dtype={'LATCOD': str, 'LONCOD': str, 'ULOCALE':str})
sampleDF3 = pd.read_csv('/Your/Path/to/CCD2005.csv', encoding='unicode_escape', dtype={'LATCOD': str, 'LONCOD': str, 'ULOCALE':str})

# BERT Setup

In [4]:
#Model Setup
max_seq_length = 256  # Your choice here.

input_word_ids = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "input_word_ids")
input_mask = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "input_mask")
segment_ids = tf.keras.layers.Input(shape = (max_seq_length,), dtype = tf.int32, name = "segment_ids")

#Replace with own path
bert_layer = hub.KerasLayer("/Your/BERT/Model/Path/Here", trainable = True)

outputs = bert_layer({'input_mask': input_mask, 'input_type_ids': segment_ids, 'input_word_ids': input_word_ids})
pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = [pooled_output, sequence_output])

In [5]:
#Manual processing of tokens
def get_masks(tokens, max_seq_length):
    """Mask for padding"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [6]:
#Create tokenizer that will create tokens
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [7]:
#Example
s = "This is a nice sentence."
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]

input_ids = get_ids(stokens, tokenizer, max_seq_length)
input_masks = get_masks(stokens, max_seq_length)
input_segments = get_segments(stokens, max_seq_length)

input_ids=np.array(input_ids)
input_masks = np.array(input_masks)
input_segments = np.array(input_segments)

In [8]:
pool_embs, all_embs = model.predict([input_ids.reshape(1,max_seq_length),input_masks.reshape(1,max_seq_length),input_segments.reshape(1,max_seq_length)])
print(pool_embs.shape)
print(all_embs.shape)

(1, 768)
(1, 256, 768)


# hd2021 Setup

In [9]:
#Dropping numeric Columns
sampleDF = sampleDF[['INSTNM', 'IALIAS', 'ADDR', 'CITY', 'STABBR', 'CHFNM', 'CHFTITLE', 'COUNTYNM']]
sampleDF = sampleDF.fillna("")

In [10]:
#Removing special characters
alphabetFilterString = ' abcdefghijklmnopqrstuvwxyz'
sampleDF = sampleDF.applymap(lambda x: ''.join([y for y in x.lower() if y in alphabetFilterString]))

In [11]:
def documentEmbed(d):
    dtokens = tokenizer.tokenize(d)
    dtokens = ["[CLS]"] + dtokens + ["[SEP]"]

    input_ids_d = get_ids(dtokens, tokenizer, max_seq_length)
    input_masks_d = get_masks(dtokens, max_seq_length)
    input_segments_d = get_segments(dtokens, max_seq_length)

    input_ids_d=np.array(input_ids_d)
    input_masks_d = np.array(input_masks_d)
    input_segments_d = np.array(input_segments_d)

    pool_embs_d, all_embs_d = model.predict([input_ids_d.reshape(1,max_seq_length),input_masks_d.reshape(1,max_seq_length),input_segments_d.reshape(1,max_seq_length)])
    return pool_embs_d, all_embs_d

# Using BERT and Sample of Column to Predict Column

### Sequence Embedding Model (Variable: x, y)

In [12]:
#Convert column segments of length 8 into sentence embeddings + labels
x = np.empty((0,768))

y = np.zeros((8,1000))
y[0] = 1
for num in range(1000):
    nextX = sampleDF.INSTNM.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
y = np.concatenate((y,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = ADDR.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.CITY.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.STABBR.sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFTITLE.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
y = np.concatenate((y,yPlaceholder), axis=1)
for num in range(1000):
    nextX = COUNTYNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    x = np.append(x, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


In [47]:
input_shape = x.shape

model= models.Sequential()
model.add(layers.Dense(1000, input_shape = input_shape[1:], activation='relu'))
model.add(layers.Dense(1000, activation='relu'))
model.add(layers.Dense(8, activation = 'softmax'))

In [14]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y.transpose(), test_size=0.55, random_state=42)

In [48]:
model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
model.fit(xTrain, yTrain, epochs=9)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.src.callbacks.History at 0x446f244d0>

In [16]:
test_loss, test_acc = model.evaluate(xTest, yTest, verbose=2)
predictionSeries = np.argmax(model.predict(xTest), axis=-1)
print(predictionSeries)
print(predictionSeries.shape)

138/138 - 0s - loss: 0.0166 - accuracy: 0.9764 - 226ms/epoch - 2ms/step
[2 2 1 3 4 2 1 1 4 3 1 2 2 4 4 4 7 4 2 2 3 5 6 7 0 3 7 3 0 3 0 7 6 1 6 0 5
 4 1 1 2 2 5 2 3 5 1 6 4 6 2 4 4 6 1 7 2 6 7 0 5 2 0 2 4 7 6 5 1 3 3 0 3 4
 7 1 1 5 2 3 6 3 0 2 0 6 5 4 2 0 3 4 0 6 1 0 1 6 6 2 1 1 2 7 6 4 3 1 4 1 1
 5 2 0 4 4 7 1 0 5 2 6 6 1 5 0 1 7 4 4 5 0 4 6 3 5 1 6 3 6 6 6 2 7 1 2 0 2
 6 3 2 3 3 3 3 2 6 6 0 0 6 7 0 1 4 0 5 5 7 5 4 1 0 6 1 6 7 4 2 5 5 5 0 7 4
 5 3 1 6 1 0 3 0 0 3 2 4 1 3 7 4 1 5 0 5 3 4 7 7 0 2 3 6 3 0 0 4 3 0 0 1 1
 4 7 6 1 7 6 1 5 0 7 2 0 5 4 3 0 4 1 7 5 0 7 1 1 2 3 7 4 0 3 6 2 0 6 2 0 7
 7 2 2 2 0 7 2 7 2 0 6 7 5 2 2 5 3 3 0 6 2 2 0 6 1 7 1 4 6 1 0 7 2 1 7 4 4
 3 6 0 1 1 0 2 2 3 6 0 6 0 0 3 4 3 7 0 3 5 2 0 1 1 1 7 3 6 2 7 6 7 7 7 1 6
 4 5 1 2 1 5 7 6 3 2 4 6 4 1 7 7 5 4 4 7 0 5 2 4 6 6 7 5 3 4 7 2 1 0 4 4 7
 4 4 3 6 5 5 0 2 0 7 1 1 5 1 6 7 6 5 4 1 6 4 5 0 2 6 1 6 7 3 0 0 0 7 1 0 6
 2 6 0 2 0 7 6 1 7 4 2 3 7 0 4 1 1 7 3 0 6 2 7 0 5 5 5 0 6 6 2 3 5 3 6 7 6
 3 2 2 1 6 7 7 2 3 2 3 7 6 7

### Adding More Classes from CCD Data (Variable: xNew, yNew)

In [17]:
#Dropping numeric Columns
sampleDF2 = sampleDF2[['LEANM04', 'SCHNAM04', 'LCITY04', 'LSTATE04']]
sampleDF = sampleDF.fillna("")

In [18]:
#Removing special characters
alphabetFilterString = ' abcdefghijklmnopqrstuvwxyz'
sampleDF2 = sampleDF2.applymap(lambda x: ''.join([y for y in x.lower() if y in alphabetFilterString]))

In [28]:
yNew = np.concatenate((y,np.zeros((2,8000))), axis=0)
xNew = x

In [29]:
print(yNew.shape)

(10, 8000)


In [30]:
yPlaceholder= np.zeros((10,1000))
yPlaceholder[8]=1
yNew = np.concatenate((yNew,yPlaceholder), axis=1)
sampleDF2.LEANM04 = sampleDF2.LEANM04.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF2.LEANM04.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xNew = np.append(xNew, nextX, axis=0)
print("LEANM04 DONE")

yPlaceholder= np.zeros((10,1000))
yPlaceholder[9]=1
yNew = np.concatenate((yNew,yPlaceholder), axis=1)
sampleDF2.SCHNAM04 = sampleDF2.SCHNAM04.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF2.SCHNAM04.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xNew = np.append(xNew, nextX, axis=0)
print("SCHNAM04 DONE")

yPlaceholder= np.zeros((10,1000))
yPlaceholder[3]=1
yNew = np.concatenate((yNew,yPlaceholder), axis=1)
sampleDF2.LCITY04 = sampleDF2.LCITY04.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF2.LCITY04.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xNew = np.append(xNew, nextX, axis=0)
print("LCITY04 DONE")

yPlaceholder= np.zeros((10,1000))
yPlaceholder[4]=1
yNew = np.concatenate((yNew,yPlaceholder), axis=1)
sampleDF2.LSTATE04 = sampleDF2.LSTATE04.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF2.LSTATE04.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xNew = np.append(xNew, nextX, axis=0)
print("LSTATE04 DONE")







LEANM04 DONE






SCHNAM04 DONE






LCITY04 DONE






LSTATE04 DONE


In [31]:
print(xNew.shape)
print(yNew.shape)

(12000, 768)
(10, 12000)


In [32]:
input_shape_new = xNew.shape

modelNew = models.Sequential()
modelNew.add(layers.Dense(1000, input_shape = input_shape_new[1:], activation='relu'))
modelNew.add(layers.Dense(10, activation = 'softmax'))

In [33]:
xTrainNew, xTestNew, yTrainNew, yTestNew = train_test_split(xNew, yNew.transpose(), test_size=0.75, random_state=42)

In [34]:
modelNew.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelNew.fit(xTrainNew, yTrainNew, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x295a05210>

In [35]:
test_loss, test_acc = modelNew.evaluate(xTestNew, yTestNew, verbose=2)
predictionSeries = np.argmax(modelNew.predict(xTestNew), axis=-1)
#print(predictionSeries)
print(predictionSeries.shape)

282/282 - 0s - loss: 0.0242 - accuracy: 0.9698 - 244ms/epoch - 864us/step
(9000,)


### Word Embeddings Model (Variable: xSummed, ySummed)

In [12]:
xSummed = np.empty((0,768))

ySummed = np.zeros((8,1000))
ySummed[0] = 1
sampleDF.INSTNM = sampleDF.INSTNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.INSTNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.ADDR.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.CITY = sampleDF.CITY.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CITY.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.STABBR = sampleDF.STABBR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.STABBR.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFTITLE.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
ySummed = np.concatenate((ySummed,yPlaceholder), axis=1)
sampleDF.COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.COUNTYNM.dropna().sample(n=8, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xSummed = np.append(xSummed, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


In [13]:
print(xSummed.shape)
print(ySummed.shape)

(8000, 768)
(8, 8000)


In [21]:
input_shape_summed = xSummed.shape

modelSummed = models.Sequential()
modelSummed.add(layers.Dense(1000, input_shape = input_shape_summed[1:], activation='relu'))
modelSummed.add(layers.Dense(8, activation = 'softmax'))

In [22]:
xTrainSummed, xTestSummed, yTrainSummed, yTestSummed = train_test_split(xSummed, ySummed.transpose(), test_size=0.75, random_state=42)

In [24]:
modelSummed.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelSummed.fit(xTrainSummed, yTrainSummed, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2e65bd890>

In [25]:
test_loss, test_acc = modelSummed.evaluate(xTestSummed, yTestSummed, verbose=2)
predictionSeries = np.argmax(modelSummed.predict(xTestSummed), axis=-1)
#print(predictionSeries)
print(predictionSeries.shape)

188/188 - 0s - loss: 0.2424 - accuracy: 0.9868 - 147ms/epoch - 782us/step
(6000,)


# Using BERT and Singular Word to Predict Column

### Word Embeddings Model (Variable: xWord, yWord)

In [37]:
xWord = np.empty((0,768))

yWord = np.zeros((8,1000))
yWord[0] = 1
sampleDF.INSTNM = sampleDF.INSTNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.INSTNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.ADDR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CITY = sampleDF.CITY.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CITY.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.STABBR = sampleDF.STABBR.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.STABBR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.CHFTITLE.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.COUNTYNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


In [38]:
print(xWord.shape)
print(yWord.shape)

(8000, 768)
(8, 8000)


In [39]:
input_shape_word = xWord.shape

modelWord = models.Sequential()
modelWord.add(layers.Dense(1000, input_shape = input_shape_word[1:], activation='relu'))
#modelWord.add(layers.Dense(2000, activation='relu'))
modelWord.add(layers.Dense(8, activation = 'softmax'))

In [40]:
xTrainWord, xTestWord, yTrainWord, yTestWord = train_test_split(xWord, yWord.transpose(), test_size=0.1, random_state=42)

In [41]:
modelWord.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelWord.fit(xTrainWord, yTrainWord, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x293da4cd0>

In [42]:
test_loss, test_acc = modelWord.evaluate(xTestWord, yTestWord, verbose=2)
predictionSeries = np.argmax(modelWord.predict(xTestWord), axis=-1)
#print(predictionSeries)
print(predictionSeries.shape)

25/25 - 0s - loss: 0.1359 - accuracy: 0.9187 - 73ms/epoch - 3ms/step
(800,)


In [49]:
test_loss, test_acc = model.evaluate(xTestWord, yTestWord, verbose=2)
predictionSeries = np.argmax(model.predict(xTestWord), axis=-1)
print(predictionSeries)
print(predictionSeries.shape)

25/25 - 0s - loss: 179.2371 - accuracy: 0.1050 - 90ms/epoch - 4ms/step
[3 3 3 3 3 3 7 6 2 3 3 3 3 7 3 3 3 3 3 7 3 3 7 3 3 3 3 3 3 3 3 3 7 3 7 3 3
 3 3 3 3 7 3 7 3 3 3 7 3 7 3 3 3 3 2 3 3 3 3 3 3 3 7 7 3 3 7 3 3 3 2 3 3 3
 3 1 3 3 3 4 7 3 3 3 7 7 3 3 3 3 7 3 3 3 3 3 3 7 7 3 1 3 3 3 7 3 3 3 3 3 7
 7 3 3 3 3 3 7 3 3 7 7 7 7 3 2 3 3 3 3 7 3 3 7 3 7 3 7 3 7 3 7 7 3 2 3 3 3
 7 3 3 3 3 3 3 3 7 3 7 3 7 3 7 3 2 2 2 3 3 7 3 3 3 7 3 7 2 3 3 3 3 3 3 3 3
 3 3 3 1 3 3 3 2 3 3 3 3 3 3 3 3 3 7 3 3 3 3 3 3 3 7 3 7 3 7 3 3 2 3 3 3 7
 7 3 7 3 3 7 3 3 7 3 3 3 4 3 7 3 7 3 3 3 3 3 3 3 3 3 4 7 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 7 3 3 3 3 3 3 3 3 7 3 2 3 3 7 7 3 7 3 7 1 3 3 7 3 3 3 3
 3 3 3 3 7 3 3 7 7 7 3 7 3 3 7 3 3 3 3 7 3 3 3 3 7 3 3 3 7 7 3 7 3 3 3 3 3
 3 3 3 7 7 3 3 7 3 7 3 7 3 3 3 3 7 7 3 3 1 3 3 3 7 3 3 7 3 3 3 3 3 7 2 3 3
 3 3 3 7 3 3 3 7 3 3 3 3 3 3 7 3 7 3 3 0 3 3 3 3 3 7 3 7 3 3 3 2 3 2 3 3 3
 7 7 3 7 7 3 3 2 3 3 7 3 3 3 7 3 3 3 3 3 7 3 3 3 7 3 3 3 7 3 7 3 3 3 7 3 3
 7 3 7 3 7 3 3 3 3 3 3 3 7 3 

### Sequence Embedding Model (Variable: xWordSeq, yWordSeq)

In [26]:
#Convert column segments of length 8 into sentence embeddings + labels
xWordSeq = np.empty((0,768))

yWordSeq = np.zeros((8,1000))
yWordSeq[0] = 1
for num in range(1000):
    nextX = sampleDF.INSTNM.sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[1]=1
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(1000):
    nextX = sampleDF.IALIAS.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("IALIAS DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[2]=1
ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = ADDR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[3]=1
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.CITY.sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("CITY DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[4]=1
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = sampleDF.STABBR.sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[5]=1
CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("CHFNM DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[6]=1
CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = CHFTITLE.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,1000))
yPlaceholder[7]=1
COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
yWordSeq = np.concatenate((yWordSeq,yPlaceholder), axis=1)
for num in range(1000):
    nextX = COUNTYNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWordSeq = np.append(xWordSeq, nextX, axis=0)
print("COUNTYNM DONE")







INSTNM DONE






IALIAS DONE






ADDR DONE






CITY DONE








STABBR DONE






CHFNM DONE






CHFTITLE DONE






COUNTYNM DONE


In [27]:
print(xWordSeq.shape)
print(yWordSeq.shape)

(8000, 768)
(8, 8000)


In [29]:
input_shape_wordseq = xWordSeq.shape

modelWordSeq = models.Sequential()
modelWordSeq.add(layers.Dense(1000, input_shape = input_shape_wordseq[1:], activation='relu'))
#modelWord.add(layers.Dense(1000, activation='relu'))
modelWordSeq.add(layers.Dense(8, activation = 'softmax'))

In [30]:
xTrainWordSeq, xTestWordSeq, yTrainWordSeq, yTestWordSeq = train_test_split(xWordSeq, yWordSeq.transpose(), test_size=0.1, random_state=42)

In [31]:
modelWordSeq.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelWordSeq.fit(xTrainWordSeq, yTrainWordSeq, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x3445ab050>

### More Power (Variable: xWord, yWord)

In [36]:
xWord = np.empty((0,768))

yWord = np.zeros((8,2000))
yWord[0] = 1
for num in range(2000):
    nextX = sampleDF.INSTNM.sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWord = np.append(xWord, nextX, axis=0)
print("INSTNM DONE")

yPlaceholder= np.zeros((8,2000))
yPlaceholder[1]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.IALIAS = sampleDF.IALIAS.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.IALIAS.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[0]
    xWord = np.append(xWord, nextX, axis=0)
print("IALIAS DONE")













INSTNM DONE












IALIAS DONE


In [37]:
yPlaceholder= np.zeros((8,2000))
yPlaceholder[2]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.ADDR = sampleDF.ADDR.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.ADDR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("ADDR DONE")

yPlaceholder= np.zeros((8,2000))
yPlaceholder[3]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CITY = sampleDF.CITY.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.CITY.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CITY DONE")













ADDR DONE












CITY DONE


In [38]:
yPlaceholder= np.zeros((8,2000))
yPlaceholder[4]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.STABBR = sampleDF.STABBR.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.STABBR.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("STABBR DONE")

yPlaceholder= np.zeros((8,2000))
yPlaceholder[5]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFNM = sampleDF.CHFNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.CHFNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFNM DONE")













STABBR DONE












CHFNM DONE


In [39]:
yPlaceholder= np.zeros((8,2000))
yPlaceholder[6]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.CHFTITLE = sampleDF.CHFTITLE.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.CHFTITLE.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("CHFTITLE DONE")

yPlaceholder= np.zeros((8,2000))
yPlaceholder[7]=1
yWord = np.concatenate((yWord,yPlaceholder), axis=1)
sampleDF.COUNTYNM = sampleDF.COUNTYNM.replace(r'^\s*$', np.nan, regex=True)
for num in range(2000):
    nextX = sampleDF.COUNTYNM.dropna().sample(n=1, random_state = num)
    nextX = documentEmbed(' '.join(nextX))[1]
    nextX = np.sum(nextX, axis=1)
    xWord = np.append(xWord, nextX, axis=0)
print("COUNTYNM DONE")













CHFTITLE DONE












COUNTYNM DONE


In [40]:
print(xWord.shape)
print(yWord.shape)

(16000, 768)
(8, 16000)


In [41]:
input_shape_word = xWord.shape

modelWord = models.Sequential()
modelWord.add(layers.Dense(1000, input_shape = input_shape_word[1:], activation='relu'))
#modelWord.add(layers.Dense(1000, activation='relu'))
modelWord.add(layers.Dense(8, activation = 'softmax'))

In [42]:
xTrainWord, xTestWord, yTrainWord, yTestWord = train_test_split(xWord, yWord.transpose(), test_size=0.1, random_state=42)

In [43]:
modelWord.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])
modelWord.fit(xTrainWord, yTrainWord, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x43f9b6610>

In [44]:
test_loss, test_acc = modelWord.evaluate(xTestWord, yTestWord, verbose=2)
predictionSeries = np.argmax(modelWord.predict(xTestWord), axis=-1)
#print(predictionSeries)
print(predictionSeries.shape)

50/50 - 0s - loss: 0.0938 - accuracy: 0.9300 - 68ms/epoch - 1ms/step
(1600,)


# Single-Word Model Evaluation

### INSTNM

In [97]:
INSTNMEmbeds = sampleDF.INSTNM.dropna().apply(lambda x: documentEmbed(x))











































In [98]:
INSTNMEmbeds.apply(pd.Series)
INSTNMEmbeds = INSTNMEmbeds.apply(lambda x: np.sum(x[1], axis=1))
INSTNMEmbeds = np.concatenate(INSTNMEmbeds.tolist(), axis=0)

0       ([[-0.78147554, -0.3108122, -0.1281887, 0.6285...
1       ([[-0.8604626, -0.32166138, -0.49984777, 0.658...
2       ([[-0.8869562, -0.3388467, 0.0066298125, 0.707...
3       ([[-0.95116925, -0.49766952, -0.7495503, 0.814...
4       ([[-0.8711352, -0.28046894, 0.13559116, 0.6418...
                              ...                        
6284    ([[-0.8020656, -0.24700965, 0.370333, 0.597179...
6285    ([[-0.88376975, -0.3825523, -0.76656204, 0.823...
6286    ([[-0.88436836, -0.19147411, 0.041495122, 0.70...
6287    ([[-0.8782817, -0.58265305, -0.93047756, 0.805...
6288    ([[-0.96717733, -0.45804062, -0.84830433, 0.90...
Name: INSTNM, Length: 6289, dtype: object


In [107]:
INSTNMLabels = np.zeros((8,6289))
INSTNMLabels[0] = 1

test_loss, test_acc = modelWord.evaluate(INSTNMEmbeds, INSTNMLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(INSTNMEmbeds), axis=-1)

197/197 - 0s - loss: 0.2160 - accuracy: 0.8082 - 349ms/epoch - 2ms/step


### IALIAS

In [108]:
IALIASEmbeds = sampleDF.IALIAS.dropna().apply(lambda x: documentEmbed(x))

















In [111]:
IALIASEmbeds.apply(pd.Series)
IALIASEmbeds = IALIASEmbeds.apply(lambda x: np.sum(x[1], axis=1))
IALIASEmbeds = np.concatenate(IALIASEmbeds.tolist(), axis=0)

In [113]:
IALIASLabels = np.zeros((8,2204))
IALIASLabels[1] = 1

test_loss, test_acc = modelWord.evaluate(IALIASEmbeds, IALIASLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(IALIASEmbeds), axis=-1)

69/69 - 0s - loss: 0.4372 - accuracy: 0.7573 - 90ms/epoch - 1ms/step


### ADDR

In [114]:
ADDREmbeds = sampleDF.ADDR.dropna().apply(lambda x: documentEmbed(x))











































In [115]:
ADDREmbeds.apply(pd.Series)
ADDREmbeds = ADDREmbeds.apply(lambda x: np.sum(x[1], axis=1))
ADDREmbeds = np.concatenate(ADDREmbeds.tolist(), axis=0)

In [117]:
ADDRLabels = np.zeros((8,6278))
ADDRLabels[2] = 1

test_loss, test_acc = modelWord.evaluate(ADDREmbeds, ADDRLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(ADDREmbeds), axis=-1)

197/197 - 0s - loss: 0.1328 - accuracy: 0.9603 - 189ms/epoch - 959us/step


### CITY

In [118]:
CITYEmbeds = sampleDF.CITY.dropna().apply(lambda x: documentEmbed(x))











































In [119]:
CITYEmbeds.apply(pd.Series)
CITYEmbeds = CITYEmbeds.apply(lambda x: np.sum(x[1], axis=1))
CITYEmbeds = np.concatenate(CITYEmbeds.tolist(), axis=0)

In [121]:
CITYLabels = np.zeros((8,6289))
CITYLabels[3] = 1

test_loss, test_acc = modelWord.evaluate(CITYEmbeds, CITYLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(CITYEmbeds), axis=-1)

197/197 - 0s - loss: 0.1331 - accuracy: 0.9401 - 224ms/epoch - 1ms/step


### STABBR

In [122]:
STABBREmbeds = sampleDF.STABBR.dropna().apply(lambda x: documentEmbed(x))











































In [123]:
STABBREmbeds.apply(pd.Series)
STABBREmbeds = STABBREmbeds.apply(lambda x: np.sum(x[1], axis=1))
STABBREmbeds = np.concatenate(STABBREmbeds.tolist(), axis=0)

In [125]:
STABBRLabels = np.zeros((8,6289))
STABBRLabels[4] = 1

test_loss, test_acc = modelWord.evaluate(STABBREmbeds, STABBRLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(STABBREmbeds), axis=-1)

197/197 - 0s - loss: 0.0020 - accuracy: 0.9995 - 213ms/epoch - 1ms/step


### CHFNM

In [126]:
CHFNMEmbeds = sampleDF.CHFNM.dropna().apply(lambda x: documentEmbed(x))











































In [127]:
CHFNMEmbeds.apply(pd.Series)
CHFNMEmbeds = CHFNMEmbeds.apply(lambda x: np.sum(x[1], axis=1))
CHFNMEmbeds = np.concatenate(CHFNMEmbeds.tolist(), axis=0)

In [129]:
CHFNMLabels = np.zeros((8,6229))
CHFNMLabels[5] = 1

test_loss, test_acc = modelWord.evaluate(CHFNMEmbeds, CHFNMLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(CHFNMEmbeds), axis=-1)

195/195 - 0s - loss: 0.0766 - accuracy: 0.9695 - 245ms/epoch - 1ms/step


### CHFTITLE

In [130]:
CHFTITLEEmbeds = sampleDF.CHFTITLE.dropna().apply(lambda x: documentEmbed(x))











































In [131]:
CHFTITLEEmbeds.apply(pd.Series)
CHFTITLEEmbeds = CHFTITLEEmbeds.apply(lambda x: np.sum(x[1], axis=1))
CHFTITLEEmbeds = np.concatenate(CHFTITLEEmbeds.tolist(), axis=0)

In [133]:
CHFTITLELabels = np.zeros((8,6229))
CHFTITLELabels[6] = 1

test_loss, test_acc = modelWord.evaluate(CHFTITLEEmbeds, CHFTITLELabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(CHFTITLEEmbeds), axis=-1)

195/195 - 0s - loss: 0.0425 - accuracy: 0.9928 - 202ms/epoch - 1ms/step


### COUNTYNM

In [134]:
COUNTYNMEmbeds = sampleDF.COUNTYNM.dropna().apply(lambda x: documentEmbed(x))











































In [135]:
COUNTYNMEmbeds.apply(pd.Series)
COUNTYNMEmbeds = COUNTYNMEmbeds.apply(lambda x: np.sum(x[1], axis=1))
COUNTYNMEmbeds = np.concatenate(COUNTYNMEmbeds.tolist(), axis=0)

In [138]:
COUNTYNMLabels = np.zeros((8,6286))
COUNTYNMLabels[7] = 1

test_loss, test_acc = modelWord.evaluate(COUNTYNMEmbeds, COUNTYNMLabels.transpose(), verbose=2)
predictionSeries = np.argmax(modelWord.predict(COUNTYNMEmbeds), axis=-1)

197/197 - 0s - loss: 0.0343 - accuracy: 0.9892 - 158ms/epoch - 801us/step
