In [7]:
#Load Embeddings model

import torch
from transformers import AutoModel, AutoTokenizer

# Use the GPU when torch reports one as available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single checkpoint name supplies both the tokenizer and the encoder weights
model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Place the encoder on the selected device
model.to(device)

#Test functionality
# # Tokenize the text field and create a tensor with the token IDs
# text_field = "This is an example text field."
# inputs = tokenizer(text_field, return_tensors="pt", padding=True, truncation=True)
# inputs = {k: v.to(device) for k, v in inputs.items()}

# # Obtain the embeddings
# with torch.no_grad():
#     outputs = model(**inputs)
#     embeddings = outputs.last_hidden_state

# # Calculate the average embedding
# avg_embedding = embeddings.mean(dim=1).squeeze().cpu().numpy()
# avg_embedding

Downloading (…)okenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

Some weights of the model checkpoint at cross-encoder/ms-marco-TinyBERT-L-2-v2 were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


array([-0.4357803 ,  0.36340347, -0.06323107, -0.4547933 ,  0.7432816 ,
       -0.08517475,  0.16130118, -0.46278253, -1.5739542 , -1.5847503 ,
       -0.69040406,  0.7726777 ,  0.3736833 ,  0.66305876,  0.83053374,
       -1.5233039 , -1.5528806 , -0.8697493 ,  0.03831073,  0.20659317,
        0.30142975, -0.5748035 ,  1.8313729 ,  1.5341616 ,  0.0682143 ,
       -0.55511665,  1.4307392 ,  0.48015532, -0.37222254, -0.73368627,
       -2.1135156 ,  0.5932261 , -0.46605512,  1.1745759 , -0.23546444,
       -0.0999713 ,  0.3225118 , -1.3161471 ,  0.44002432,  0.04558567,
       -1.6922033 ,  0.7150186 , -1.025261  ,  0.5914664 ,  0.85801417,
        0.7897008 , -0.6620282 , -0.4997413 ,  0.27993873,  0.7173517 ,
        0.4957321 ,  0.12957649, -0.7568081 , -0.18477207, -0.41459048,
        0.20078199,  1.3074706 ,  0.3579617 ,  0.09655037,  2.6447399 ,
        0.28725016, -0.726943  , -0.055682  , -0.31199515, -0.6006541 ,
        0.18334545, -1.5489371 , -0.17327069, -1.2425092 , -0.95

In [17]:
#Load SampleData.csv into a dataframe / turn labels into embeddings
import pandas as pd
df = pd.read_csv('SampleData.csv')

#Get field AdjustmentName as list
AdjustmentNames = df['AdjustmentName'].tolist()


def _embed_text(text):
    """Return one embedding vector for `text` as a numpy array.

    The text is tokenized, run through the encoder without gradient tracking,
    and the last hidden state is averaged over the token axis.
    """
    #Treat underscores and hyphens as word separators before tokenizing
    text_field = text.replace('_', ' ').replace('-', ' ')
    inputs = tokenizer(text_field, return_tensors="pt", padding=True, truncation=True)
    #Tensors must live on the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state

    #Turn embedding into a regular vector (mean over tokens, batch axis squeezed away)
    return embeddings.mean(dim=1).squeeze().cpu().numpy()


#Embed every row, but run the encoder only once per distinct name; repeated
#names reuse the cached vector instead of paying for another forward pass.
#Assumes the encoder is deterministic for a given input (inference mode) — TODO confirm.
_embedding_cache = {}
queryList=[]
for AdjustmentName in AdjustmentNames:
    if AdjustmentName not in _embedding_cache:
        _embedding_cache[AdjustmentName] = _embed_text(AdjustmentName)
    queryList.append(_embedding_cache[AdjustmentName])



In [28]:
#Integer-encode the existing adjustment names (one numeric index per distinct name;
#this is label encoding, not one-hot encoding)
import numpy as np

#Get only the AdjustmentName column
df2=df[['AdjustmentName']]

#Get sorted array of unique values; position in this array is the numeric index
uniqueValues=df2['AdjustmentName'].unique()
uniqueValues.sort()

#Forward mapping: name -> index
uniqueValuesDict={name: idx for idx, name in enumerate(uniqueValues)}

#Inverse mapping: index -> name
uniqueValuesDictInverse=dict(enumerate(uniqueValues))

#AdjustmentName column expressed as numeric indices
AdjustmentNames = [uniqueValuesDict[name] for name in df2['AdjustmentName'].tolist()]

#Make new dataframe from AdjustmentNames
df_encoded = pd.DataFrame(AdjustmentNames, columns=['AdjustmentName'])

#The concat below aligns rows by position, so both sides need the same row count;
#a mismatch means queryList was built from a different dataframe state.
assert len(queryList) == len(df_encoded), "queryList and df rows are out of sync"

#Unpack vectors in queryList into a (rows, embedding_dim) grid.
#The width is taken from the data so a different encoder size still works;
#128 is kept only as the fallback for an empty queryList.
embedding_dim = np.size(queryList[0]) if len(queryList) else 128
df_query = pd.DataFrame(np.array(queryList).reshape(len(queryList), embedding_dim))

#Prepend df_query to df_encoded
df_encoded = pd.concat([df_query, df_encoded], axis=1)
df_encoded


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,AdjustmentName_Market Downturn,AdjustmentName_Noise_116z,AdjustmentName_Noise_128x,AdjustmentName_Noise_19b,AdjustmentName_Noise_22d,AdjustmentName_Noise_256n,AdjustmentName_Noise_6f,AdjustmentName_STD downgrade credit card,AdjustmentName_US Corporate Crunch,AdjustmentName_US STD downgrade
0,0.142441,1.340561,-1.079388,0.648643,-0.219182,-0.083226,-0.409499,-0.826931,-0.035303,-0.097986,...,0,0,0,0,0,0,0,0,0,0
1,-0.269460,1.361620,-1.282112,0.089436,0.785191,0.243085,0.841210,0.014414,-1.682267,0.049375,...,1,0,0,0,0,0,0,0,0,0
2,-0.269460,1.361620,-1.282112,0.089436,0.785191,0.243085,0.841210,0.014414,-1.682267,0.049375,...,1,0,0,0,0,0,0,0,0,0
3,0.142441,1.340561,-1.079388,0.648643,-0.219182,-0.083226,-0.409499,-0.826931,-0.035303,-0.097986,...,0,0,0,0,0,0,0,0,0,0
4,-0.472149,1.183860,-1.850750,0.366249,-0.031028,0.506558,-0.143773,-0.467634,-1.137986,0.385473,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0.015324,0.773752,-1.002914,0.593089,0.050763,0.109482,-0.460086,-0.350108,0.002370,0.501412,...,0,0,0,0,0,1,0,0,0,0
19996,0.142441,1.340561,-1.079388,0.648643,-0.219182,-0.083226,-0.409499,-0.826931,-0.035303,-0.097986,...,0,0,0,0,0,0,0,0,0,0
19997,-0.299798,1.626725,-1.120108,0.248860,1.173713,0.360141,0.049600,-0.746300,-1.034973,-0.495910,...,0,0,0,0,0,0,0,0,0,0
19998,-0.576203,0.907879,-1.088134,0.363876,-0.116089,-0.251194,-0.732559,-0.551034,-0.318248,0.431621,...,0,0,0,1,0,0,0,0,0,0


In [29]:
#Break into test and training sets
from sklearn.model_selection import train_test_split

#Features are every column except the encoded label; the target is the label column.
#Using df_encoded for both arguments would place the label among the inputs and the
#embedding values among the targets.
label_column = 'AdjustmentName'
X = df_encoded.drop(columns=[label_column])
y = df_encoded[label_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [33]:
#Define model
#Build model parameterized so we can do a grid-search on hyperparameters
#Training is deliberately not started at the top of this cell: until the
#KerasClassifier further down is created, `model` still refers to the
#transformers encoder loaded earlier, which cannot be fitted this way.
from keras.wrappers.scikit_learn import KerasClassifier

def CreateModel(neurons=200, dropout=0.2,layers=3):
    """Build and compile a plain feed-forward Keras classifier.

    Parameters
    ----------
    neurons : int
        Width of every hidden Dense layer.
    dropout : float
        Rate given to the single Dropout layer placed just before the output layer.
    layers : int
        Number of hidden Dense (ReLU) layers.

    Returns
    -------
    The compiled, built ``tf.keras`` Sequential model.

    Notes
    -----
    Reads ``tf``, ``string_columns`` (output width) and ``X_train`` (input width)
    from the notebook's global namespace; none of them is defined in this function.
    NOTE(review): presumably ``string_columns`` lists the label columns — confirm
    where it is defined.
    """
    #------------------PARAMETERS------------------
    n_outputs = len(string_columns)
    keras_layers = tf.keras.layers

    #Hidden stack: `layers` identical ReLU blocks
    stack = [keras_layers.Dense(units=neurons, activation='relu') for _ in range(layers)]

    #Dropout sits on the last fully connected layer, right before the classification head
    stack.append(keras_layers.Dropout(dropout))
    stack.append(keras_layers.Dense(units=n_outputs, activation='sigmoid'))

    #Alternative head left disabled in the original: Flatten + Dense(num_classes, softmax)

    #Simple feed-forward model. No U-nets or anything fancy
    ann = tf.keras.models.Sequential(stack)

    #Compile, then fix the input width to the number of training columns
    ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    ann.build(input_shape=(None, X_train.shape[1]))
    return ann

#Wrap the builder function in the Keras scikit-learn classifier wrapper.
#NOTE(review): this rebinds `model`, the name that earlier held the transformers encoder.
model = KerasClassifier(verbose=0, build_fn=CreateModel)

#Candidate hyperparameter values for the grid-search
param_grid = dict(
    epochs=[1, 2, 4, 8, 16],
    neurons=[200, 300, 400],
    dropout=[0.05, 0.1, 0.2, 0.4],
    layers=[1, 2, 4],
)

In [32]:
#Train on the training split with fixed batch size and epoch count
fit_kwargs = {"batch_size": 32, "epochs": 80}
model.fit(X_train, y_train, **fit_kwargs)