In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import joblib
from transformers import AutoTokenizer

from ConfigSchema import ConfigSchema

  from tqdm.autonotebook import tqdm, trange





In [4]:
# Load the decomposed dataset: one JSON object per line (JSONL).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale encoding, and blank lines (e.g. a trailing newline at the end
# of the file) are skipped instead of crashing json.loads.
with open('semi_automated_dataset_creation/processed_decomposed_dataset.jsonl', 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f if line.strip()]

# Build one (query, strategy) pair per record as a 2-column array:
#   column 0 -> item['query']
#   column 1 -> item['S_a'] when label <= 0, otherwise item['S_b']
# NOTE(review): presumably all three fields are plain strings - confirm against the dataset.
data_pair = np.array([
    (item['query'], item['S_a'] if item['label'] <= 0 else item['S_b'])
    for item in dataset
])

In [5]:
# Parse config.cfg ("key = value" per line) into a plain dict, then hand the
# dict to ConfigSchema.from_dict.
config_schema = ConfigSchema()
config = {}
with open("config.cfg", "r") as cfg:
    for line_no, raw_line in enumerate(cfg, start=1):
        entry = raw_line.strip()
        # Skip blank lines and comments. Testing the stripped text also catches
        # comments that are indented before the "#".
        if not entry or entry.startswith("#"):
            continue
        # Split on the FIRST "=" only, so a value may itself contain "=".
        key, sep, value = entry.partition("=")
        if not sep:
            raise ValueError(
                f"config.cfg line {line_no}: expected 'key=value', got {entry!r}"
            )
        config[key.strip()] = value.strip()
# NOTE(review): from_dict's return value is ignored, so it presumably populates
# config_schema in place - confirm in ConfigSchema.
config_schema.from_dict(config)

In [6]:
# Load the tokenizer for the model named by the parsed config (config_schema.model_name).
tokenizer = AutoTokenizer.from_pretrained(config_schema.model_name)
# Reuse the end-of-sequence token as the padding token; the fixed-length
# encodings produced with padding='max_length' are filled with it.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Turn every query (column 0 of data_pair) into a fixed-length feature vector:
# token ids padded/truncated to 256 positions, then scaled by the vocabulary size.
# Calling the tokenizer directly replaces the deprecated batch_encode_plus and
# produces the same batch encoding for a list of strings.
# NOTE(review): tokenizer.vocab_size may not count added special tokens, so a
# scaled id can come out slightly above 1.0 (the cached X.npy shows 1.00001319
# in padded positions) - confirm this is acceptable for the model input.
X = tokenizer(
    data_pair[:, 0].tolist(),
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='np',
)['input_ids'] / tokenizer.vocab_size
# Cache the features on disk so later cells can np.load them without re-tokenizing.
np.save('X.npy', X)
print("X done")

In [7]:
# Reload the cached query features from X.npy (written with np.save in the encoding cell).
X = np.load('X.npy')
# Bare expression: the cell displays the array repr as its output.
X

array([[2.33034166e-01, 1.79566482e-01, 4.30478163e-01, ...,
        1.00001319e+00, 1.00001319e+00, 1.00001319e+00],
       [3.62759903e-02, 8.44747202e-03, 1.74093100e-03, ...,
        1.00001319e+00, 1.00001319e+00, 1.00001319e+00],
       [5.53932592e-03, 1.36979617e-01, 3.46669480e-02, ...,
        1.00001319e+00, 1.00001319e+00, 1.00001319e+00],
       ...,
       [3.40140989e-02, 1.83984754e-03, 2.20346472e-01, ...,
        1.00001319e+00, 1.00001319e+00, 1.00001319e+00],
       [3.23015240e-01, 1.76269264e-02, 7.25387918e-05, ...,
        5.85651827e-02, 1.74093100e-03, 1.13635315e-01],
       [5.55251479e-02, 3.46207870e-03, 2.14319157e-02, ...,
        1.01949975e-01, 5.55251479e-03, 4.08855008e-04]])

In [None]:
# Embed the target strategy text (column 1 of data_pair) with a pretrained
# sentence-transformers model; these embeddings are the regression targets.
encoder = SentenceTransformer('all-mpnet-base-v2')
# normalize_embeddings=True asks the encoder for unit-length vectors.
Y = encoder.encode(data_pair[:,1], normalize_embeddings=True)
# Cache the embeddings on disk so later cells can np.load them without re-encoding.
np.save('Y.npy', Y)
print("Y done")

In [8]:
# Reload the cached strategy embeddings from Y.npy (written with np.save in the embedding cell).
Y = np.load('Y.npy')
# Bare expression: the cell displays the array repr as its output.
Y

array([[ 0.00895995, -0.12334388, -0.0325086 , ..., -0.05825622,
         0.0237086 , -0.00698727],
       [ 0.02767649,  0.02718943, -0.01895026, ...,  0.02996054,
         0.01355811, -0.02394591],
       [-0.02491825,  0.00840581, -0.0215272 , ...,  0.05550037,
        -0.03574743, -0.00795398],
       ...,
       [ 0.03039899, -0.00654207,  0.00221696, ...,  0.00505765,
         0.03097504, -0.00514259],
       [-0.06168054,  0.13452576, -0.03894248, ...,  0.03692231,
        -0.05549473, -0.03547874],
       [ 0.0645332 , -0.00039842,  0.00253504, ..., -0.00615496,
         0.02678013, -0.03398724]], dtype=float32)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (feature, embedding) rows for evaluation; the fixed
# random_state keeps the partition identical from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)

# Report the resulting array shapes as a quick sanity check.
print(f"Train shape: X={X_train.shape}, Y={Y_train.shape}")
print(f"Test shape:  X={X_test.shape}, Y={Y_test.shape}")

Train shape: X=(7295, 256), Y=(7295, 768)
Test shape:  X=(1824, 256), Y=(1824, 768)


In [41]:
import tensorflow as tf
from tensorflow.keras import backend as K

def cosine_similarity_loss(y_true, y_pred):
    """
    Cosine-distance loss: 1 minus the batch-mean cosine similarity.

    Both inputs are L2-normalised along axis 1 before comparison, so only the
    direction of each row matters, not its magnitude.

    Args:
        y_true: target vectors, one row per sample.
        y_pred: predicted vectors, same layout as ``y_true``.

    Returns:
        A scalar tensor: 0 when every prediction points the same way as its target.
    """
    unit_true = tf.math.l2_normalize(y_true, axis=1)
    unit_pred = tf.math.l2_normalize(y_pred, axis=1)
    # Row-wise dot product of unit vectors == per-sample cosine similarity.
    per_sample_similarity = tf.reduce_sum(tf.multiply(unit_true, unit_pred), axis=1)
    mean_similarity = tf.reduce_mean(per_sample_similarity)
    return 1.0 - mean_similarity

def hybrid_loss(y_true, y_pred):
    """
    Combined objective: cosine-distance loss plus half the mean squared error.

    The cosine term rewards matching direction; the MSE term (computed on the
    raw, un-normalised tensors) additionally penalises element-wise differences.

    Args:
        y_true: target vectors, one row per sample.
        y_pred: predicted vectors, same layout as ``y_true``.

    Returns:
        A scalar tensor equal to ``cosine_similarity_loss + 0.5 * mse``.
    """
    squared_error = tf.square(y_true - y_pred)
    mean_squared_error = tf.reduce_mean(squared_error)
    direction_term = cosine_similarity_loss(y_true, y_pred)
    # 0.5 is the relative weight of the MSE term.
    return direction_term + 0.5 * mean_squared_error

In [None]:

from tensorflow.keras import layers, models

def create_mlp(input_dim, output_dim):
    """
    Build and compile a small fully connected regression network.

    Architecture: input -> Dense(1024, relu) -> Dense(1024, relu) -> Dense(output_dim).
    The output layer has no activation, so it can emit arbitrary real-valued vectors.

    Args:
        input_dim: number of features per input row.
        output_dim: length of the vector to predict.

    Returns:
        A Keras Sequential model compiled with the Adam optimizer, ``hybrid_loss``
        as the objective, and MAE plus ``cosine_similarity_loss`` as metrics.
    """
    layer_stack = [layers.Input(shape=(input_dim,))]
    # Two identical hidden layers.
    layer_stack += [layers.Dense(1024, activation='relu') for _ in range(2)]
    # Linear output head for regression.
    layer_stack.append(layers.Dense(output_dim))

    network = models.Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss=hybrid_loss,
        metrics=['mae', cosine_similarity_loss],
    )
    return network

# Size the network from the data: one input unit per feature column of X_train,
# one output unit per embedding dimension of Y_train.
model = create_mlp(input_dim=X_train.shape[1], output_dim=Y_train.shape[1])
# Train for 20 epochs with mini-batches of 16; the returned History object is
# the cell's displayed value.
model.fit(x=X_train, y=Y_train, epochs=20, batch_size=16)

Epoch 1/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - cosine_similarity_loss: 0.5909 - loss: 0.5984 - mae: 0.0828
Epoch 2/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - cosine_similarity_loss: 0.5529 - loss: 0.5543 - mae: 0.0404
Epoch 3/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - cosine_similarity_loss: 0.5478 - loss: 0.5490 - mae: 0.0376
Epoch 4/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - cosine_similarity_loss: 0.5336 - loss: 0.5348 - mae: 0.0376
Epoch 5/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - cosine_similarity_loss: 0.5193 - loss: 0.5204 - mae: 0.0368
Epoch 6/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - cosine_similarity_loss: 0.5019 - loss: 0.5030 - mae: 0.0359
Epoch 7/20
[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - cosine_similarity_loss: 

<keras.src.callbacks.history.History at 0x1db812a2e10>

In [45]:
# Score the trained model on the held-out split. The returned list starts with
# the loss, followed by the metrics in the order given to compile()
# ('mae', then cosine_similarity_loss).
model.evaluate(X_test, Y_test)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - cosine_similarity_loss: 0.5800 - loss: 0.5813 - mae: 0.0391


[0.5787839293479919, 0.03891289606690407, 0.5774489045143127]

In [46]:
# Distinct strategy strings found in column 1 of data_pair.
# np.unique returns them sorted, so the order is identical on every run;
# iterating a set of strings is not stable across interpreter sessions.
unique_strategies = np.unique(data_pair[:, 1])
# get the encodings for the unique strategies via indexing

In [47]:
# For each unique strategy, find the row index of its first occurrence in
# data_pair, then pull the matching precomputed embedding row out of Y.
locs = [
    np.flatnonzero(data_pair[:,1] == strategy)[0]
    for strategy in unique_strategies
]
unique_embds = Y[locs]

In [65]:
def predict_strategy(query: str|list[str], verbose=False) -> tuple[np.ndarray, np.ndarray]:
    """
    Retrieve the closest known strategy for one or more queries.

    Each query is tokenized exactly like the training features (ids padded or
    truncated to 256 positions and divided by the tokenizer's vocab size), mapped
    to an embedding by the trained ``model``, and matched by cosine similarity
    against every row of the strategy-embedding matrix ``Y``.

    Relies on the notebook globals ``tokenizer``, ``model``, ``Y`` and ``data_pair``.

    Args:
        query: a single query string or a list of query strings.
        verbose: when True, print the best-match row indices and their similarities.

    Returns:
        A tuple ``(strategies, cos_sim)`` where ``strategies`` holds the column-1
        entry of ``data_pair`` for the best-matching row of each query, and
        ``cos_sim`` is the full (n_queries, len(Y)) similarity matrix.
    """
    # A bare string would otherwise be iterated as a batch of single characters.
    queries = [query] if isinstance(query, str) else list(query)
    if not queries:
        # Nothing to predict: return empty results with the usual layout.
        return data_pair[:0, 1], np.empty((0, Y.shape[0]), dtype=Y.dtype)

    # Calling the tokenizer directly replaces the deprecated batch_encode_plus
    # and yields the same batch encoding for a list of strings.
    x = tokenizer(
        queries,
        padding='max_length',
        truncation=True,
        max_length=256,
        return_tensors='np'
    )
    x = x['input_ids'] / tokenizer.vocab_size
    pred_emb = model.predict(x)

    # Normalise both sides to unit length so the dot product is a cosine
    # similarity. Norms are floored at the smallest positive float so an
    # all-zero row gives similarity 0 instead of NaN.
    pred_norm = np.linalg.norm(pred_emb, axis=1, keepdims=True)
    input_norm = pred_emb / np.maximum(pred_norm, np.finfo(pred_emb.dtype).tiny)
    y_norm = np.linalg.norm(Y, axis=1, keepdims=True)
    database_norm = Y / np.maximum(y_norm, np.finfo(Y.dtype).tiny)

    cos_sim = np.dot(input_norm, database_norm.T)
    nearest_indices = np.argmax(cos_sim, axis=1).astype(int)
    if verbose:
        print(f"Nearest indices: {nearest_indices}")
        print("Similarities:",[cos_sim[i,nearest_indices[i]] for i in range(len(nearest_indices)) ] )
    # Rows of Y line up with rows of data_pair, so the index maps straight back to text.
    return data_pair[nearest_indices, 1], cos_sim

In [71]:
# Run the predictor over every query in the dataset (column 0 of data_pair).
# `out` receives the retrieved strategy per query, `sim` the similarity matrix.
out, sim = predict_strategy(data_pair[:,0].tolist())

[1m285/285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [None]:
# Display the cosine-similarity matrix returned by predict_strategy.
sim