### Creating transformer encodings and a vector database

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the processed ticket dataset and carve out a stratified hold-out split.
DATA_PATH = "../data/raw/all_tickets_processed_improved_v3.csv"
TEST_SIZE = 0.2     # fraction of rows held out as the test split
RANDOM_STATE = 2    # fixed seed so the split is reproducible across runs

df = pd.read_csv(DATA_PATH)

# `Document` is the text column that gets embedded further down;
# `Topic_group` is the label used for stratification and stored with the corpus.
X = df["Document"]
y = df["Topic_group"]

# Stratifying on y keeps the label proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    shuffle=True,
    random_state=RANDOM_STATE,
)

In [3]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to encode both the corpus and the queries.
RETRIEVAL_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
retrieval_model = SentenceTransformer(RETRIEVAL_MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Embed every training document. Vectors are L2-normalised so that an
# inner-product index behaves like cosine similarity.
X_train_embeddings = retrieval_model.encode(
    X_train.tolist(),
    batch_size=64,
    normalize_embeddings=True,
    show_progress_bar=True,
)

# FAISS indexes store and search float32 vectors. Casting to float64 would
# double the memory footprint and force a re-conversion on every add/search
# (and is rejected outright by older FAISS builds), so keep the matrix as
# contiguous float32.
X_train_embeddings = np.ascontiguousarray(X_train_embeddings, dtype=np.float32)

Batches: 100%|██████████| 598/598 [00:27<00:00, 21.96it/s]


In [5]:
import faiss
# Build two exhaustive (flat) indexes over the same training embeddings:
# one ranked by inner product, one ranked by L2 distance.
embedding_dim = X_train_embeddings.shape[1]

similarity_index = faiss.IndexFlatIP(embedding_dim)
euclidian_index = faiss.IndexFlatL2(embedding_dim)

# Row i of each index corresponds to position i of X_train.
for flat_index in (similarity_index, euclidian_index):
    flat_index.add(X_train_embeddings)

In [6]:
# Persist both indexes next to each other in the artifacts directory.
for built_index, index_path in (
    (similarity_index, "../artifacts/traindata_similarity_index_v01.index"),
    (euclidian_index, "../artifacts/traindata_euclidian_index_v01.index"),
):
    faiss.write_index(built_index, index_path)

In [8]:
import json
# Store the training documents and their labels in index order, so that a
# row id returned by either FAISS index can be mapped back to its text/label.
corpus = dict(
    Document=X_train.tolist(),
    Topic_group=y_train.tolist(),
)

corpus_path = "../artifacts/rac_corpus_similarity-euclidian_index_v01.json"
with open(corpus_path, 'w') as corpus_file:
    json.dump(corpus, corpus_file)

In [None]:
# Smoke test: retrieve the 10 nearest training documents for a sample query
# using the L2 index.
query_text = "card card va sa se pare ca la va si manager"
query_embedding = retrieval_model.encode([query_text], normalize_embeddings=True)

# For IndexFlatL2, `scores` holds squared L2 distances (smaller = closer);
# `idxs` holds the row positions of the matches in the indexed matrix.
scores, idxs = euclidian_index.search(query_embedding, 10)

# Materialise the document list once instead of rebuilding it per hit.
train_documents = X_train.tolist()
for idx in idxs[0]:
    # FAISS pads with -1 when fewer than k neighbours exist; without this
    # guard Python would silently print the *last* document instead.
    if idx < 0:
        continue
    print(train_documents[idx])

card card va sa se pare ca la va si manager
damaged card card va la ca card va sa si care se sa alt card va manager
card card va si si va
card card la care vine si ca badge sa si sa sa ii manager
card si card si va si
card card sa la la la va sa la parte accountant
card de card si ca
card de pm card si ca card
card card va sa tot card si sa
card card tin sa ca si va cum sa


In [9]:
# Display the normalised embedding produced for the sample query.
sample_query = ["card card va sa se pare ca la va si manager"]
retrieval_model.encode(sample_query, normalize_embeddings=True)

array([[-4.73827235e-02,  3.59718092e-02, -2.70001367e-02,
         4.62755142e-03, -4.97829095e-02,  3.55505906e-02,
         6.78645670e-02,  5.69617115e-02,  8.80174413e-02,
         2.87438394e-03,  2.09275912e-02, -8.00661892e-02,
        -3.97205167e-02, -2.96296142e-02,  5.30041791e-02,
        -7.66308531e-02, -7.15494901e-02,  5.32197617e-02,
         5.93414493e-02,  7.86548108e-02,  9.07367002e-03,
        -1.31609682e-02, -1.28871441e-01,  6.41460493e-02,
        -1.26899436e-01,  2.63836626e-02, -5.24218939e-02,
        -2.62743561e-03, -3.10838427e-02, -6.44801632e-02,
         8.66640657e-02,  9.02670100e-02,  1.01636373e-01,
         6.23881929e-02, -1.25458129e-04, -5.70091512e-03,
         1.61388353e-03,  5.17689344e-03,  4.49216142e-02,
         1.37265744e-02, -7.57528171e-02, -2.91122999e-02,
        -4.48098108e-02, -4.12317477e-02,  7.21852183e-02,
         5.77058224e-03,  5.04175909e-02,  2.14143526e-02,
        -1.02370217e-01,  2.92841699e-02, -4.43877317e-0