In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import json
import faiss
import joblib

import torch

from sklearn.model_selection import train_test_split

from src.data.rac_utils import pooled_agument_texts
from src.data.datastruct import Sample, Batch
from src.data.collate import collate_func, create_batch, create_samples

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the raw ticket data, rebuild the train/test split, and load the
# retrieval artifacts (corpus, FAISS indexes, sentence encoder).
device = "cuda" if torch.cuda.is_available() else "cpu"
DATA_PATH = "../data/raw/all_tickets_processed_improved_v3.csv"
df = pd.read_csv(DATA_PATH)

X = df["Document"]
y = df["Topic_group"]
# Stratified 80/20 split with a fixed seed so the same split is reproduced on
# every run.
# NOTE(review): the indexes loaded below are named "traindata_*"; presumably
# they were built from this exact train split — confirm, otherwise test
# documents could leak into retrieval.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, shuffle=True, random_state=2
)

# Read the corpus with an explicit encoding instead of the platform default,
# so the load behaves the same on every OS.
with open(
    "../artifacts/rac_corpus_similarity-euclidian_index_v01.json", "r", encoding="utf-8"
) as f:
    corpus = json.load(f)

# Two FAISS indexes stored as separate artifacts (similarity vs. euclidian,
# going by the file names).
similarity_index = faiss.read_index("../artifacts/traindata_similarity_index_v01.index")
euclidian_index = faiss.read_index("../artifacts/traindata_euclidian_index_v01.index")

# Sentence encoder used for the retrieval queries; placed on the GPU when one
# is available.
retrieval_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

In [3]:
# Augment every training document via the similarity index.
# The recorded run processed 38,269 texts in roughly five minutes.
# NOTE(review): remove_top=True presumably discards the best hit because a
# training query would retrieve itself — confirm in src.data.rac_utils.
X_train_agumented = pooled_agument_texts(
    retrieval_model=retrieval_model,
    index=similarity_index,
    corpus=corpus,
    keys=["Document", "Topic_group"],
    augmentation_key="Document",
    texts=X_train.tolist(),
    remove_top=True,
)

38269it [05:13, 121.90it/s]


In [4]:
# Augment every held-out test document via the similarity index, using the
# same arguments as the training call.
# NOTE(review): remove_top=True is also applied to test queries; if the index
# only contains training documents this may drop a genuine nearest neighbour
# rather than a self-match — confirm this is intended.
X_test_agumented = pooled_agument_texts(
    retrieval_model=retrieval_model,
    index=similarity_index,
    corpus=corpus,
    keys=["Document", "Topic_group"],
    augmentation_key="Document",
    texts=X_test.tolist(),
    remove_top=True,
)

9568it [01:20, 118.16it/s]


In [6]:
# Persist the similarity-augmented train/test data as one JSON document with
# the layout {"train": {"X", "y"}, "test": {"X", "y"}}.
with open("../data/processed/agumented_ticketdata_similarity_v01.json", "w") as f:
    json.dump(
        {
            "train": {"X": X_train_agumented, "y": y_train.tolist()},
            "test": {"X": X_test_agumented, "y": y_test.tolist()},
        },
        f,
    )

In [7]:
# Repeat the augmentation for both splits, this time querying the euclidian
# index. This rebinds X_train_agumented / X_test_agumented, so any values
# previously held under these names are replaced.
# The two calls stay separate so the train result is already assigned if the
# test call fails.
X_train_agumented = pooled_agument_texts(
    retrieval_model=retrieval_model,
    index=euclidian_index,
    corpus=corpus,
    keys=["Document", "Topic_group"],
    augmentation_key="Document",
    texts=X_train.tolist(),
    remove_top=True,
)
X_test_agumented = pooled_agument_texts(
    retrieval_model=retrieval_model,
    index=euclidian_index,
    corpus=corpus,
    keys=["Document", "Topic_group"],
    augmentation_key="Document",
    texts=X_test.tolist(),
    remove_top=True,
)

38269it [05:49, 109.57it/s]
9568it [01:30, 105.76it/s]


In [8]:
# Persist the current X_train_agumented / X_test_agumented values, together
# with the labels, as one JSON document with the layout
# {"train": {"X", "y"}, "test": {"X", "y"}}.
with open("../data/processed/agumented_ticketdata_euclidian_v01.json", "w") as f:
    json.dump(
        {
            "train": {"X": X_train_agumented, "y": y_train.tolist()},
            "test": {"X": X_test_agumented, "y": y_test.tolist()},
        },
        f,
    )