In [None]:
import pandas as pd
import json 
import glob
import faiss
from fuzzywuzzy import fuzz
from llmsherpa.readers import LayoutPDFReader
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
import faiss

In [None]:
from src.rechunker import Rechunker
from src.encoder.tf_idf import Encoder
from src.faiss.flat_idx import FlatIdx
from utils.utils import flatten_list, write_list_to_file, read_list_from_file
from src.eval import Eval
from src.post_processing import idk

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Fetch the NLTK resources this notebook relies on: tokenizer models ('punkt'),
# WordNet ('wordnet') and the stop-word lists ('stopwords').
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

## Data

In [None]:
# Directory that holds the pre-extracted data lists read by this notebook.
# Set the TAKE_HOME_DATA_DIR environment variable to run against another
# location; the original local path is kept as the fallback so existing runs
# behave exactly as before.
save_path = os.environ.get(
    "TAKE_HOME_DATA_DIR",
    r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data",
)

In [None]:
# Load the saved text items and their companion list of file names from save_path.
all_data_sherpa = read_list_from_file(save_path, "sherpa_paras_and_tables")
filenames_sherpa = read_list_from_file(save_path, "sherpa_paras_and_tables_filenames")
# Sanity check: the two lists must be the same length.
assert len(all_data_sherpa) == len(filenames_sherpa)

In [None]:
# Spreadsheet with the evaluation questions and reference answers.
# Set TAKE_HOME_QUESTIONS_XLSX to use a different file; the original local
# path remains the default so existing runs are unaffected.
ground_truth_path = os.environ.get(
    "TAKE_HOME_QUESTIONS_XLSX",
    r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\document_questions.xlsx",
)
ground_truth = pd.read_excel(ground_truth_path)
# Keep only rows whose "complexity" is "table" or "text"; .copy() detaches the
# subset from the full frame.
ground_truth_text = ground_truth[ground_truth["complexity"].isin(["table", "text"])].copy()
# Queries come from the "relevant questions" column, labels from "answer".
test_data = list(ground_truth_text["relevant questions"])
test_labels = list(ground_truth_text["answer"])

In [None]:
# Show the full ground-truth table as loaded (before the complexity filter).
ground_truth

In [None]:
# Number of evaluation answers kept after the complexity filter.
len(test_labels)

## Vectorizer

In [None]:
def preprocess_text(text):
    """Strip punctuation, drop English stop words and lowercase what remains.

    No stemming or lemmatization is applied.

    Args:
        text: Raw input string.

    Returns:
        The surviving lowercased tokens joined by single spaces.
    """
    # Delete every ASCII punctuation character before tokenizing.
    translator = str.maketrans('', '', string.punctuation)
    text_no_punctuation = text.translate(translator)
    tokens = nltk.word_tokenize(text_no_punctuation)
    stop_words = set(stopwords.words('english'))
    # Lowercase BEFORE the stop-word test: the NLTK English list is lowercase,
    # so comparing the raw token would let capitalized stop words such as
    # "The" or "What" slip through.
    lowered_tokens = (word.lower() for word in tokens)
    return ' '.join(word for word in lowered_tokens if word not in stop_words)

In [None]:
def tf_encoder(data, clean = False):
    """Fit a TF-IDF vectorizer on `data` and return dense document vectors.

    Args:
        data: Iterable of text documents.
        clean: When true, each document is first passed through
            preprocess_text.

    Returns:
        A tuple (dense_vectors, vectorizer, data): the dense TF-IDF array, the
        fitted TfidfVectorizer, and the documents that were actually
        vectorized (the cleaned ones when `clean` is true).
    """
    corpus = [preprocess_text(document) for document in data] if clean else data
    vectorizer = TfidfVectorizer()
    # fit_transform yields a sparse matrix; toarray() makes it dense.
    dense_vectors = vectorizer.fit_transform(corpus).toarray()
    return dense_vectors, vectorizer, corpus

In [None]:
# Vectorize the raw (uncleaned) corpus; keep the fitted vectorizer for queries.
dense_vectors, vectorizer, data = tf_encoder(all_data_sherpa, clean=False)

In [None]:
# Feature (term) names learned by the fitted vectorizer; the count is displayed
# and later used as the index dimensionality.
feature_names = vectorizer.get_feature_names_out()
len(feature_names)

## FAISS

### L2

In [None]:
# Build TF-IDF embeddings with the project Encoder (imported from src.encoder.tf_idf).
# NOTE(review): this rebinds the name `tf_encoder`, replacing the tf_encoder()
# function defined earlier in the notebook; re-running the earlier
# tf_encoder(...) call after this cell will fail. Consider a distinct name.
tf_encoder = Encoder(all_data_sherpa)
# presumably populates tf_encoder.embedding, which the next cell reads — confirm
# in src.encoder.tf_idf
tf_encoder.get_embeddings(clean=True)

In [None]:
# Number of neighbours to retrieve per query.
k = 10
# NOTE(review): d is taken from the notebook-level vectorizer (fit with
# clean=False), while the vectors added below come from
# Encoder.get_embeddings(clean=True); confirm both have the same dimensionality.
index = FlatIdx(d=len(feature_names))
index.add_idx(tf_encoder.embedding)

In [None]:
# Clean every question with preprocess_text, then retrieve k items per question.
# Note that test_data is overwritten with its cleaned form.
test_data = list(map(preprocess_text, test_data))
retrieved_items, D = index.faiss_tfidf_inference(
    vectorizer, all_data_sherpa, test_data, k=k
)

In [None]:
# Post-process the first-stage results with idk (src.post_processing); its three
# return values replace the labels and retrieved items and define test_query.
# NOTE(review): idk's behaviour is not visible here — presumably it uses the
# distances D to handle low-confidence retrievals; confirm. Because test_labels
# and retrieved_items are overwritten, re-running this cell feeds idk its own output.
test_labels, retrieved_items, test_query = idk(test_labels, retrieved_items, D, test_data)

### Cosine

In [None]:
def normalize_vectors(vectors):
    """Scale every row of a 2-D array to unit Euclidean length.

    Args:
        vectors: 2-D array-like with one vector per row.

    Returns:
        An array of the same shape in which each non-zero row has L2 norm 1;
        all-zero rows are returned unchanged.
    """
    row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    # Substitute 1 for zero norms so all-zero rows are divided by 1, not by 0.
    is_zero_row = row_norms == 0
    safe_divisors = np.where(is_zero_row, 1, row_norms)
    return vectors / safe_divisors

In [None]:
# Scale each TF-IDF row to unit L2 norm (all-zero rows stay zero).
dense_vectors_ = normalize_vectors(dense_vectors)

In [None]:
# Inner-product index: on unit-length vectors the inner product is the cosine
# similarity.
index = faiss.IndexFlatIP(len(feature_names))
# Index the unit-normalized rows (dense_vectors_) rather than the raw ones, as a
# contiguous float32 array, which is the input type FAISS works with.
index.add(np.ascontiguousarray(dense_vectors_, dtype='float32'))

In [None]:
# Encode the queries with the fitted vectorizer as a dense float32 matrix.
xq = vectorizer.transform(test_data).toarray().astype('float32')

In [None]:
# Retrieve the k best matches from the index for every query vector.
k = 10
# NOTE(review): distances/indices are not read by any later cell shown here; the
# Eval section scores retrieved_items instead.
distances, indices = index.search(xq, k)

## Eval

In [None]:
# Evaluation helper from src.eval, configured with k=10.
metric = Eval(k=10)
# Scores retrieved_items (from the L2 retrieval + idk cells), not the output of
# the cosine search.
# NOTE(review): recall_k takes (labels, items) while mean_reciprocal_rank below
# is called with (items, labels) — confirm both orders against src.eval.
recall, incorrect, correct = metric.recall_k(test_labels, retrieved_items)

In [None]:
# Report first-stage retrieval quality.
print("Recall is ", recall)
mrr = metric.mean_reciprocal_rank(retrieved_items, test_labels)
print("MRR is ", mrr)

# Level-2

In [None]:
from src.encoder.dragon import Encoder
import tqdm

In [None]:
# Encoder here is src.encoder.dragon.Encoder (imported in the preceding cell); it
# shadows the TF-IDF Encoder imported under the same name near the top.
dragon = Encoder()

In [None]:
# Second-stage re-ranking: for each query, embed its first-stage candidates with
# the dragon encoder, index them, and re-order them with a dense search.
reorder_items = []
# NOTE(review): this replaces the test_query returned by idk(...) earlier in the
# notebook; if idk drops or reorders queries, test_query[i] and
# retrieved_items[i] would no longer line up. Confirm against src.post_processing.
test_query = test_data
for i in tqdm.tqdm(range(len(test_query)), desc="re-ranking"):
    c_embedding = dragon.get_embeddings(retrieved_items[i])
    index = FlatIdx(d=c_embedding.shape[1])
    if isinstance(c_embedding, torch.Tensor):
        # .numpy() raises on CUDA tensors; .cpu() is a no-op for tensors that
        # are already on the CPU.
        c_embedding = c_embedding.detach().cpu().numpy()
    index.add_idx(c_embedding)
    # k equals the candidate count, so every candidate is returned, re-ordered.
    reranked = index.faiss_dragon_inference(
        dragon.query_encoder,
        dragon.tokenizer,
        retrieved_items[i],
        [test_query[i]],
        k=len(retrieved_items[i]),
    )
    # A single query was submitted, so keep the first (only) result entry.
    reorder_items.append(reranked[0])

In [None]:
# Score the re-ranked lists with the same metric object; this overwrites the
# first-stage recall / incorrect / correct values.
recall, incorrect, correct = metric.recall_k(test_labels, reorder_items)
print("Recall is ", recall)
mrr = metric.mean_reciprocal_rank(reorder_items, test_labels)
print("MRR is ", mrr)

In [None]:
# Display the (preprocessed) query strings.
test_data