In [1]:
import pandas as pd
import json 
import glob
import faiss
from fuzzywuzzy import fuzz
from llmsherpa.readers import LayoutPDFReader
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime
import torch
from transformers import AutoTokenizer, AutoModel



In [2]:
import tqdm

In [3]:
from src.rechunker import Rechunker
from src.encoder.sentence_transformer import Encoder
from src.faiss.flat_idx import FlatIdx
from utils.utils import flatten_list, write_list_to_file, read_list_from_file
from src.eval import Eval


[nltk_data] Downloading package punkt to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data

In [4]:
save_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data"

In [5]:
all_data_sherpa = read_list_from_file(save_path, "sherpa_paras_and_tables")
filenames_sherpa = read_list_from_file(save_path, "sherpa_paras_and_tables_filenames")
assert (len(all_data_sherpa)==len(filenames_sherpa))

In [6]:
ground_truth_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\document_questions.xlsx"
ground_truth = pd.read_excel(ground_truth_path)
ground_truth_text = ground_truth
test_data = list(ground_truth_text["relevant questions"])
test_labels = list(ground_truth_text["answer"])

In [7]:
len(test_labels), len(all_data_sherpa)

(33, 502)

## Model

In [8]:
def normalize_vectors(vectors):
    """Normalize vectors to unit length."""
    norm = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.where(norm == 0, 1, norm)

In [9]:
tokenizer = AutoTokenizer.from_pretrained('facebook/dragon-plus-query-encoder')
query_encoder = AutoModel.from_pretrained('facebook/dragon-plus-query-encoder')
context_encoder = AutoModel.from_pretrained('facebook/dragon-plus-context-encoder')

In [10]:
query =  test_data
contexts = all_data_sherpa

In [11]:
ctx_emb = torch.empty((0, 768))
for i in tqdm.tqdm(range(len(all_data_sherpa))):
    ctx_input = tokenizer(contexts[i:i+1], padding=True, truncation=True, return_tensors='pt', max_length = 512)
    temp_emb = context_encoder(**ctx_input).last_hidden_state[:, 0, :]
    ctx_emb = torch.cat((ctx_emb, temp_emb), dim=0)

100%|███████████████████████████████████████████████████████████████████████| 502/502 [03:09<00:00,  2.65it/s]


In [12]:
ctx_emb_ = normalize_vectors(ctx_emb.detach().numpy())

In [13]:
save_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data\tensors\dragon_paras_norm.pt"
torch.save(ctx_emb_, save_path)

QUERY VECTORS

In [30]:
xq = torch.empty((0, 768))
for i in tqdm.tqdm(range(len(test_data))):
    q_input = tokenizer(test_data[i:i+1], padding=True, truncation=True, return_tensors='pt', max_length = 512)
    temp_emb = context_encoder(**q_input).last_hidden_state[:, 0, :]
    xq = torch.cat((xq, temp_emb), dim=0)

100%|█████████████████████████████████████████████████████████████████████████| 33/33 [00:01<00:00, 16.97it/s]


In [31]:
xq_ = normalize_vectors(xq.detach().numpy())

In [32]:
index = faiss.IndexFlatL2(ctx_emb.shape[1])

In [33]:
index.add(ctx_emb_)

## Search

In [34]:
k = 10
distances, indices = index.search(xq_, k)

In [35]:
ret_context = []
for i in range(len(test_data)):
    retrieved_items = [contexts[i] for i in list(indices[i])]
    ret_context.append(retrieved_items)

In [36]:
metric = Eval(k=10)
recall, incorrect, correct = metric.recall_k(test_labels, ret_context)

In [37]:
print ("Recall is ", recall)
print ("MRR is ", metric.mean_reciprocal_rank(ret_context, test_labels))

Recall is  0.5757575757575758
MRR is  0.27878787878787875


In [41]:
incorrect[3]

{'None': ['Financial markets are an important challenge for agent-based computational modelers.\nFinancial markets may be one of the important early areas where agent-based methods show their worth, for two basic reasons.',
  'Financial markets are particularly appealing applications for agent-based methods for several reasons.\nFirst, the key debates in nance about market eciency and rationality are still unresolved.\nSecond, nancial time series contain many curious puzzles that are not well understood.\nThird, nancial markets provide a wealth of pricing and volume data that can be analyzed.\nFourth, when considering evolution, nancial markets provide a good approximation to a crude tness measure through wealth or return performance.\nFinally, there are strong connections to relevant experimental results that in some cases operate at the same time scales as actual nancial markets.',
  'Levy, M., Levy, H. & Solomon, S. (2000), Microscopic Simulation of Financial Markets, Academic Press

In [43]:
test_data[3]

'Why do you think financial markets are viewed as interacting groups of learning and boundedly-rational agents?'