In [1]:
import pandas as pd
import json 
import glob
import faiss
from fuzzywuzzy import fuzz
from llmsherpa.readers import LayoutPDFReader
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime
import torch
from transformers import AutoTokenizer, AutoModel



In [2]:
import tqdm

In [3]:
from src.rechunker import Rechunker
from src.encoder.sentence_transformer import Encoder
from src.faiss.flat_idx import FlatIdx
from utils.utils import flatten_list, write_list_to_file, read_list_from_file
from src.eval import Eval


## Data

In [12]:
# Directory holding the pre-extracted paragraph lists that later cells read.
# Set the TAKE_HOME_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original local path is used unchanged.
save_path = os.environ.get(
    "TAKE_HOME_DATA_DIR",
    r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data",
)

In [13]:
# Read the paragraph texts and their source filenames (in that order) from the
# data directory; the two lists are expected to line up one-to-one.
all_data_sherpa, filenames_sherpa = (
    read_list_from_file(save_path, list_name)
    for list_name in ("sherpa_paras", "sherpa_paras_filenames")
)
assert len(filenames_sherpa) == len(all_data_sherpa)

In [14]:
# Load the question/answer spreadsheet and keep only the rows whose
# "complexity" column equals "text"; those rows supply the test queries and labels.
ground_truth_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\document_questions.xlsx"
ground_truth = pd.read_excel(ground_truth_path)
is_text_row = ground_truth["complexity"] == "text"
ground_truth_text = ground_truth[is_text_row].copy()
test_data, test_labels = (
    list(ground_truth_text[column]) for column in ("relevant questions", "answer")
)

## Model

In [10]:
# DRAGON+ uses separate query and context encoders; the tokenizer is loaded
# from the query-encoder checkpoint and used for both sides in this notebook.
_QUERY_CHECKPOINT = 'facebook/dragon-plus-query-encoder'
_CONTEXT_CHECKPOINT = 'facebook/dragon-plus-context-encoder'

tokenizer = AutoTokenizer.from_pretrained(_QUERY_CHECKPOINT)
query_encoder = AutoModel.from_pretrained(_QUERY_CHECKPOINT)
context_encoder = AutoModel.from_pretrained(_CONTEXT_CHECKPOINT)

In [15]:
# Alias the evaluation questions and the paragraph corpus under the names used
# by the encoding cells below (same objects, no copy).
query, contexts = test_data, all_data_sherpa


In [42]:
# Tokenize the whole list of questions as one batch. Padding is required to
# build a rectangular 'pt' tensor from questions of different lengths;
# truncation with an explicit max_length keeps every sequence within the
# 512-token limit already used by the context-encoding loop.
query_input = tokenizer(query, padding=True, truncation=True, max_length=512, return_tensors='pt')
# Small two-paragraph probe batch, tokenized with the same settings.
ctx_input = tokenizer(contexts[0:2], padding=True, truncation=True, max_length=512, return_tensors='pt')


In [53]:
# Show the (batch, sequence_length) size of the tokenized context ids.
ctx_input["input_ids"].shape

torch.Size([1, 660])

In [59]:
# Encode every context paragraph, one at a time, and keep the hidden state at
# sequence position 0 of the last layer as that paragraph's embedding.
#
# This is inference only, so it runs under torch.no_grad(): without it the
# autograd graph of every forward pass stays alive through ctx_emb and memory
# grows with the corpus size.
# The zero-row seed tensor keeps the result shaped (0, 768) for an empty corpus.
ctx_chunks = [torch.empty((0, 768))]
with torch.no_grad():
    for i in tqdm.tqdm(range(len(contexts))):
        ctx_input = tokenizer(contexts[i:i+1], padding=True, truncation=True, return_tensors='pt', max_length=512)
        temp_emb = context_encoder(**ctx_input).last_hidden_state[:, 0, :]
        ctx_chunks.append(temp_emb)
# Concatenate once at the end instead of re-copying the growing tensor on
# every iteration.
ctx_emb = torch.cat(ctx_chunks, dim=0)


  0%|                                                                                       | 0/488 [00:00<?, ?it/s][A
  0%|▎                                                                              | 2/488 [00:00<01:16,  6.39it/s][A
  1%|▋                                                                              | 4/488 [00:00<01:02,  7.77it/s][A
  1%|▊                                                                              | 5/488 [00:00<01:10,  6.86it/s][A
  1%|█▏                                                                             | 7/488 [00:01<01:40,  4.79it/s][A
  2%|█▍                                                                             | 9/488 [00:01<01:14,  6.41it/s][A
  2%|█▊                                                                            | 11/488 [00:01<01:16,  6.25it/s][A
  2%|█▉                                                                            | 12/488 [00:01<01:15,  6.32it/s][A
  3%|██                                

In [60]:
# Embed the tokenized questions: take the last-layer hidden state at sequence
# position 0 for each question. Inference only, so gradient tracking is
# disabled to avoid keeping the autograd graph in memory.
with torch.no_grad():
    query_emb = query_encoder(**query_input).last_hidden_state[:, 0, :]

In [61]:
# Sanity check: one row per context paragraph, one column per embedding dimension.
ctx_emb.size()

torch.Size([488, 768])

In [4]:
# Location of the cached context-embedding tensor.
# NOTE: this rebinds `save_path`, which earlier in the notebook held the data
# directory; re-running an earlier cell that reads from `save_path` after this
# one will see the tensor file path instead.
save_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data\tensors\sherpa_paras.pt"
# Write the cache only when it does not exist yet, so a top-to-bottom run
# produces the file that the next cell loads, and an existing cache is never
# overwritten. The tensor is detached so only its values are stored.
if not os.path.exists(save_path):
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    torch.save(ctx_emb.detach(), save_path)

In [6]:
# Restore the cached context embeddings from disk and show their size.
# NOTE(review): torch.load unpickles the file; only load caches you created yourself.
ctx_emb = torch.load(f=save_path)
ctx_emb.size()

torch.Size([488, 768])

In [7]:
# Create the flat index, sized by the number of columns of the embedding matrix.
embedding_dim = ctx_emb.size(1)
index = FlatIdx(embedding_dim)

In [8]:
# Convert the embeddings to a NumPy array (detached from autograd) and add
# them to the index.
ctx_vectors = ctx_emb.detach().numpy()
index.add_idx(ctx_vectors)

In [15]:
# Run retrieval for all test questions against the indexed paragraphs.
# Arguments are passed positionally, in the same order as before.
retrieved_items = index.faiss_dragon_inference(
    query_encoder,
    tokenizer,
    all_data_sherpa,
    test_data,
)

In [17]:
# Inspect the first entry of the retrieval results (presumably the passages
# returned for the first test question — confirm against faiss_dragon_inference).
retrieved_items[0]

['The following table provides a summary of the Oregon Sections membership, including grade and section dues.',
 'The 6 month time limit is particularly short when compared with similar jurisdictions that deal with discrimination or employment law issues.',
 ' Net sales were $112.5 million, an increase of $11.4 million, or 11%, from the first quarter of 2014 and an increase of $1.3 million, or 1%, from the second quarter of 2013.',
 'This year we had 52 students from six universities attending the event.\nParticipating schools were:\n Portland State University\n University of Idaho\n University of Portland\n Oregon State University\n University of Washington\n Oregon Institute of Technology']

In [18]:
# Score the retrieved passages against the ground-truth answers with k=4.
# recall_k returns three values, unpacked here as the recall score, the
# incorrect cases and the correct cases.
metric = Eval(k=4)
recall_outcome = metric.recall_k(test_labels, retrieved_items)
recall, incorrect, correct = recall_outcome

In [19]:
# Display the recall value returned by metric.recall_k.
# NOTE(review): the recorded output is about 0.07, which looks very low for a
# dense retriever — worth confirming the embedding and matching logic.
recall

0.07142857142857142

In [29]:
# Keys of the second recorded failure case, materialized as a list for display.
[*incorrect[1].keys()]

['University of Portland took home the grand prize of bragging rights, a trophy and a $400 scholarship \naward. University of Washington and Oregon Institute of Technology both received $300 for tying \nfor second place.  The remaining three participating schools received a $150 participation award.  All \nof the student attendees received a free dinner at this event. In order to offset the cost of the student \nmeals and scholarships, several local companies donated funds to support student attendance. \nDonations were received from: Advanced Traffic Products, CH2M Hill, City of Gresham, City of \nPortland, Coral Sales Company, David Evans and Associates, DKS Associates, Group Mackenzie, \nHDR, IBI Group, JRH Engineering, Kittelson and Associates, Lancaster Engineering, NWS Traffic \nEngineering, Parametrix, Quality Counts, TrafStats, URS Corp, W&H Pacific, and Western Systems.  \nThe Oregon Section has already secured the room for Traffic Bowl 2008 (November, 2008) at \nMcMenamin’s E

In [30]:
# Values of the second recorded failure case, materialized as a list for display.
[*incorrect[1].values()]

[[' Net sales were $112.5 million, an increase of $11.4 million, or 11%, from the first quarter of 2014 and an increase of $1.3 million, or 1%, from the second quarter of 2013.',
  'The 6 month time limit is particularly short when compared with similar jurisdictions that deal with discrimination or employment law issues.',
  'Net sales for the second quarter were $112.5 million, an 11% increase compared with $101.1 million in the first quarter of 2014 and a 1% increase compared with $111.2 million in the second quarter of 2013.\nNet income in the second quarter was $2.0 million, or $0.10 per diluted share, compared with $3.9 million, or $0.19 per diluted share, in the first quarter of 2014 and compared with $4.6 million, or $0.22 per diluted share, in the second quarter of 2013.',
  ' Net income was $2.0 million, a decrease from $3.9 million in the first quarter of 2014 and a decrease from $4.6 million in the second quarter of 2013.\nDiluted earnings per share were $0.10, a decrease f