In [78]:
import pandas as pd
import json 
from sentence_transformers import SentenceTransformer
import glob
import faiss
from fuzzywuzzy import fuzz



In [2]:
# Dependencies — uncomment to install into the kernel's environment.
# %pip install pandas
# %pip install sentence-transformers
# %pip install openpyxl
# %pip install fuzzywuzzy
# %pip install faiss-cpu

## Load model

In [4]:
# Sentence-embedding model. The same instance encodes both the corpus lines
# and the queries, so their vectors live in the same space.
# NOTE(review): the sentence-transformers docs appear to list this checkpoint
# as deprecated — confirm before swapping it, because the embedding width
# determines the FAISS index dimension.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [5]:
def split_data(data):
    """Return the individual lines of a parsed document.

    :param data: Mapping that holds the document text under the ``"raw_text"`` key.
    :return: List of the pieces of that text separated by newline characters
        (empty pieces are kept).
    """
    raw_text = data["raw_text"]
    lines = raw_text.split("\n")
    return lines

In [6]:
def read_json_from_folders(directory_path):
    """
    Reads every JSON file located one folder below ``directory_path`` and
    flattens the documents into lines of text.

    Each file is parsed, handed to ``split_data``, and the resulting lines are
    appended to one flat list. A parallel list records, for every line, the
    path of the file it came from.

    :param directory_path: Path to the directory containing folders of JSON files.
    :return: A tuple ``(all_data, filenames)`` of two equal-length lists, where
        ``filenames[i]`` is the path of the file that produced ``all_data[i]``.
    """
    all_data = []
    filenames = []
    search_pattern = f"{directory_path}/*/*.json"
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # line order (and any positional index built on top of it) reproducible.
    for file_path in sorted(glob.glob(search_pattern)):
        # Decode explicitly as UTF-8 rather than the platform's locale
        # encoding, which is not UTF-8 on a default Windows setup.
        with open(file_path, 'r', encoding='utf-8') as file:
            document = json.load(file)
        lines = split_data(document)
        # One filename entry per line keeps the two lists index-aligned.
        filenames.extend([file_path] * len(lines))
        all_data.extend(lines)
    return all_data, filenames

In [7]:
# Root folder of the corpus: read_json_from_folders looks for *.json files one
# folder level below it.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
directory_path = r'C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\raw_text'
# all_data: flat list of text lines; filenames: source file path for each line.
all_data, filenames = read_json_from_folders(directory_path)

In [8]:
# Name of a single document folder and the path of its raw_text.json.
# The path is derived from `directory_path` instead of repeating the literal,
# so the two cannot drift apart; the resulting string is unchanged.
doc_name = "DR--110685614"
subset_str = directory_path + "\\" + doc_name + "\\raw_text.json"

In [9]:
# Encode every corpus line into an embedding vector; the rows are later looked
# up by position, so row i is expected to correspond to all_data[i].
sentence_embeddings = model.encode(all_data)

In [10]:
# Shape of the embedding matrix; its second axis is the dimension the index is built with.
sentence_embeddings.shape

(428, 768)

# IndexFlatL2

In [11]:
# Vector dimensionality, taken from the second axis of the embedding matrix.
d = sentence_embeddings.shape[1]
# FAISS flat index that compares d-dimensional vectors by L2 distance.
index = faiss.IndexFlatL2(d)
# Displayed as a check that the index is ready to accept vectors.
index.is_trained

In [12]:
# Add all line embeddings. Search results are mapped back to text by position,
# so vector i in the index stands for all_data[i].
index.add(sentence_embeddings)
# Number of vectors now stored in the index.
index.ntotal

428

# Ground Truth

In [41]:
# Spreadsheet holding, per document, the questions and their reference answers.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
ground_truth_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\document_questions.xlsx"

In [42]:
# Load the question/answer sheet into a DataFrame.
ground_truth = pd.read_excel(ground_truth_path)

In [43]:
# Preview the loaded sheet (columns: document, relevant questions, answer, complexity).
ground_truth

Unnamed: 0,document,relevant questions,answer,complexity
0,DR--185549702_INTRO,"What is meant by ""computational finance""?",,no raw_text
1,DR--185549702_INTRO,What is meant by 'investor heterogeneity'?,,no raw_text
2,DR--185549702_INTRO,What was the revolution witnessed by finance i...,,no raw_text
3,DR--185549702_INTRO,Why do you think financial markets are viewed ...,,no raw_text
4,DR--185549702_INTRO,Why are the financial markets appealing applic...,,no raw_text
5,DR--185549702_INTRO,What are the different design questions in con...,,no raw_text
6,DR--185549702_INTRO,Discuss Lettau’s framework using various mathe...,,no raw_text
7,DR--185549702_INTRO,The exchange rate is given by what formula?,,no raw_text
8,DR--185549702_INTRO,Discuss the criticisms of agent-based markets.,,no raw_text
9,DR--185549702_INTRO,How is timing an ignored final problem?,,no raw_text


In [44]:
# Keep only the rows whose "complexity" is "text"; .copy() gives a frame that
# is independent of ground_truth.
ground_truth_text = ground_truth.loc[ground_truth["complexity"] == "text"].copy()

In [45]:
# Preview the "text"-complexity subset used for evaluation.
ground_truth_text

Unnamed: 0,document,relevant questions,answer,complexity
19,DR--182866691_INTRO,What is the Oregon Section's annual Traffic Bo...,The Student Liaison Committee had another succ...,text
20,DR--182866691_INTRO,The participating schools of the Traffic Bowl ...,This year we had 52 students from six universi...,text
21,DR--182866691_INTRO,How do you think the donations from the local ...,University of Portland took home the grand pri...,text
22,DR--182866691_INTRO,Discuss the Oregon Section's website.,This year the Oregon Section retained the doma...,text
23,DR--182866691_INTRO,What do you know about the 18th annual ITE Gol...,The Oregon Section hosted its 18th annual ITE ...,text
24,DR--182866691_INTRO,What are the standard rules of a golf game?,,text
25,DR--182866691_INTRO,How do you think the token gifts motivate the ...,,text
26,DR--182866691_INTRO,How was the District 6 meeting a huge success?,The District 6 meeting was a huge success in t...,text
27,DR--182866691_INTRO,Which award was granted over the past year? To...,Award Name: Traffic Bowl - First Place Award R...,text
28,DR--14627260_INTRO,How do you think sexual harassment as an issue...,,text


# Inference

In [61]:
def test_case():
    query = ["What legislative suggestions pertaining to workplace sexual harassment does the firm provide in its submission?", 
                  "What legislative suggestions pertaining to workplace sexual harassment does the firm provide in its submission?"]
    return faiss_inference(query)[0] == faiss_inference(query)[1]

In [55]:
def faiss_inference(query, k=4):
    """Retrieve the corpus lines nearest to one or more queries.

    :param query: A single query string, or a list of query strings.
    :param k: Number of neighbours requested from the index per query.
    :return: For a single string, a list of retrieved lines. For a list of
        queries, a list of such lists in the same order as the queries. A
        result list is shorter than ``k`` when the index returns fewer matches.
    """
    is_batch = isinstance(query, list)
    queries = query if is_batch else [query]
    if not queries:
        # Empty batch: nothing to encode or search.
        return []
    # The self-check `test_case()` is deliberately not asserted here: it calls
    # this function with a list, so asserting it from the list path recursed
    # without end. Run it as a separate step instead.
    xq = model.encode(queries)
    _distances, neighbor_ids = index.search(xq, k)
    # FAISS pads a row with -1 when it finds fewer than k neighbours; skip
    # those ids so they are not misread as all_data[-1] (the last line).
    retrieved_items = [
        [all_data[i] for i in row if i >= 0]
        for row in neighbor_ids
    ]
    return retrieved_items if is_batch else retrieved_items[0]

In [66]:
# Questions of the "text" subset, as a plain Python list in row order.
test_data = [question for question in ground_truth_text["relevant questions"]]

In [67]:
# Retrieve the nearest corpus lines for every question: one list per question,
# using faiss_inference's default k.
ret_context = faiss_inference(test_data)

# Testing 

In [150]:
def recall_k(test_labels, ret_context, k=None):
    """Fraction of queries whose reference answer shows up in the top-``k`` retrieved items.

    A query is a hit when any of its first ``k`` retrieved items has a
    ``fuzz.ratio`` of at least 95 against the reference answer.

    :param test_labels: Reference answers, one per query. Float entries (e.g.
        the NaN left by an empty spreadsheet cell) are treated as "no answer"
        and can never be hits.
    :param ret_context: Retrieved items, one list per query.
    :param k: How many of the top items to inspect per query. ``None``
        (default) inspects everything that was retrieved for that query.
        The default used to be ``len(ret_context[0])`` read from a global when
        the function was defined, which broke on a fresh kernel and ignored
        the argument actually passed in.
    :return: ``hits / len(ret_context)``, or ``0.0`` when ``ret_context`` is
        empty. Queries without a reference answer stay in the denominator.
    """
    if not ret_context:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    ctr = 0
    for label, candidates in zip(test_labels, ret_context):
        if isinstance(label, float):
            continue
        # Bound the scan by this query's own candidates. The previous bound,
        # len(ret_context), was the number of queries, which truncated the
        # scan for small query sets and could overrun it for large ones.
        top = candidates if k is None else candidates[:max(k, 0)]
        if any(fuzz.ratio(label, candidate) >= 95 for candidate in top):
            # print ("test_labels:", label, "\n", "ret_context:", candidate)
            ctr += 1
    return ctr / len(ret_context)

In [151]:
# Reference answers aligned row-for-row with test_data (both come from
# ground_truth_text). Rows without an answer are floats (NaN), which recall_k
# skips.
# NOTE(review): test_labels was not defined in any visible cell, so this cell
# raised NameError on a fresh kernel; it is rebuilt here from the "answer"
# column — confirm that this is the intended label source.
test_labels = list(ground_truth_text["answer"])
recall_k(test_labels, ret_context)

0.35714285714285715