In [1]:
import pandas as pd
import json 
import glob
import faiss
from fuzzywuzzy import fuzz
from llmsherpa.readers import LayoutPDFReader
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
import faiss
# from transformers import AutoTokenizer, AutoModel



In [2]:
from src.rechunker import Rechunker
from src.encoder.sentence_transformer import Encoder
from src.faiss.flat_idx import FlatIdx
from utils.utils import flatten_list, write_list_to_file, read_list_from_file
from src.eval import Eval


In [3]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\J C
[nltk_data]     SINGLA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# !pip install nltk

## Data

In [5]:
save_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\data"

In [91]:
all_data_sherpa = read_list_from_file(save_path, "sherpa_paras")
filenames_sherpa = read_list_from_file(save_path, "sherpa_paras_filenames")
assert (len(all_data_sherpa)==len(filenames_sherpa))

In [92]:
ground_truth_path = r"C:\Users\J C SINGLA\Downloads\External - take_home_challenge_(withJSONs)\take_home_challenge_(withJSONs)\document_questions.xlsx"
ground_truth = pd.read_excel(ground_truth_path)
ground_truth_text = ground_truth[ground_truth["complexity"]=="text"].copy()
test_data = list(ground_truth_text["relevant questions"])
test_labels = list(ground_truth_text["answer"])

## Vectorizer

In [93]:
def preprocess_text(text):
    translator = str.maketrans('', '', string.punctuation)
    text_no_punctuation = text.translate(translator)
    tokens = nltk.word_tokenize(text_no_punctuation)
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word not in stop_words]
    stemmer = PorterStemmer()
    # tokens = [stemmer.stem(word) for word in tokens]
    lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

In [94]:
def tf_encoder(data, clean = False):
    vectorizer = TfidfVectorizer()
    if clean:
        data = [preprocess_text(x) for x in data]
    tfidf_vectors = vectorizer.fit_transform(data)
    dense_vectors = tfidf_vectors.toarray()
    return dense_vectors, vectorizer, data

In [95]:
dense_vectors, vectorizer, data = tf_encoder(all_data_sherpa, True)

In [96]:
feature_names = vectorizer.get_feature_names_out()
len(feature_names)

4616

## FAISS

In [97]:
k = 15
index = FlatIdx(d=len(feature_names))
index.add_idx(dense_vectors)

In [98]:
test_data = [preprocess_text(x) for x in test_data]
retrieved_items, D = index.faiss_tfidf_inference(vectorizer, all_data_sherpa, test_data, k=k)

In [99]:
test_labels

["The Student Liaison Committee had another successful year in attracting schools from the Pacific Northwest to compete in the Oregon Section's annual Traffic Bowl Competition. The Traffic Bowl is a Jeopardy-based trivia contest where students must answer questions on a variety of traffic and transportation trivia. This year the competition was held on November 15, 2007 at McMenamin's Edgefield just east of Portland, Oregon. ",
 'This year we had 52 students from six universities attending the event. Participating schools were: \nPortland State University \nUniversity of Idaho \nUniversity of Portland \nOregon State University \nUniversity of Washington \nOregon Institute of Technology ',
 'University of Portland took home the grand prize of bragging rights, a trophy and a $400 scholarship \naward. University of Washington and Oregon Institute of Technology both received $300 for tying \nfor second place.  The remaining three participating schools received a $150 participation award.  

In [100]:
# what does test_labels and retrieved items -> what is my retrieved items when their min(D) is less than 1.5917805 and greater than 1.5917805
th = 1.5917805
bad_retrieved_items = [(v,l) for (v,l,m) in zip(test_labels, retrieved_items, D) if min(m[0])>=th]
bad_retrieved_labels = [i[0] for i in bad_retrieved_items]
bad_retrieved_items = [i[1] for i in bad_retrieved_items]


In [101]:
# what does test_labels and retrieved items -> what is my retrieved items when their min(D) is less than 1.5917805 and greater than 1.5917805
th = 1.5917805
good_retrieved_items = [(v,l) for (v,l,m) in zip(test_labels, retrieved_items, D) if min(m[0])<th]
good_retrieved_labels = [i[0] for i in good_retrieved_items]
good_retrieved_items = [i[1] for i in good_retrieved_items]


In [102]:
bad_retrieved_items = [["None"]*k for i in range(len(bad_retrieved_items))]

In [103]:
retrieved_items_ = good_retrieved_items + bad_retrieved_items
test_labels_ = good_retrieved_labels + bad_retrieved_labels

In [116]:
test_labels_[-2]

nan

In [117]:
retrieved_items_[-2]

['None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None',
 'None']

In [118]:
metric = Eval(k=k)
clean = True
if clean:
    retrieved_items_clean = []
    for i in retrieved_items_:
        temp = []
        for x in i:
            temp.append(preprocess_text(x))
        retrieved_items_clean.append(temp)
        
    test_labels_clean = [x if isinstance(x, str) else "None" for x in test_labels_]
    test_labels_clean = [preprocess_text(x) for x in test_labels_clean]
    recall, incorrect, correct = metric.recall_k(test_labels_clean, retrieved_items_clean)
else:
    recall, incorrect, correct = metric.recall_k(test_labels_, retrieved_items_)

In [119]:
recall

0.7857142857142857

In [120]:
len(incorrect)

3

In [126]:
list(incorrect[2].keys())

['we believe current legislated timeframe making complaint ahrc unworkable step legislative provisions it amended 6 years line employment law jurisdictions we believe australian human rights commission act 1986 ahrc act regulations amended expressly prescribe time frames scheduling mediation conferences we long argued legislative framework needed incorporates \uf0b7 regulation criminal sanctions holding behaviours abusers employers carriage services account \uf0b7 a civil regime victims survivors online abuse access legal tools allow seek relief damages maurice blackburn believes australia needs civil criminal legislative framework could ensure \uf0b7 that breaches investigated statutory body established act failing courts \uf0b7 that statutory body order offending materials removed online platform require correction andor apology \uf0b7 that frameworks allows release identity anonymous abusers \uf0b7 that online sexual harassment criminalised abuser intends digital communication cause

In [127]:
list(incorrect[2].values())[0][0]

'none'