# ⚙️ Setup

In [1]:
import pandas as pd
from retriever import Retriever

  from .autonotebook import tqdm as notebook_tqdm


# 🗃️ Modules

## Query

In [2]:
def extract_queries() -> pd.DataFrame:
    """Read the ``queries`` sheet of the published Google Sheets workbook.

    The workbook is fetched from its public xlsx export URL on every call.

    Returns:
        The sheet's contents as a DataFrame, without any filtering.
    """
    workbook_url: str = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vR1hUlRhTJQgNzSbTyRtDNh1mCrbfy0iUm6oiHK7oHb_iQQ5t7XCB_xyUCwoZ2fdg/pub?output=xlsx'
    return pd.read_excel(workbook_url, sheet_name='queries')

In [3]:
def remove_null_entries(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``intent`` value is not null.

    Args:
        df: Frame containing an ``intent`` column. It is not modified.

    Returns:
        A filtered frame that keeps the original index labels and columns.
    """
    snapshot = df.copy()
    has_intent = snapshot['intent'].notna()
    return snapshot[has_intent]

def get_only_target_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Project ``df`` onto the ``query_id`` and ``query`` columns, in that order.

    Args:
        df: Frame containing at least ``query_id`` and ``query``. It is not
            modified.

    Returns:
        A new frame holding only those two columns.
    """
    wanted = ['query_id', 'query']
    snapshot = df.copy()
    return snapshot[wanted]

def transform_queries(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw query sheet.

    Works on a copy of ``df``: first drops rows without an ``intent``, then
    keeps only the ``query_id`` and ``query`` columns.

    Args:
        df: Raw query frame. It is not modified.

    Returns:
        The cleaned frame.
    """
    return (
        df.copy()
        .pipe(remove_null_entries)
        .pipe(get_only_target_columns)
    )
    

In [4]:
def load_queries() -> pd.DataFrame:
    """Fetch the query sheet and return it in cleaned form.

    Returns:
        The result of ``transform_queries`` applied to ``extract_queries()``.
    """
    raw_queries = extract_queries()
    return transform_queries(raw_queries)

# Generate Annotation Pools

In [5]:
# Download the query sheet and apply the row filter and column selection defined above.
queries = load_queries()

In [6]:
# Preview the first five cleaned queries.
queries.head(5)

Unnamed: 0,query_id,query
0,1,Saan po pwede mag apply ng Japan Visa bukod sa...
1,2,Gaano katagal ang processing ng Japan Visa sa ...
2,3,Magkano po ang Japan Visa sa attic tours?
3,4,Ano po dapat ang size ng picture sa applicatio...
4,5,Anong date po dapat sa application form?


In [7]:
# Instantiate the project-local retriever and run its document pre-computation step.
# NOTE(review): the captured output ("All collections has been removed!" /
# "New Collection created!") suggests this call drops and rebuilds the
# collection on every run — confirm before re-running this cell.
retriever = Retriever()
retriever.pre_compute_docs()

All collections has been removed!
New Collection created!
📄 Processing: MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf


  return forward_call(*args, **kwargs)


Inserted object chunk_id=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf_chunk_0 title=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
Inserted object chunk_id=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf_chunk_1 title=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
Inserted object chunk_id=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf_chunk_2 title=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
Inserted object chunk_id=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf_chunk_3 title=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
Inserted object chunk_id=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR.pdf_chunk_4 title=MULTIPLE-ENTRY VISA FOR TEMPORARY VISITOR
📄 Processing: FREQUENTLY ASKED QUESTIONS.pdf
Inserted object chunk_id=FREQUENTLY ASKED QUESTIONS.pdf_chunk_0 title=Frequently Asked Questions (Visa
Inserted object chunk_id=FREQUENTLY ASKED QUESTIONS.pdf_chunk_1 title=Frequently Asked Questions (Visa
Inserted object chunk_id=FREQUENTLY ASKED QUESTIONS.pdf_chunk_2 title=Frequently Asked Questions (Visa
Inserted ob

In [8]:
# Take the text of the first query as a sample. The column is selected by
# name rather than by position so the cell keeps returning the query text
# even if the column order of `queries` changes.
query = queries['query'].iloc[0]

In [9]:
# Ask the retriever for the documents it returns for the sample query.
retrieved_docs = retriever.retrieve_relevant_docs(query=query)

In [10]:
# Inspect the first hit's properties to see which fields a retrieved chunk carries.
retrieved_docs[0].properties

{'content': 'ATTIC TOURS\nSpecialized in Japan Visa Application Services',
 'chunk_id': 'ATTIC TOURS.pdf_chunk_0',
 'title': 'ATTIC TOURS',
 'file_name': 'ATTIC TOURS.pdf'}

In [17]:
# Build the annotation pool: one row per (query, retrieved document) pair.
#
# Rows are collected as records (dicts) and converted to a DataFrame once.
# A document that lacks a property other documents have then yields a NaN
# cell, instead of unequal-length columns that make pd.DataFrame raise only
# after all retrieval work has been done.
annotation_records = []

# Columns are selected by name so the unpacking does not rely on column order.
# The loop variables have their own names so the sample `query` and
# `retrieved_docs` from the earlier cells are not overwritten.
for query_id, query_text in queries[['query_id', 'query']].itertuples(index=False, name=None):
    for doc in retriever.retrieve_relevant_docs(query=query_text):
        # `query_id` comes first, followed by the document's own properties.
        annotation_records.append({'query_id': query_id, **doc.properties})

# If nothing was retrieved at all, still expose an (empty) `query_id` column.
annotation_pools = (
    pd.DataFrame(annotation_records)
    if annotation_records
    else pd.DataFrame({'query_id': []})
)
        

In [18]:
# Preview the first five (query, retrieved chunk) rows of the pool.
annotation_pools.head(5)

Unnamed: 0,query_id,content,file_name,title,chunk_id
0,1,ATTIC TOURS\nSpecialized in Japan Visa Applica...,ATTIC TOURS.pdf,ATTIC TOURS,ATTIC TOURS.pdf_chunk_0
1,1,B. REQUIREMENTS（Details → https://www.ph.emb-j...,JAPAN VISA GENERAL INFO.pdf,JAPAN VISA – GENERAL INFORMATION,JAPAN VISA GENERAL INFO.pdf_chunk_2
2,1,【In case that applicant is planning to work fo...,NIKKEI-JIN (JAPANESE DESCENDANT).pdf,NIKKEI-JIN (JAPANESE DESCENDANT),NIKKEI-JIN (JAPANESE DESCENDANT).pdf_chunk_6
3,2,ATTIC TOURS\nSpecialized in Japan Visa Applica...,ATTIC TOURS.pdf,ATTIC TOURS,ATTIC TOURS.pdf_chunk_0
4,2,D. Important Notes\n•\nNo appointment is neede...,ATTIC TOURS.pdf,ATTIC TOURS,ATTIC TOURS.pdf_chunk_4


In [19]:
# Write the pool to a CSV file in the current working directory.
# Presumably this file is what gets labelled by hand (an `is_relevant` column
# is read back from the `annotation_pools` sheet later) — verify.
# NOTE(review): the default `index=True` also writes the row index as an
# unnamed first column — confirm that is wanted.
annotation_pools.to_csv('annotation_pools.csv')

In [20]:
def extract_annotation_pools() -> pd.DataFrame:
    """Read the ``annotation_pools`` sheet of the published Google Sheets workbook.

    The workbook is fetched from its public xlsx export URL on every call.

    Returns:
        The sheet's contents as a DataFrame, without any filtering.
    """
    workbook_url: str = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vR1hUlRhTJQgNzSbTyRtDNh1mCrbfy0iUm6oiHK7oHb_iQQ5t7XCB_xyUCwoZ2fdg/pub?output=xlsx'
    return pd.read_excel(workbook_url, sheet_name='annotation_pools')

In [22]:
def transform_annotation_pools(df: pd.DataFrame) -> pd.DataFrame:
    """Project ``df`` onto the ``query_id`` and ``is_relevant`` columns, in that order.

    Args:
        df: Frame containing at least ``query_id`` and ``is_relevant``. It is
            not modified.

    Returns:
        A new frame holding only those two columns.
    """
    wanted = ['query_id', 'is_relevant']
    snapshot = df.copy()
    return snapshot[wanted]

In [23]:
def load_annotation_pools() -> pd.DataFrame:
    """Fetch the annotation-pool sheet and return it reduced to its target columns.

    Returns:
        The result of ``transform_annotation_pools`` applied to
        ``extract_annotation_pools()``.
    """
    raw_pools = extract_annotation_pools()
    return transform_annotation_pools(raw_pools)

In [25]:
# NOTE: this rebinds `annotation_pools`, replacing the retrieval results built
# earlier with the sheet read back from the workbook. Re-running the CSV
# export cell after this one would write this two-column frame instead.
annotation_pools = load_annotation_pools()

In [27]:
# Preview the first five rows of the loaded sheet.
annotation_pools.head(5)

Unnamed: 0,query_id,is_relevant
0,1,1
1,1,1
2,1,0
3,2,0
4,2,0


In [28]:
# Sum `is_relevant` per query. With 0/1 labels (as in the preview output above)
# this is the number of pooled documents marked relevant for each query.
annotation_pools.groupby('query_id').sum()

Unnamed: 0_level_0,is_relevant
query_id,Unnamed: 1_level_1
1,2
2,0
3,0
4,1
5,0
6,0
7,1
8,0
9,0
10,1
