# DISCLAIMER:
This is a lighter version of the project. It is built only to easily visualize data step by step and perform tests "on the fly".

NOTE: this notebook may not be aligned with the latest version of `main.py`; refer to that file to be sure you run the most up-to-date version of the code.

## Imports

In [None]:
# !pip install -r requirements.txt

In [None]:
from itertools import combinations
from parsers import TipsterParser, QueryParser
from tuners import NMFTuner

import argparse
import bert_helpers
import es_helpers
import octis_helpers
import operators
import random
import utils
import genai_helpers

## Args setup

In [None]:
def get_args():
    """Return the fixed run configuration used by this notebook.

    Every option is hard-coded; nothing is read from the command line.
    The values are wrapped in an ``argparse.Namespace`` so they can be
    accessed as attributes (``args.run_index``, ``args.topic_model``, ...).

    Returns:
        argparse.Namespace: a fresh namespace holding the settings below.
    """
    settings = {
        # Boolean switches.
        'index': False,
        'delete_index': False,
        'evaluate': False,
        'run_index': 'tipster_45_kstem',
        'tune': False,
        'simulate': False,
        'verbose': True,
        # Strategy selectors, compared against string literals in later cells.
        'feedback_method': 'Pseudo Feedback',
        'evaluation_type': 'Residual Ranking',
        'vocab_source': 'Terms window',
        'topic_model': 'NMF',
        'embedding_type': 'embedding_full',
        'topics_type': 'join',
    }
    return argparse.Namespace(**settings)

# Notebook-wide settings; the cells below read their options from this object.
args = get_args()

## ES connection and Index choice

In [None]:
# utils.setup() returns a pair; only its first element is kept (as `es`) and is
# handed to the es_helpers calls in the following cells. The second is discarded.
es, _ = utils.setup()
# Name of the index every search in this notebook runs against.
index = args.run_index

## Parsing queries and random selection

In [None]:
# Parse the topics file; the result is used as a mapping whose values are the queries.
query_parser = QueryParser('storage/queries/robust04.topics')
queries = query_parser.parse_queries()
# Pick a single query at random. No seed is set in the visible cells, so the
# selected query may differ from run to run.
query = random.choice(list(queries.values()))

print(f'Query: {query["title"]}')

## Run search

In [None]:
# Feedback and evaluation strategies come from the notebook-wide args.
feedback_type, evaluation_type = args.feedback_method, args.evaluation_type

# First-pass retrieval for the selected query.
res = es_helpers.search(es, index, query)

if feedback_type != 'Pseudo Feedback':
    # Any other method goes through utils.ask_oracle; when the method name
    # contains "Oracle", its answer is then resolved via a search by id.
    oracle_res = utils.ask_oracle(res, query, feedback_type)
    if "Oracle" in feedback_type:
        oracle_res = es_helpers.search_by_id(es, index, oracle_res)
else:
    # Pseudo feedback: run the same search again with the extra argument 75
    # (presumably the number of hits to return — confirm in es_helpers.search).
    oracle_res = es_helpers.search(es, index, query, 75)

## First docs ranking

In [None]:
# Pair each hit's id with its score, in the order the search response lists them.
ranked_docs = [(hit['_id'], hit['_score']) for hit in res['hits']['hits']]
utils.print_rank(ranked_docs)

## Oracle and Dictionary Creation

In [None]:
# Map each feedback document id to its text, following the order of the hits.
oracle_docs = {
    doc['_id']: doc['_source']['text']
    for doc in oracle_res['hits']['hits']
}
oracle_texts = list(oracle_docs.values())
oracle_ids = list(oracle_docs)

selected_vocab = args.vocab_source

# 'Terms window' builds the vocabulary from the feedback texts; any other value
# falls back to the significant-words helper, which works from the document ids.
vocabulary = (
    es_helpers.get_terms_window(es, index, query, oracle_texts)
    if selected_vocab == 'Terms window'
    else es_helpers.get_significant_words(es, index, query, oracle_ids)
)

## Topic modeling

In [None]:
# Build the dataset for topic modeling from the feedback texts and the vocabulary.
dataset = octis_helpers.create_dataset(oracle_texts, vocabulary)
# `topics` is passed only to the NMF run; `topwords` is passed to the run,
# evaluate and display helpers below and is reused by the Meet & Join cell.
topics = 6
topwords = 6

# NMF
print('\n###### NMF MODEL ######\n')
nmf_output, nmf_id2word = octis_helpers.run_nmf_model(dataset, topics, topwords)
octis_helpers.evaluate_model(nmf_output, dataset, topwords)
octis_helpers.display_topics(nmf_output, nmf_id2word, topwords)

# BERT
print('\n###### BERT MODEL ######\n')
# NOTE(review): unlike the NMF run, this call receives neither `dataset` nor the
# feedback texts nor `topics` — presumably the helper loads its own input;
# confirm against bert_helpers.run_bertopic_model.
bert_output, bert_id2word = bert_helpers.run_bertopic_model(topwords)
bert_helpers.evaluate_model(bert_output, dataset, topwords)
bert_helpers.display_topics(bert_output, True)

## Meet & Join

In [None]:
# Use the model named in args; utils.select_model() is consulted only when
# that value is falsy.
topic_model = args.topic_model or utils.select_model()
if topic_model == 'NMF':
    topic_vectors = octis_helpers.get_topic_vectors(nmf_output)
    id2word = nmf_id2word
else:
    topic_vectors = bert_helpers.get_topic_vectors(bert_output)
    id2word = bert_id2word

# JOIN
# operators.join is applied to every unordered pair of topic vectors and every
# vector it yields is collected into one flat list.
join_topic_vectors = [
    joined
    for pair in combinations(topic_vectors, 2)
    for joined in operators.join(*pair)
]
join_topics = [
    utils.topic_from_vector(id2word, joined, topwords)
    for joined in join_topic_vectors
]
print('JOIN Topics:\n', join_topics)

# MEET
# operators.meet takes four vectors at once, so it is applied to every
# unordered group of four topic vectors.
meet_topic_vectors = [
    operators.meet(*quad)
    for quad in combinations(topic_vectors, 4)
]
meet_topics = [
    utils.topic_from_vector(id2word, met, topwords)
    for met in meet_topic_vectors
]
print('MEET Topics:\n', meet_topics)


## Reranking

In [None]:
embedding_type = args.embedding_type
# For every first-pass hit: the `_source` field named by `embedding_type`, in hit order.
documents_embeddings = [
    doc["_source"][embedding_type]
    for doc in res['hits']['hits']
]
# Text of every first-pass hit, keyed by document id.
documents = {
    doc['_id']: doc["_source"]["text"]
    for doc in res['hits']['hits']
}

# Use the topics type named in args; the interactive selector is consulted only
# when that value is falsy. Anything other than 'join' selects the MEET topics.
topics_type = args.topics_type or utils.select_topics_for_reranking()
if topics_type == 'join':
    reranking_topics = join_topics
else:
    reranking_topics = meet_topics

reranked_docs = utils.rerank_documents(
    evaluation_type,
    ranked_docs,
    oracle_docs,
    documents,
    documents_embeddings,
    query,
    reranking_topics,
)
print('\nRERANKED DOCUMENTS:')
utils.print_rank(reranked_docs, ranked_docs)


## Relevance Feedback from Gemini

In [None]:
import time
import pymongo_helpers

# Upper bound on how many of the top reranked documents are sent to Gemini.
MAX_JUDGED_DOCS = 15
# Pause after each Gemini request, in seconds (presumably to stay under the
# API rate limit — confirm against the quota in use).
GEMINI_DELAY_SECONDS = 10

# The MongoDB client has its own name so the Gemini client below does not
# overwrite it.
mongo_client = pymongo_helpers.client()
rel_collection = pymongo_helpers.get_collection("query_doc_rel", "relevance_feedback", mongo_client)
docs_collection = pymongo_helpers.get_collection("documents", "relevance_feedback", mongo_client)
queries_collection = pymongo_helpers.get_collection("queries", "relevance_feedback", mongo_client)

# Created once and reused for every request. `client` still ends up bound to
# the Gemini client after this cell, exactly as before.
client = genai_helpers.client()

# The query record is the same for every judged document, so build it once.
query_data = {
    '_id': query["num"],
    'query': query["title"],
}

# Slicing (instead of indexing a fixed range) avoids an IndexError when fewer
# than MAX_JUDGED_DOCS documents were reranked. `rank` is the 1-based position.
for rank, entry in enumerate(reranked_docs[:MAX_JUDGED_DOCS], start=1):
    doc_id = entry[0]
    # Documents missing from the first-pass hits get the "[NONE]" placeholder text.
    doc_text = documents.get(doc_id, "[NONE]")

    rel, notes = genai_helpers.ask(doc_text, query, client)
    print(f"Gemini relevance for doc {doc_id}")
    print(f"Rel: {rel == 1}")
    print(f"Notes: {notes}\n")

    document = pymongo_helpers.format_data(doc_id, query["num"], rel, rank, notes)
    doc_data = {
        '_id': doc_id,
        'text': doc_text,
    }

    pymongo_helpers.add_document(rel_collection, document)
    pymongo_helpers.add_document(docs_collection, doc_data)
    # NOTE(review): the query record is written on every iteration, as in the
    # original flow; if add_document is idempotent it could be written once.
    pymongo_helpers.add_document(queries_collection, query_data)

    time.sleep(GEMINI_DELAY_SECONDS)