In [6]:
from warnings import filterwarnings

# Install a global "ignore" filter so Python warnings are not shown.
filterwarnings('ignore')


In [5]:
%reload_ext autoreload

In [7]:
from utils import get_openai_api_key

# The return value is discarded here; presumably the call makes OPENAI_API_KEY
# available in the process environment (e.g. from a .env file) — verify in utils.
get_openai_api_key()

import os 
import openai
# Raises KeyError if OPENAI_API_KEY is not present in the environment.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [8]:
from llama_index import SimpleDirectoryReader

# Machine-specific absolute path to the PDF being indexed.
PAPER_PDF_PATH = "/Users/abhinay/Desktop/Real-Time_Detection_of_DNS_Exfiltration_and_Tunneling_from_Enterprise_Networks-1.pdf"

# Read the single PDF into a list of llama_index documents.
pdf_reader = SimpleDirectoryReader(input_files=[PAPER_PDF_PATH])
documents = pdf_reader.load_data()

In [9]:
from llama_index import Document

# Merge the text of every loaded document into one Document, with a blank
# line between consecutive pieces.
merged_text = "\n\n".join(loaded.text for loaded in documents)
document = Document(text=merged_text)

In [10]:
from llama_index.node_parser import SentenceWindowNodeParser

# Sentence-window node parser configured with window_size=3. Per the parameter
# names, the window text is stored under the "window" metadata key and the
# original sentence text under "original_text".
# NOTE(review): presumably window_size counts sentences on each side of the
# node — confirm against the llama_index documentation.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

In [11]:
# Toy input used to inspect how the parser splits text into nodes.
text = "hello!! world again. for the 1000th time"

toy_document = Document(text=text)
nodes = node_parser.get_nodes_from_documents([toy_document])

In [33]:
# List the text of each parsed node.
print([node.text for node in nodes])

['hello!! ', 'world again. ', 'for the 1000th time']


In [34]:
# Show the "window" metadata stored on the second node.
print(nodes[1].metadata["window"])

hello!!  world again.  for the 1000th time


In [12]:
# Second toy input; it ends in repeated punctuation.
text = "hello! puppy. cat says hi!!"

toy_document = Document(text=text)
nodes = node_parser.get_nodes_from_documents([toy_document])

In [36]:
# List the text of each parsed node.
print([node.text for node in nodes])

['hello! ', 'puppy. ', 'cat says hi!', '!']


In [20]:
# Show the "window" metadata stored on the first node.
print(nodes[0].metadata["window"])

hello!  puppy.  cat says hi!


In [13]:
from llama_index.llms import OpenAI

# OpenAI model handle: gpt-3.5-turbo with temperature 0.1.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [14]:
from llama_index import ServiceContext

# Bundle the LLM, a local embedding model and the sentence-window parser
# into one service context.
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    node_parser=node_parser,
)

In [15]:
from llama_index import VectorStoreIndex

# Build the vector index from the merged document using the sentence-window
# service context. The keyword must be `service_context`: any other keyword
# name is accepted silently and ignored, which would build the index with the
# default LLM, embedding model and node parser instead of the ones configured
# in `sentence_context`.
sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context,
)

In [16]:
# Persist the index's storage context to the ./sentence_index_v4/ directory.
sentence_index.storage_context.persist(persist_dir="./sentence_index_v4/")

In [18]:
from llama_index.indices.postprocessor import MetadataReplacementPostProcessor

# Postprocessor keyed on the "window" metadata entry of each node.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [19]:
from llama_index.schema import NodeWithScore
from copy import deepcopy

# Wrap each toy node with a placeholder score of 0.1, and keep deep copies
# of the nodes as a separate snapshot.
scoredNodes = [NodeWithScore(node=toy_node, score=0.1) for toy_node in nodes]
nodes_old = [deepcopy(toy_node) for toy_node in nodes]


In [45]:
# Text of the second node in the deep-copied snapshot.
nodes_old[1].text

'puppy. '

In [20]:
# Run the metadata-replacement postprocessor over the scored toy nodes.
replaced_nodes = postproc.postprocess_nodes(scoredNodes)

In [47]:
# Text of the second node as returned by the postprocessor.
print(replaced_nodes[1].text)

hello!  puppy.  cat says hi! !


In [21]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

# Reranker built on BAAI/bge-reranker-base, configured with top_n=2.
rerank = SentenceTransformerRerank(
    top_n=2,
    model="BAAI/bge-reranker-base",
)

In [22]:
from llama_index.schema import QueryBundle
from llama_index.schema import NodeWithScore, TextNode

# Toy query for the reranker demo.
query = QueryBundle("I want a puppy")

# Two hand-scored candidates; the "cat" node starts with the higher score.
dog_candidate = NodeWithScore(node=TextNode(text="This is a dog"), score=0.4)
cat_candidate = NodeWithScore(node=TextNode(text="This is a cat"), score=0.6)
scored_nodes = [dog_candidate, cat_candidate]

In [23]:
# Rerank the hand-scored candidates against the toy query.
reranked_nodes = rerank.postprocess_nodes(scored_nodes, query_bundle=query)

In [53]:
# Show (score, text) for each reranked candidate.
print([(hit.score, hit.text) for hit in reranked_nodes])

[(0.6339884, 'This is a dog'), (0.004385631, 'This is a cat')]


In [24]:
# Query engine over the sentence index: requests 6 similar nodes, then applies
# the metadata-replacement postprocessor followed by the reranker.
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6,
    node_postprocessors=[postproc, rerank],
)

In [29]:
from llama_index.response.notebook_utils import display_response

# Ask one question through the sentence-window engine and render the answer.
window_question = "what were the final parameters and the model that was used "
window_response = sentence_window_engine.query(window_question)

display_response(window_response)


**`Final Response:`** The final parameters used for the iForest algorithm during the training phase were as follows: 
- Number of trees (nestimators): 2
- Height limit of trees (max samples): 18
- Contamination rate: 10%

The model used for anomaly detection was the Isolation Forest (iForest) algorithm, which is known for its effectiveness in detecting anomalous instances in high-dimensional datasets with minimal memory and time complexities. The iForest algorithm isolates observations by randomly selecting attributes and split values, and the path length from the root node to the terminating node is used as a measure of normality. When a forest of random trees collectively produces shorter path lengths for a particular instance, it is highly likely to be an anomaly.

In [26]:
# Load the evaluation questions, one per line, echoing each as it is read.
eval_questions = []

# Explicit encoding so the file is read the same way on every platform.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        item = line.strip()
        if not item:
            # Skip blank lines: they would otherwise end up in the list as
            # empty questions.
            continue
        print(item)
        eval_questions.append(item)

what were the final parameters that were used in the model
what were the final attributes that were used
what were all the evaluation metrics used
what were all the references that was used
what were all the algorithms that were tested out
what was the final goal of the author and did they achieve it
what does each attribute mean in the aritcle
what is is that this paper is trying to accomplish
How was the data set collected
what were the final parameters that were used in the model
what were the final attributes that were used
what were all the evaluation metrics used
what were all the references that was used
what were all the algorithms that were tested out
what was the final goal of the author and did they achieve it
what does each attribute mean in the aritcle
what is is that this paper is trying to accomplish
How was the data set collected


In [27]:
from trulens_eval import Tru

def run_evals(eval_questions, tru_recorder, query_engine):
    """Send every evaluation question to the query engine under the recorder.

    Args:
        eval_questions: Iterable of question strings.
        tru_recorder: Context manager that is entered and exited once per
            question; the query is issued while it is active.
        query_engine: Object exposing ``query(question)``.

    Returns:
        None. The responses are not kept; the only lasting effect is whatever
        the recorder captures while it is active.
    """
    for question in eval_questions:
        # One recorder context per question, so each query is captured
        # separately.
        with tru_recorder:
            query_engine.query(question)

In [28]:
from utils import get_prebuilt_trulens_recorder

from trulens_eval import Tru

# Reset the TruLens database before recording new evaluations.
# NOTE(review): presumably this discards previously stored records — confirm
# before running against a database you want to keep.
Tru().reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [36]:
from utils import build_sentence_window_index


# Build a second sentence-window index via the utils helper, this time with
# sentence_window_size=1 and save_dir "sentence_index_v5". The per-file
# `documents` list is passed here, not the merged `document`.
# NOTE: the variable name is misspelled ("senetence"); the query-engine cell
# refers to it by this exact spelling, so both must change together.
senetence_window_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_v5",
    )

In [37]:
from utils import get_sentence_window_query_engine

# Query engine for the window-size-1 index, built by the utils helper with
# its default settings (no extra arguments are passed).
sentence_window_engine_1 = get_sentence_window_query_engine(
    senetence_window_index_1
)

In [39]:
# Recorder for the window-size-1 engine, registered under its own app id.
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='sentence window engine 1',
)

✅ In answer relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In answer relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In context relecnace, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In context relecnace, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In groundedness, input source will be set to __record__.main_output or `Select.RecordOutput` .


In [None]:
# Run every evaluation question through engine 1 under its recorder.
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

Feedback Function exception caught: Traceback (most recent call last):
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/feedback.py", line 501, in run
    result_and_meta, part_cost = Endpoint.track_all_costs_tally(
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 447, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 396, in track_all_costs
    return Endpoint._track_costs(thunk, with_endpoints=endpoints)
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 553, in _track_costs
    result: T = thunk()
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/feedback.py", line 502, in <lambda>
    lambda: self.imp(**ins)
TypeError: groun

Feedback Function exception caught: Traceback (most recent call last):
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/feedback.py", line 501, in run
    result_and_meta, part_cost = Endpoint.track_all_costs_tally(
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 447, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 396, in track_all_costs
    return Endpoint._track_costs(thunk, with_endpoints=endpoints)
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/provider/endpoint/base.py", line 553, in _track_costs
    result: T = thunk()
  File "/Users/abhinay/Library/Python/3.9/lib/python/site-packages/trulens_eval/feedback/feedback.py", line 502, in <lambda>
    lambda: self.imp(**ins)
TypeError: groun