In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import utils

import os 
import openai
openai.api_key = utils.get_openai_api_key()

In [5]:
from trulens_eval import Tru

tru = Tru()


In [7]:
from llama_index import SimpleDirectoryReader

documents = SimpleDirectoryReader(input_files=["/Users/abhinay/Desktop/Real-Time_Detection_of_DNS_Exfiltration_and_Tunneling_from_Enterprise_Networks-1.pdf"]
                                 ).load_data()

In [8]:
from llama_index import Document

document = Document(text = "\n\n".join(doc.text for doc in documents))


In [9]:
from utils import build_sentence_window_index

from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index_v2"
)

In [10]:
from utils import get_sentence_window_query_engine

sentence_window_engine = get_sentence_window_query_engine(sentence_index)

In [None]:
output = sentence_window_engine.query("what are all the final attributes that were used")
output.response

In [11]:
import nest_asyncio

nest_asyncio.apply()

In [12]:
from trulens_eval import OpenAI as fOpenAI

provider = fOpenAI()

In [13]:
from trulens_eval import Feedback
f_qa_relevence = Feedback(provider.relevance_with_cot_reasons).on_input_output()

✅ In relevance_with_cot_reasons, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In relevance_with_cot_reasons, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [14]:
from trulens_eval import TruLlama

context_selection = TruLlama.select_source_nodes().node.text

In [15]:
import numpy as np

f_qs_relevanve = (
    Feedback(provider.qs_relevance_with_cot_reasons,
             name = "context relevane").on_input().on(context_selection).aggregate(np.mean)
)

✅ In context relevane, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In context relevane, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [16]:
from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=provider)

In [17]:
f_groundedness = (Feedback(grounded.groundedness_measure_with_cot_reasons, 
                           name = "grounded").on(context_selection)
                           .on_output()
                           .aggregate(grounded.grounded_statements_aggregator))

✅ In grounded, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In grounded, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [18]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

tru_recorder = TruLlama(
    sentence_window_engine,
    app_id = "app_1",
    feedbacks = [
        f_qa_relevence,
        f_groundedness,
        f_qs_relevanve
    ]
)

In [23]:
eval_questions = []
with open('eval_questions.txt', 'r') as file:
    for line in file:
        item = line.strip()
        eval_questions.append(item)

In [24]:
eval_questions

['what were the final parameters that were used in the model',
 'what were the final attributes that were used',
 'what were all the evaluation metrics used',
 'what were all the references that was used',
 'what were all the algorithms that were tested out',
 'what was the final goal of the author and did they achieve it',
 'what does each attribute mean in the aritcle',
 'what is is that this paper is trying to accomplish',
 'How was the data set collected',
 'what were the final parameters that were used in the model',
 'what were the final attributes that were used',
 'what were all the evaluation metrics used',
 'what were all the references that was used',
 'what were all the algorithms that were tested out',
 'what was the final goal of the author and did they achieve it',
 'what does each attribute mean in the aritcle',
 'what is is that this paper is trying to accomplish',
 'How was the data set collected']

In [26]:
for question in eval_questions:
    with tru_recorder as recording:
        sentence_window_engine.query(question)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [27]:
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,relevance_with_cot_reasons,grounded,context relevane,relevance_with_cot_reasons_calls,grounded_calls,context relevane_calls,latency,total_tokens,total_cost
0,app_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_3d0da85ce86e0839c9f2db7a5c9611f4,"""what were the final parameters that were used...","""The final parameters used in the model were a...",-,"{""record_id"": ""record_hash_3d0da85ce86e0839c9f...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-20T20:06:48.297098"", ""...",2023-12-20T20:06:58.800071,1.0,1.0,0.85,[{'args': {'prompt': 'what were the final para...,[{'args': {'source': 'We use the num- ber of l...,[{'args': {'question': 'what were the final pa...,10,908,0.00137
1,app_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_880b92c41fbf2bfc3fdfcb37adaac93c,"""what were the final attributes that were used""","""The final attributes that were used in the ma...",-,"{""record_id"": ""record_hash_880b92c41fbf2bfc3fd...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-20T20:06:59.289101"", ""...",2023-12-20T20:07:02.775674,1.0,1.0,0.85,[{'args': {'prompt': 'what were the final attr...,[{'args': {'source': 'We use the num- ber of l...,[{'args': {'question': 'what were the final at...,3,562,0.000874
2,app_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_c34a78e77850f4754ab468abcd6f99e2,"""what were all the evaluation metrics used""","""The evaluation metrics used in the given cont...",-,"{""record_id"": ""record_hash_c34a78e77850f4754ab...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-20T20:07:03.249717"", ""...",2023-12-20T20:07:08.006388,1.0,1.0,0.8,[{'args': {'prompt': 'what were all the evalua...,[{'args': {'source': 'Input Output Days 1-4 Da...,[{'args': {'question': 'what were all the eval...,4,812,0.001265
3,app_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_ad66a5aae7ed4df88fd2ba2f193274fc,"""what were all the references that was used""","""The references that were used in the given co...",-,"{""record_id"": ""record_hash_ad66a5aae7ed4df88fd...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-20T20:07:08.453769"", ""...",2023-12-20T20:07:13.726252,1.0,0.5,0.4,[{'args': {'prompt': 'what were all the refere...,[{'args': {'source': '2019 IFIP/IEEE Internati...,[{'args': {'question': 'what were all the refe...,5,413,0.00064
4,app_1,"{""tru_class_info"": {""name"": ""TruLlama"", ""modul...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_bd4a4b24d2ca49bba9cbaf0d09f96f84,"""what were all the algorithms that were tested...","""The context information does not provide any ...",-,"{""record_id"": ""record_hash_bd4a4b24d2ca49bba9c...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-20T20:07:14.174074"", ""...",2023-12-20T20:07:16.375417,0.0,0.0,0.0,[{'args': {'prompt': 'what were all the algori...,"[{'args': {'source': 'Shen, M. Shashanka, and ...",[{'args': {'question': 'what were all the algo...,2,522,0.000791


In [28]:
import pandas as pd
pd.set_option("display.max_colwidth", None)
records[["input", "output"] + feedback]

Unnamed: 0,input,output,context relevane,grounded,relevance_with_cot_reasons
0,"""what were the final parameters that were used in the model""","""The final parameters used in the model were a number of trees equal to 2.""",0.85,1.0,1.0
1,"""what were the final attributes that were used""","""The final attributes that were used in the machine learning model are the number of labels, maximum label length, average label length, total count of characters in the FQDN, count of characters in the sub-domain, count of uppercase characters in the query name, and count of numerical characters in the query name.""",0.85,1.0,1.0
2,"""what were all the evaluation metrics used""","""The evaluation metrics used in the given context are:\n1. Accuracy: The accuracy of the trained model for benign instances was cross-validated and tested. The rate of detection (normal versus anomalous) for the benign instances was measured.\n2. Detection Rate: The detection rate for malicious DNS queries generated using their tool was tested.\n3. Performance: The performance of the scheme in real-time on live 10 Gbps traffic streams from the two organizations was quantified.""",0.8,1.0,1.0
3,"""what were all the references that was used""","""The references that were used in the given context are:\n1. Ef\ufb01cient iP, \u201cThe Global DNS Threat Survey,\u201d Tech. Rep., 2017.\n2. B. Krebs.""",0.4,0.5,1.0
4,"""what were all the algorithms that were tested out""","""The context information does not provide any specific information about the algorithms that were tested out.""",0.0,0.0,0.0
5,"""what was the final goal of the author and did they achieve it""","""The final goal of the author was to establish an upper bound on the volume of surreptitious communication by investigating inter-query time and query record type. It is not mentioned in the context whether the author achieved this goal or not.""",0.25,1.0,0.9
6,"""what does each attribute mean in the aritcle""","""The attributes mentioned in the article are used to analyze DNS query packets and determine the degree of non-readability or strength of encryption. Here is the meaning of each attribute:\n\n1. Number of Labels: Refers to the number of labels (separated by dots) in the query name. Certain patterns of labels are often used in DNS exfiltration/tunneling traffic.\n\n2. Maximum Label Length: Represents the maximum length of a label in the fully qualified domain name (FQDN). It is one of the attributes used in the machine learning model.\n\n3. Average Label Length: Represents the average length of labels in the FQDN. It is another attribute used in the machine learning model.\n\n4. Total Count of Characters in FQDN: Indicates the total number of characters in the FQDN. More characters suggest that the query name may carry embedded information for an outside host.\n\n5. Count of Characters in Sub-Domain: Refers to the count of characters in the sub-domain portion of the FQDN. This attribute helps identify if the query name is benign or malicious.\n\n6. Count of Uppercase Characters: Represents the count of uppercase characters in the query name. A high fraction of uppercase characters may indicate encrypted or encoded data.\n\n7. Count of Numerical Characters: Represents the count of numerical characters in the query name. Similar to uppercase characters, a high fraction of numerical characters may suggest encrypted or encoded data. However, not all encrypted data is necessarily malicious.""",0.85,0.888889,1.0
7,"""what is is that this paper is trying to accomplish""","""The paper is trying to accomplish training an anomaly detection machine using benign data and testing it with the remaining data.""",0.1,1.0,1.0
8,"""How was the data set collected""","""The data set was collected over a one-week period from 30-Jul-2018 to 5-Aug-2018.""",0.85,1.0,1.0
9,"""what were the final parameters that were used in the model""","""The final parameters used in the model were a number of trees equal to 2.""",0.85,1.0,1.0


In [29]:
tru.run_dashboard()

Starting dashboard ...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://localhost:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>