In [1]:
import numpy as np
import pandas as pd
import PyPDF2
import spacy
nlp = spacy.load("en_core_web_sm")
from sentence_transformers import SentenceTransformer
from langchain.llms import CTransformers
from langchain import PromptTemplate, LLMChain
import matplotlib.pyplot as plt
import io
from urllib.request import Request, urlopen
from newspaper import Article
from PyPDF2 import PdfReader
import docx
from rouge_score import rouge_scorer

## 1. Implementation 

### 1.1 Document Processing

In [2]:
class Processor:
    '''
    The Processor class reads text from a source and splits the text into chunks.
    
    Attributes:
    - source(str): source of the target document, supporting url, txt, docx and pdf.
    
    Methods:
    - extract_text: get the text from the source
    - get_chunks: split the text into chunks
    '''
    
    def __init__(self, source):
        '''
        Initialize a Processor instance

        Parameter:
        - source: source of the target document, supporting url, txt, docx and pdf.
        '''
        self.source = source
    
    def extract_text(self):
        '''
        Get the text from the source
        '''
        if self.source.startswith('https:') and self.source.endswith('.pdf'):
            document_from_source = urlopen(Request(self.source)).read()
            document_in_memory = io.BytesIO(document_from_source)
            document = PdfReader(document_in_memory)
            pages = len(document.pages)
            text = ''
            for p in range(pages):
                    page = document.pages[p].extract_text()
                    text += page

        elif self.source.startswith('https:') or self.source.startswith('www:'):
            document = Article(self.source, language="en")
            document.download()
            document.parse()
            document.nlp()
            text = document.title + '.\n' + document.text

        elif self.source.endswith('.txt'):
            with open(self.source, 'r') as f:
                text = f.read()

        elif self.source.endswith('.pdf'):
            with open (self.source, 'rb') as f:
                document = PdfReader(f)

                pages = len(document.pages)
                text = ''
                for p in range(pages):
                    page = document.pages[p].extract_text()
                    text += page

        elif self.source.endswith('.docx'):
            with open(self.source, "rb") as f:
                document = docx.Document(f)
                text = '\n'.join([paragraph.text for paragraph in document.paragraphs])

        return text
    
    def get_chunks(self, by_tokens=False, num_tokens=100): 
        '''
        Split the text into chunks
        
        Parameters:
        by_tokens(boolean): if true, the text is split into strings containing a certain number of tokens; otherwise, the text is split into sentences
        num_tokens(int): the number of tokens in each string if by_tokens=True
        '''
        text = self.extract_text()
        doc = nlp(text)
        
        if by_tokens:
            chunks = []
            start = 0
            while start <= len(doc) - num_tokens:
                end = start + num_tokens
                tokens = doc[start:end]
                chunk = ' '.join([token.text for token in tokens])
                chunks.append(chunk)
                start = end            
            if start < len(doc):
                residual = ' '.join([token.text for token in doc[start:]])
                chunks.append(residual)
        
        # the default is splitting by sentences
        else:
            chunks = [sent.text.replace('\n', ' ') for sent in doc.sents]
        
        return np.array(list(set(chunks)))

### 1.2 Context Retrieving 

In [3]:
class Retriever:
    '''
    The Retriever class retrieves the top-k chunks that are the most relevant to a query
    
    Attributes: 
    - encoder(str): the name of a specific encoder under Sentence Transformers
    - chunks(arr): an array of chunks that a document is split into
    
    Methods:
    - chunks_embedding: convert the chunks into embeddings
    - retrieve_context: retrieve the context from the chunks according to a query
    '''
    
    def __init__(self, encoder, chunks):
        '''
        Initialize a Retriever instancce
        
        Attributes:
        - encoder(str): the name of the encoder
        - chunks(list): a list of chunks that a document is split into
        '''
        self.encoder = SentenceTransformer(encoder)
        self.chunks = chunks
    
    def chunks_embedding(self):
        '''
        Convert the chunks into embeddings
        '''
        chunks_embeddings = self.encoder.encode(self.chunks)
        
        return chunks_embeddings
    
    def retrieve_context(self, chunks_embeddings, query, k=1, enhanced=False, min_length=256):
        '''
        Retrieve the context from the chunks for a specific query
        
        Parameters:
        - chunks_embedding(arr): an array of embeddings of the chunks
        - query(str): a question from a user
        - k(int): the number of the most relevant chunks to be retrieved
        - enhenced(boolean): if true, when the retrieved context is shorter than minimum length, the next most relevant chunks will be added to the context
        - min_length(int): the minimum length of context
        '''
        query_embeddings = self.encoder.encode(query)
        
        similarities = np.dot(chunks_embeddings, query_embeddings) / (np.linalg.norm(chunks_embeddings, axis=1) * np.linalg.norm(query_embeddings))
        
        sorted_indices = np.argsort(similarities)[::-1]
        
        top_k_indices = sorted_indices[:k]
        
        top_k_chunks = self.chunks[top_k_indices]
        
        context = ' '.join(top_k_chunks)
        
        if enhanced:
            # check the length of context
            while len(context) < min_length:
                extra_idx = sorted_indices[k]
                extra_chunk = self.chunks[extra_idx]
                context += extra_chunk
                k += 1
        
        return context

### 1.3 Answer Generating 

In [4]:
class Generator:
    '''
    The Generator class takes in a query and relevant context and generates an answer
    
    Attributes: 
    - model(str): name of model
    - model_file(str): name of model file
    - model_type(str): model type
    
    Method:
    - generate_answer: generate answers based on query and context
    '''
    
    def __init__(self, model, model_file, model_type):
        '''
        Initialize a Generator instance
        - model(str): name of model
        - model_file(str): name of model file
        - model_type(str): model type
        '''
        llm = CTransformers(model=model, model_file=model_file, model_type=model_type)
        
        template = '''
                    Context: {context}
                    Question: {question}
                    Answer:
                    '''
        prompt = PromptTemplate(template=template, input_variables=['context','question'])
        self.llm_chain = LLMChain(prompt=prompt, llm=llm)
        
    def generate_answer(self, context, question):
        '''
        Generate answers based on query and context
        
        Parameters:
        - context(str): context provided to the generator
        - question(str): question from a user
        '''
        answer = self.llm_chain.run({'context': context, 'question': question})
        return answer

## 2. Evaluation

### 2.1 Data 

In [None]:
# load the test dataset

In [3]:
dataset = pd.read_csv('rag_benchmark_apple_10k_2022_with_context.csv')
# source: https://huggingface.co/datasets/lighthouzai/finqabench

In [61]:
dataset.head()

Unnamed: 0,Query,Response,Context,Category,Filename,Source
0,What is the aggregate market value of the voti...,"$2,830,067,000,000.",Indicate by check mark whether the Registrant ...,Hallucination: Direct Question,apple-10K-2022.pdf,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...
1,What is the title of Item 7 in Part II of the ...,Item 7. Management's Discussion and Analysis o...,Apple Inc.\nForm 10-K\nFor the Fiscal Year End...,Hallucination: Direct Question,apple-10K-2022.pdf,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...
2,What is the Company's line of personal compute...,The Mac line includes laptops MacBook Air and ...,® is the Company’s line of smartphones based o...,Hallucination: Direct Question,apple-10K-2022.pdf,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...
3,What are the main competitive factors that App...,Principal competitive factors important to the...,of consumers and businesses. Many of \nthe Co...,Hallucination: Direct Question,apple-10K-2022.pdf,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...
4,What measures does the Company take to protect...,The Company supports employees with general sa...,The Company experiences malicious attacks and ...,Hallucination: Direct Question,apple-10K-2022.pdf,https://d18rn0p25nwr6d.cloudfront.net/CIK-0000...


### 2.2 Metric - Sematic Similarity to "Ground Truth"  for Final Answers

In [None]:
import numpy as np

def cosine_similarity(references, predictions):
    '''
    Calcuate cosine similarity between "ground truth" references and predictions
    
    Parameters:
    - references(list): a list of strings representing ground truth
    - predictions(list): a list of strings representing predictions
    
    Return:
    - similarity_scores(list): a list of element-wise cosine similarity scores
    '''
    
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Encode the sentences from both lists
    embeddings_references = model.encode(references)
    embeddings_predictions = model.encode(predictions)

    # Normalize the embeddings
    embeddings_references /= np.linalg.norm(embeddings_references, axis=1, keepdims=True)
    embeddings_predictions /= np.linalg.norm(embeddings_predictions, axis=1, keepdims=True)

    # Compute cosine similarity element-wise
    similarity_scores = np.sum(embeddings_references * embeddings_predictions, axis=1)

    return similarity_scores

### 2.3 Metric - ROUGE for Retrieved Contexts

In [51]:
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

In [66]:
def evaluate_by_rouge(references, retrieved_contexts):
    '''
    Calculate rouge scores of generated texts with respect to reference texts
    
    Parameters:
    - references(list): a list of strings representing reference texts
    - retrieved_ocntexts: a list of strings representing the texts to be evaluated.
    
    Return:
    - mean_rouge_scores(dict): a dictionary of mean rouge scores of chosen metrics.
    '''
    
    rouge_scores = [scorer.score(references[i], retrieved_contexts[i]) for i in range(len(references))]
    mean_rouge_scores = {'rouge1_precision': np.mean([score['rouge1'].precision for score in rouge_scores]),
                         'rouge1_recall': np.mean([score['rouge1'].recall for score in rouge_scores]),
                         'rouge1_fmeasure': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
                         'rougeL_precision': np.mean([score['rougeL'].precision for score in rouge_scores]),
                         'rougeL_recall': np.mean([score['rougeL'].recall for score in rouge_scores]),
                         'rougeL_fmeasure': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
                        }
    
    return mean_rouge_scores

### 2.4 Baseline - No-context  Generation

In [384]:
# LLM parameters
model='TheBloke/Llama-2-7B-Chat-GGUF'
model_file = 'llama-2-7b-chat.Q4_K_M.gguf'
model_type='llama'

In [385]:
llm = CTransformers(model=model, model_file=model_file, model_type=model_type)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [386]:
template = '''
            Question: {question}
            Answer:
            '''
prompt = PromptTemplate(template=template, input_variables=['question'])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [190]:
no_context_answers = []
i = 1
for query in dataset['Query']:
    answer = llm_chain.run({'question': query})
    no_context_answers.append(answer)
    print(f'{i}/100 completed')
    i +=1

1/100 completed
2/100 completed
3/100 completed
4/100 completed
5/100 completed
6/100 completed
7/100 completed
8/100 completed
9/100 completed
10/100 completed
11/100 completed
12/100 completed
13/100 completed
14/100 completed
15/100 completed
16/100 completed
17/100 completed
18/100 completed
19/100 completed
20/100 completed
21/100 completed
22/100 completed
23/100 completed
24/100 completed
25/100 completed
26/100 completed
27/100 completed
28/100 completed
29/100 completed
30/100 completed
31/100 completed
32/100 completed
33/100 completed
34/100 completed
35/100 completed
36/100 completed
37/100 completed
38/100 completed
39/100 completed
40/100 completed
41/100 completed
42/100 completed
43/100 completed
44/100 completed
45/100 completed
46/100 completed
47/100 completed
48/100 completed
49/100 completed
50/100 completed
51/100 completed
52/100 completed
53/100 completed
54/100 completed
55/100 completed
56/100 completed
57/100 completed
58/100 completed
59/100 completed
60/100

In [387]:
similarity_no_context_answers = cosine_similarity(dataset['Response'], no_context_answers)

In [388]:
np.mean(similarity_no_context_answers)

0.56955093

### 2.5 RAG - Sentences-based Context

In [12]:
doc_name = 'Apple_2022_annual_report.pdf'

In [12]:
encoder = 'sentence-transformers/all-MiniLM-L6-v2'

In [None]:
# Use the Processor to split the document into sentences

In [14]:
processor = Processor(doc_name)

In [15]:
sentences = processor.get_chunks()

In [None]:
# Use the Retriever to retrive context 

In [16]:
retriever = Retriever(encoder, sentences)

  return self.fget.__get__(instance, owner)()


In [17]:
sentences_embeddings = retriever.chunks_embedding()

In [18]:
retrieved_contexts = []
for query in dataset['Query']:
    context = retriever.retrieve_context(sentences_embeddings, query, k=1)
    retrieved_contexts.append(context)

In [None]:
# Use the Generator to generate answers

In [152]:
generator = Generator(model=model, model_file=model_file, model_type=model_type)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

In [154]:
generated_answers = []
for i in range(len(dataset['Query'])):
    query = dataset['Query'][i]
    context = retrieved_contexts[i]
    answer = generator.generate_answer(context, query)
    generated_answers.append(answer)
    print(f'{i+1}/100 completed')

1/100 completed
2/100 completed
3/100 completed
4/100 completed
5/100 completed
6/100 completed
7/100 completed
8/100 completed
9/100 completed
10/100 completed
11/100 completed
12/100 completed
13/100 completed
14/100 completed
15/100 completed
16/100 completed
17/100 completed
18/100 completed
19/100 completed
20/100 completed
21/100 completed
22/100 completed
23/100 completed
24/100 completed
25/100 completed
26/100 completed
27/100 completed
28/100 completed
29/100 completed
30/100 completed
31/100 completed
32/100 completed
33/100 completed
34/100 completed
35/100 completed
36/100 completed
37/100 completed
38/100 completed
39/100 completed
40/100 completed
41/100 completed
42/100 completed
43/100 completed
44/100 completed
45/100 completed
46/100 completed
47/100 completed
48/100 completed
49/100 completed
50/100 completed
51/100 completed
52/100 completed
53/100 completed
54/100 completed
55/100 completed
56/100 completed
57/100 completed
58/100 completed
59/100 completed
60/100

Number of tokens (813) exceeded maximum context length (512).


64/100 completed


Number of tokens (814) exceeded maximum context length (512).
Number of tokens (815) exceeded maximum context length (512).
Number of tokens (816) exceeded maximum context length (512).
Number of tokens (817) exceeded maximum context length (512).
Number of tokens (818) exceeded maximum context length (512).
Number of tokens (819) exceeded maximum context length (512).
Number of tokens (820) exceeded maximum context length (512).
Number of tokens (821) exceeded maximum context length (512).
Number of tokens (822) exceeded maximum context length (512).
Number of tokens (823) exceeded maximum context length (512).
Number of tokens (824) exceeded maximum context length (512).
Number of tokens (825) exceeded maximum context length (512).
Number of tokens (826) exceeded maximum context length (512).
Number of tokens (827) exceeded maximum context length (512).
Number of tokens (828) exceeded maximum context length (512).
Number of tokens (829) exceeded maximum context length (512).
Number o

Number of tokens (947) exceeded maximum context length (512).
Number of tokens (948) exceeded maximum context length (512).
Number of tokens (949) exceeded maximum context length (512).
Number of tokens (950) exceeded maximum context length (512).
Number of tokens (951) exceeded maximum context length (512).
Number of tokens (952) exceeded maximum context length (512).
Number of tokens (953) exceeded maximum context length (512).
Number of tokens (954) exceeded maximum context length (512).
Number of tokens (955) exceeded maximum context length (512).
Number of tokens (956) exceeded maximum context length (512).
Number of tokens (957) exceeded maximum context length (512).
Number of tokens (958) exceeded maximum context length (512).
Number of tokens (959) exceeded maximum context length (512).
Number of tokens (960) exceeded maximum context length (512).
Number of tokens (961) exceeded maximum context length (512).
Number of tokens (962) exceeded maximum context length (512).
Number o

65/100 completed


Number of tokens (814) exceeded maximum context length (512).
Number of tokens (815) exceeded maximum context length (512).
Number of tokens (816) exceeded maximum context length (512).
Number of tokens (817) exceeded maximum context length (512).
Number of tokens (818) exceeded maximum context length (512).
Number of tokens (819) exceeded maximum context length (512).
Number of tokens (820) exceeded maximum context length (512).
Number of tokens (821) exceeded maximum context length (512).
Number of tokens (822) exceeded maximum context length (512).
Number of tokens (823) exceeded maximum context length (512).
Number of tokens (824) exceeded maximum context length (512).
Number of tokens (825) exceeded maximum context length (512).
Number of tokens (826) exceeded maximum context length (512).
Number of tokens (827) exceeded maximum context length (512).
Number of tokens (828) exceeded maximum context length (512).
Number of tokens (829) exceeded maximum context length (512).
Number o

Number of tokens (947) exceeded maximum context length (512).
Number of tokens (948) exceeded maximum context length (512).
Number of tokens (949) exceeded maximum context length (512).
Number of tokens (950) exceeded maximum context length (512).
Number of tokens (951) exceeded maximum context length (512).
Number of tokens (952) exceeded maximum context length (512).
Number of tokens (953) exceeded maximum context length (512).
Number of tokens (954) exceeded maximum context length (512).
Number of tokens (955) exceeded maximum context length (512).
Number of tokens (956) exceeded maximum context length (512).
Number of tokens (957) exceeded maximum context length (512).
Number of tokens (958) exceeded maximum context length (512).
Number of tokens (959) exceeded maximum context length (512).
Number of tokens (960) exceeded maximum context length (512).
Number of tokens (961) exceeded maximum context length (512).
Number of tokens (962) exceeded maximum context length (512).
Number o

66/100 completed
67/100 completed
68/100 completed
69/100 completed
70/100 completed
71/100 completed
72/100 completed
73/100 completed
74/100 completed
75/100 completed
76/100 completed
77/100 completed
78/100 completed
79/100 completed
80/100 completed
81/100 completed


Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context length (512).
Number o

82/100 completed


Number of tokens (940) exceeded maximum context length (512).


83/100 completed


Number of tokens (941) exceeded maximum context length (512).
Number of tokens (942) exceeded maximum context length (512).
Number of tokens (943) exceeded maximum context length (512).
Number of tokens (944) exceeded maximum context length (512).
Number of tokens (945) exceeded maximum context length (512).
Number of tokens (946) exceeded maximum context length (512).
Number of tokens (947) exceeded maximum context length (512).
Number of tokens (948) exceeded maximum context length (512).
Number of tokens (949) exceeded maximum context length (512).
Number of tokens (950) exceeded maximum context length (512).
Number of tokens (951) exceeded maximum context length (512).
Number of tokens (952) exceeded maximum context length (512).
Number of tokens (953) exceeded maximum context length (512).
Number of tokens (954) exceeded maximum context length (512).
Number of tokens (955) exceeded maximum context length (512).
Number of tokens (956) exceeded maximum context length (512).
Number o

84/100 completed


Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context length (512).
Number o

85/100 completed
86/100 completed
87/100 completed
88/100 completed


Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context length (512).
Number o

89/100 completed


Number of tokens (806) exceeded maximum context length (512).
Number of tokens (807) exceeded maximum context length (512).
Number of tokens (808) exceeded maximum context length (512).
Number of tokens (809) exceeded maximum context length (512).
Number of tokens (810) exceeded maximum context length (512).
Number of tokens (811) exceeded maximum context length (512).
Number of tokens (812) exceeded maximum context length (512).
Number of tokens (813) exceeded maximum context length (512).
Number of tokens (814) exceeded maximum context length (512).
Number of tokens (815) exceeded maximum context length (512).
Number of tokens (816) exceeded maximum context length (512).
Number of tokens (817) exceeded maximum context length (512).
Number of tokens (818) exceeded maximum context length (512).
Number of tokens (819) exceeded maximum context length (512).


90/100 completed
91/100 completed
92/100 completed
93/100 completed
94/100 completed
95/100 completed
96/100 completed
97/100 completed


Number of tokens (901) exceeded maximum context length (512).


98/100 completed


Number of tokens (902) exceeded maximum context length (512).
Number of tokens (903) exceeded maximum context length (512).
Number of tokens (904) exceeded maximum context length (512).
Number of tokens (905) exceeded maximum context length (512).
Number of tokens (906) exceeded maximum context length (512).
Number of tokens (907) exceeded maximum context length (512).
Number of tokens (908) exceeded maximum context length (512).
Number of tokens (909) exceeded maximum context length (512).
Number of tokens (910) exceeded maximum context length (512).
Number of tokens (911) exceeded maximum context length (512).
Number of tokens (912) exceeded maximum context length (512).
Number of tokens (913) exceeded maximum context length (512).
Number of tokens (914) exceeded maximum context length (512).
Number of tokens (915) exceeded maximum context length (512).
Number of tokens (916) exceeded maximum context length (512).
Number of tokens (917) exceeded maximum context length (512).
Number o

Number of tokens (1034) exceeded maximum context length (512).
Number of tokens (1035) exceeded maximum context length (512).
Number of tokens (1036) exceeded maximum context length (512).
Number of tokens (1037) exceeded maximum context length (512).
Number of tokens (1038) exceeded maximum context length (512).
Number of tokens (1039) exceeded maximum context length (512).
Number of tokens (1040) exceeded maximum context length (512).
Number of tokens (1041) exceeded maximum context length (512).
Number of tokens (1042) exceeded maximum context length (512).
Number of tokens (1043) exceeded maximum context length (512).
Number of tokens (1044) exceeded maximum context length (512).
Number of tokens (1045) exceeded maximum context length (512).
Number of tokens (1046) exceeded maximum context length (512).
Number of tokens (1047) exceeded maximum context length (512).
Number of tokens (1048) exceeded maximum context length (512).
Number of tokens (1049) exceeded maximum context length

99/100 completed


Number of tokens (513) exceeded maximum context length (512).
Number of tokens (514) exceeded maximum context length (512).
Number of tokens (515) exceeded maximum context length (512).
Number of tokens (516) exceeded maximum context length (512).
Number of tokens (517) exceeded maximum context length (512).
Number of tokens (518) exceeded maximum context length (512).
Number of tokens (519) exceeded maximum context length (512).
Number of tokens (520) exceeded maximum context length (512).
Number of tokens (521) exceeded maximum context length (512).
Number of tokens (522) exceeded maximum context length (512).
Number of tokens (523) exceeded maximum context length (512).
Number of tokens (524) exceeded maximum context length (512).
Number of tokens (525) exceeded maximum context length (512).
Number of tokens (526) exceeded maximum context length (512).
Number of tokens (527) exceeded maximum context length (512).
Number of tokens (528) exceeded maximum context length (512).
Number o

100/100 completed


In [68]:
rouge_socres_sentence_context = evaluate_by_rouge(dataset['Context'], retrieved_contexts)

In [69]:
rouge_socres_sentence_context

{'rouge1_precision': 0.7713680473739707,
 'rouge1_recall': 0.26113240865405934,
 'rouge1_fmeasure': 0.3272604002245511,
 'rougeL_precision': 0.7037216186917309,
 'rougeL_recall': 0.23437532559906737,
 'rougeL_fmeasure': 0.29640497028372637}

In [382]:
similarity_context = cosine_similarity(dataset['Context'], retrieved_contexts)

In [383]:
np.mean(similarity_context)

0.6831899

In [391]:
similarity_answers = cosine_similarity(dataset['Response'], generated_answers)

In [392]:
np.mean(similarity_answers)

0.6250273

### 2.6 RAG - Sentences-based Context with Minimum Length

In [23]:
retrieved_contexts_enhenced = []
for query in dataset['Query']:
    context = retriever.retrieve_context(sentences_embeddings, query, k=1, enhanced=True, min_length=256)
    retrieved_contexts_enhenced.append(context)

In [308]:
generated_answers_enhenced = []
for i in range(len(dataset['Query'])):
    query = dataset['Query'][i]
    context = retrieved_contexts_enhenced[i]
    answer = generator.generate_answer(context, query)
    generated_answers_enhenced.append(answer)
    print(f'{i+1}/100 completed')

1/100 completed
2/100 completed
3/100 completed
4/100 completed
5/100 completed
6/100 completed
7/100 completed
8/100 completed
9/100 completed
10/100 completed
11/100 completed
12/100 completed
13/100 completed
14/100 completed
15/100 completed
16/100 completed
17/100 completed
18/100 completed
19/100 completed
20/100 completed
21/100 completed
22/100 completed
23/100 completed
24/100 completed
25/100 completed
26/100 completed
27/100 completed
28/100 completed
29/100 completed
30/100 completed
31/100 completed
32/100 completed
33/100 completed
34/100 completed
35/100 completed
36/100 completed
37/100 completed
38/100 completed
39/100 completed
40/100 completed
41/100 completed
42/100 completed
43/100 completed




44/100 completed








45/100 completed
46/100 completed
47/100 completed
48/100 completed
49/100 completed
50/100 completed
51/100 completed
52/100 completed
53/100 completed
54/100 completed




55/100 completed








56/100 completed
57/100 completed
58/100 completed
59/100 completed
60/100 completed
61/100 completed
62/100 completed
63/100 completed




64/100 completed








65/100 completed








66/100 completed
67/100 completed
68/100 completed
69/100 completed
70/100 completed
71/100 completed
72/100 completed
73/100 completed
74/100 completed




75/100 completed








76/100 completed
77/100 completed
78/100 completed
79/100 completed
80/100 completed
81/100 completed








82/100 completed




83/100 completed








84/100 completed




85/100 completed
86/100 completed
87/100 completed
88/100 completed






89/100 completed








90/100 completed
91/100 completed
92/100 completed
93/100 completed
94/100 completed
95/100 completed
96/100 completed
97/100 completed




98/100 completed








99/100 completed




100/100 completed


In [70]:
rouge_socres_sentence_context_enhanced = evaluate_by_rouge(dataset['Context'], retrieved_contexts_enhenced)
rouge_socres_sentence_context_enhanced

{'rouge1_precision': 0.6923667952355967,
 'rouge1_recall': 0.3666219457925905,
 'rouge1_fmeasure': 0.4145396757848288,
 'rougeL_precision': 0.5897822336597313,
 'rougeL_recall': 0.31005953237606837,
 'rougeL_fmeasure': 0.3524005053966626}

In [306]:
similarity_context_enhenced = cosine_similarity(dataset['Context'], retrieved_contexts_enhenced)

In [307]:
np.mean(similarity_context_enhenced)

0.7221808

In [310]:
similarity_answers_enhenced = cosine_similarity(dataset['Response'], generated_answers_enhenced)

In [311]:
np.mean(similarity_answers_enhenced)

0.66181976

### 2.7 RAG - N-tokens Context

In [None]:
# N = 100

In [None]:
# Processing

In [24]:
chunks_by_tokens = processor.get_chunks(by_tokens=True)

In [None]:
# retrieving

In [25]:
retriever = Retriever(encoder, chunks_by_tokens)

In [26]:
chunks_embeddings = retriever.chunks_embedding() # to update

In [27]:
retrieved_contexts_chunks = []
for query in dataset['Query']:
    context = retriever.retrieve_context(chunks_embeddings, query, k=1)
    retrieved_contexts_chunks.append(context)

In [254]:
generated_answers_chunks = []
for i in range(len(dataset['Query'])):
    query = dataset['Query'][i]
    context = retrieved_contexts_chunks[i]
    answer = generator.generate_answer(context, query)
    generated_answers_chunks.append(answer)
    print(f'{i+1}/100 completed')

1/100 completed
2/100 completed
3/100 completed
4/100 completed
5/100 completed
6/100 completed
7/100 completed
8/100 completed
9/100 completed
10/100 completed
11/100 completed
12/100 completed
13/100 completed
14/100 completed
15/100 completed
16/100 completed
17/100 completed
18/100 completed
19/100 completed
20/100 completed
21/100 completed
22/100 completed
23/100 completed
24/100 completed
25/100 completed
26/100 completed
27/100 completed
28/100 completed
29/100 completed
30/100 completed
31/100 completed
32/100 completed
33/100 completed
34/100 completed
35/100 completed
36/100 completed
37/100 completed
38/100 completed
39/100 completed
40/100 completed
41/100 completed
42/100 completed
43/100 completed
44/100 completed
45/100 completed
46/100 completed
47/100 completed
48/100 completed
49/100 completed
50/100 completed
51/100 completed
52/100 completed
53/100 completed
54/100 completed
55/100 completed
56/100 completed
57/100 completed
58/100 completed
59/100 completed
60/100



75/100 completed
76/100 completed
77/100 completed
78/100 completed
79/100 completed
80/100 completed
81/100 completed
82/100 completed
83/100 completed
84/100 completed
85/100 completed




86/100 completed
87/100 completed
88/100 completed
89/100 completed
90/100 completed
91/100 completed
92/100 completed
93/100 completed
94/100 completed
95/100 completed
96/100 completed
97/100 completed
98/100 completed
99/100 completed
100/100 completed


In [96]:
rouge_socres_context_chunks = evaluate_by_rouge(dataset['Context'], retrieved_contexts_chunks)
rouge_socres_context_chunks

{'rouge1_precision': 0.7620690832044232,
 'rouge1_recall': 0.39205051368994853,
 'rouge1_fmeasure': 0.48037647175013903,
 'rougeL_precision': 0.6538873451565632,
 'rougeL_recall': 0.33577910019806123,
 'rougeL_fmeasure': 0.41035555985650574}

In [396]:
similarity_answers_chunks = cosine_similarity(dataset['Response'], generated_answers_chunks)

In [397]:
np.mean(similarity_answers_chunks)

0.6659089

In [398]:
similarity_context_chunks = cosine_similarity(dataset['Context'], retrieved_contexts_chunks)

In [399]:
np.mean(similarity_context_chunks)

0.7710459

### 2.8 Analysis 

In [408]:
# Average evaluation results

In [536]:
avg_similarity = {
                  'Answer similarity': [np.mean(similarity_no_context_answers), np.mean(similarity_answers), np.mean(similarity_answers_enhenced), np.mean(similarity_answers_chunks)],
                  'Context similarity':['-', np.mean(similarity_context), np.mean(similarity_context_enhenced), np.mean(similarity_context_chunks)]
}
idx = ['No context', 'RAG sentences-based context', 'RAG sentences-based context with mininum length', 'RAG N-tokens-based context']

In [537]:
df = pd.DataFrame(avg_similarity, index=idx)

In [538]:
df

Unnamed: 0,Answer similarity,Context similarity
No context,0.569551,-
RAG sentences-based context,0.625027,0.68319
RAG sentences-based context with mininum length,0.66182,0.722181
RAG N-tokens-based context,0.665909,0.771046


In [108]:
rouge_scores = {'rouge1_precision': ['-', rouge_socres_sentence_context['rouge1_precision'], rouge_socres_sentence_context_enhanced['rouge1_precision'], rouge_socres_context_chunks['rouge1_precision']],
                'rouge1_recall': ['-', rouge_socres_sentence_context['rouge1_recall'], rouge_socres_sentence_context_enhanced['rouge1_recall'], rouge_socres_context_chunks['rouge1_recall']],
                'rouge1_fmeasure': ['-', rouge_socres_sentence_context['rouge1_fmeasure'], rouge_socres_sentence_context_enhanced['rouge1_fmeasure'], rouge_socres_context_chunks['rouge1_fmeasure']],
                'rougeL_precision': ['-', rouge_socres_sentence_context['rougeL_precision'], rouge_socres_sentence_context_enhanced['rougeL_precision'], rouge_socres_context_chunks['rougeL_precision']],
                'rougeL_recall': ['-', rouge_socres_sentence_context['rougeL_recall'], rouge_socres_sentence_context_enhanced['rougeL_recall'], rouge_socres_context_chunks['rougeL_recall']],
                'rougeL_fmeasure': ['-', rouge_socres_sentence_context['rougeL_fmeasure'], rouge_socres_sentence_context_enhanced['rougeL_fmeasure'], rouge_socres_context_chunks['rougeL_fmeasure']]
               }

In [109]:
idx = ['No context', 'RAG sentences-based context', 'RAG sentences-based context with mininum length', 'RAG N-tokens-based context']
rouge_df = pd.DataFrame(rouge_scores, index = idx)

In [110]:
rouge_df

Unnamed: 0,rouge1_precision,rouge1_recall,rouge1_fmeasure,rougeL_precision,rougeL_recall,rougeL_fmeasure
No context,-,-,-,-,-,-
RAG sentences-based context,0.771368,0.261132,0.32726,0.703722,0.234375,0.296405
RAG sentences-based context with mininum length,0.692367,0.366622,0.41454,0.589782,0.31006,0.352401
RAG N-tokens-based context,0.762069,0.392051,0.480376,0.653887,0.335779,0.410356
