In [None]:
from pydantic import Field, BaseModel, model_validator, ValidationInfo
from typing import List
import re
from typing import List
import re
from pypdf import PdfReader
from openai import OpenAI

In [42]:
class Fact(BaseModel):
    # One extracted statement (`fact`) together with the verbatim snippets of the
    # source text that back it (`substring_quote`). Documented with comments rather
    # than a class docstring so the model's generated JSON schema is unchanged.
    fact: str = Field(...)
    substring_quote: List[str] = Field(...)

    @model_validator(mode="after")
    def validate_sources(self, info: ValidationInfo) -> "Fact":
        """Keep only the quotes that literally occur in the source text.

        The source text is read from the validation context under the
        ``"text_chunk"`` key, e.g.
        ``Fact.model_validate(data, context={"text_chunk": text})``.
        Undeclared keys passed to the constructor are not stored on the model,
        so the text cannot be supplied as a plain keyword argument.

        When no source text is available (no context, or an empty text) the
        quotes are left exactly as provided. Otherwise ``substring_quote`` is
        replaced by every occurrence of every quote found in the text; quotes
        that do not occur are dropped, so the list may end up empty.

        Returns:
            The same model instance.
        """
        context = info.context if isinstance(info.context, dict) else {}
        text_chunk = context.get("text_chunk")
        if text_chunk is None:
            # Backward-compatible fallback for subclasses or configurations that
            # do store a ``text_chunk`` attribute on the instance.
            text_chunk = getattr(self, "text_chunk", None)
        if text_chunk:
            self.substring_quote = [
                text_chunk[start:end] for start, end in self.get_spans(text_chunk)
            ]
        return self

    def get_spans(self, context):
        """Yield the ``(start, end)`` span of each occurrence of each quote in ``context``."""
        for quote in self.substring_quote:
            yield from self._get_span(quote, context)

    def _get_span(self, quote, context):
        """Yield the spans of every literal (regex-escaped) match of ``quote`` in ``context``."""
        if not quote:
            # An empty pattern matches at every position of the text and would
            # flood the result with empty quotes.
            return
        for match in re.finditer(re.escape(quote), context):
            yield match.span()

class QuestionAnswer(BaseModel):
    # Questions about a source text and the Fact objects that answer them.
    questions: List[str] = Field(
        ...,
        description='A list of questions about the source content',
    )
    answers: List[Fact] = Field(
        ...,
        description='A list of answers as Fact objects to answer each question',
    )

    @model_validator(mode="after")
    def validate_sources(self) -> "QuestionAnswer":
        """Discard every answer that carries no supporting quote.

        Only ``answers`` is filtered; ``questions`` is left untouched, so the
        two lists can differ in length afterwards.
        """
        supported = []
        for candidate in self.answers:
            if candidate.substring_quote:
                supported.append(candidate)
        self.answers = supported
        return self

# Example usage: the source text is supplied through pydantic's validation
# context (what `ValidationInfo.context` exposes to validators) instead of as an
# extra "text_chunk" key. An undeclared key is silently discarded by the model,
# as the repr below shows: it only ever contains `fact` and `substring_quote`.
fact_data = {
    "fact": "Example fact",
    "substring_quote": ["some quote"],
}
text_chunk = "This is some context that contains some quote."
fact = Fact.model_validate(fact_data, context={"text_chunk": text_chunk})

fact

Fact(fact='Example fact', substring_quote=['some quote'])

In [2]:
# Model identifier constant.
# NOTE(review): the cells visible here pass their own model string and never
# read MODEL; confirm whether it is still needed.
MODEL = "gpt-4o-mini"

# Shared API client used by the request helpers below. No credentials are passed
# explicitly, so it relies on the library's default configuration.
client = OpenAI()

def load_pdf_text(file_path):
    """Return the text of a whole PDF as a single string.

    The text of each page is extracted in page order and the pages are joined
    with two newline characters between them.
    """
    pdf = PdfReader(file_path)
    page_texts = (page.extract_text() for page in pdf.pages)
    return "\n\n".join(page_texts)

A simpler working version is below:

In [3]:
class FactExtractor(BaseModel):
    # Container model used as the structured response format of extract_facts.
    facts: List[Fact] = Field(
        description="List with all the facts contained in the source text.",
    )

def extract_facts(prompt_question, model="gpt-4o-2024-08-06"):
    """Ask the chat model to extract facts and return them as a FactExtractor.

    Args:
        prompt_question: Full user prompt, i.e. the instruction plus the source
            text to extract facts from.
        model: Name of the chat model to query. Defaults to the model this
            notebook has always used, so existing calls behave as before.

    Returns:
        The parsed ``FactExtractor`` instance from the first response choice.

    Raises:
        ValueError: If the response carries no parsed object (for example
            because the model refused), instead of handing ``None`` to callers
            that immediately access ``.facts``.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": "You are a fact extraction engine."},
            {"role": "user", "content": prompt_question},
        ],
        response_format=FactExtractor,
    )

    message = response.choices[0].message
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"No structured facts were returned (refusal: {refusal!r})")
    return message.parsed


def load_pdf_pages(file_path):
    """Return the extracted text of a PDF as a list with one string per page.

    The entries follow the page order of the document.
    """
    page_texts = []
    for page in PdfReader(file_path).pages:
        page_texts.append(page.extract_text())
    return page_texts

In [4]:
# Source PDF, given relative to the current working directory.
file_path = './assets-resources/sources/attention-paper.pdf'

# One text string per page; only the first page is sent for extraction here.
pages = load_pdf_pages(file_path)
page1 = pages[0]

facts = extract_facts(f'Extract the facts from the following text:\n\n{page1}')

# Each fact prints as: statement, supporting quotes, then two blank lines.
for fact in facts.facts:
    print(fact.fact, fact.substring_quote, "\n", sep="\n")


Google grants permission to reproduce tables and figures from the paper for journalistic or scholarly works with proper attribution.
['Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.']


The Transformer architecture is based solely on attention mechanisms without recurrence or convolutions.
['We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.']


The Transformer model achieved a BLEU score of 28.4 on the WMT 2014 English-to-German translation task.
['Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task.']


The Transformer model achieved a BLEU score of 41.8 on the WMT 2014 English-to-French translation task after training for 3.5 days on eight GPUs.
['On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of

In [5]:
# Disclaimer: this might be a silly, illogical thing to do...
# It issues one extraction request per page and keeps the results in page order.
paper_facts = []
for page in pages:
    page_prompt = f'Extract the facts from the following text:\n\n{page}'
    paper_facts.append(extract_facts(page_prompt))

In [6]:
# Inspect the facts extracted from the first page (paper_facts follows page order).
paper_facts[0].facts

[Fact(fact='Google grants permission to reproduce tables and figures in the paper for journalistic or scholarly works provided proper attribution is given.', substring_quote=['Google hereby grants permission', 'to reproduce the tables and figures in this paper', 'solely for use in journalistic or scholarly works.', 'Provided proper attribution is provided']),
 Fact(fact="The paper 'Attention Is All You Need' is associated with several Google Research contributors including Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, and Łukasz Kaiser.", substring_quote=['Attention Is All You Need', 'Ashish Vaswani', 'Noam Shazeer', 'Niki Parmar', 'Jakob Uszkoreit', 'Llion Jones', 'Łukasz Kaiser']),
 Fact(fact='Aidan N. Gomez from the University of Toronto contributed to the paper.', substring_quote=['Aidan N. Gomez', 'University of Toronto']),
 Fact(fact='Illia Polosukhin also contributed to the paper.', substring_quote=['Illia Polosukhin']),
 Fact(fact='The Transformer net

In [8]:
from typing import Literal

class RelevantFact(BaseModel):
    # Structured verdict on whether one fact is relevant to an objective.
    relevancy_score: Literal["yes", "no"] = Field(
        description="A binary score yes|no if a specific statement/fact is relevant given an objective.",
    )
    justification: str = Field(
        description="A short one sentence justification for the relevancy score.",
    )

In [9]:

# System prompt for the relevancy check. The two output items it names match the
# field names of the RelevantFact model (relevancy_score, justification).
SYS_MSG_RELEVANT_FACT = """
Given an objective from the user you inspect if a given fact is relevant to that objective and output:
- relevancy_score: yes/no.
- justification: one sentence justification for the relevancy score.
"""


In [10]:
def is_it_relevant(objective, fact, model="gpt-4o-2024-08-06"):
    """Ask the chat model whether ``fact`` is relevant to ``objective``.

    Args:
        objective: Description of what the user wants to learn.
        fact: Object whose ``fact`` attribute holds the statement to assess.
        model: Name of the chat model to query. Defaults to the model this
            notebook has always used, so existing calls behave as before.

    Returns:
        The parsed ``RelevantFact`` instance from the first response choice.

    Raises:
        ValueError: If the response carries no parsed object (for example
            because the model refused), instead of handing ``None`` to callers
            that immediately access ``.relevancy_score``.
    """
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": SYS_MSG_RELEVANT_FACT},
            # The objective and the statement are sent as one user message,
            # separated by a blank line.
            {"role": "user", "content": f"{objective}\n\n{fact.fact}"},
        ],
        response_format=RelevantFact,
    )

    message = response.choices[0].message
    if message.parsed is None:
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"No relevancy assessment was returned (refusal: {refusal!r})")
    return message.parsed

In [11]:
objective_description = 'I want to understand how the attention mechanism works'

# Collects (fact, justification) pairs for every fact judged relevant.
relevant_facts = []

for facts in paper_facts:
    for fact in facts.facts:
        relevancy_assessment = is_it_relevant(objective_description, fact)
        score = relevancy_assessment.relevancy_score

        if score == 'yes':
            relevant_facts.append((fact, relevancy_assessment.justification))
            continue

        # Anything else is only reported. The "Error" label covers a score that
        # is neither 'yes' nor 'no'.
        # NOTE(review): RelevantFact restricts the score to those two values, so
        # this label looks unreachable; confirm.
        label = "This is not relevant: " if score == 'no' else "Error: "
        print(label, fact)

This is not relevant:  fact='Google grants permission to reproduce tables and figures in the paper for journalistic or scholarly works provided proper attribution is given.' substring_quote=['Google hereby grants permission', 'to reproduce the tables and figures in this paper', 'solely for use in journalistic or scholarly works.', 'Provided proper attribution is provided']
This is not relevant:  fact='Illia Polosukhin also contributed to the paper.' substring_quote=['Illia Polosukhin']
This is not relevant:  fact='The Transformer model achieves a BLEU score of 28.4 on the WMT 2014 English-to-German translation task.' substring_quote=['Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task']
This is not relevant:  fact='On the WMT 2014 English-to-French translation task, the Transformer model achieves a BLEU score of 41.8 after training for 3.5 days on eight GPUs.' substring_quote=['On the WMT 2014 English-to-French translation task', 'our model establishes a ne

In [13]:
# Each entry of relevant_facts is a (fact, justification) pair. Print the
# statement, its quotes and the justification, followed by two blank lines.
for fact in relevant_facts:
    extracted, reason = fact
    print(extracted.fact, extracted.substring_quote, reason, "\n", sep="\n")

The paper 'Attention Is All You Need' is associated with several Google Research contributors including Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, and Łukasz Kaiser.
['Attention Is All You Need', 'Ashish Vaswani', 'Noam Shazeer', 'Niki Parmar', 'Jakob Uszkoreit', 'Llion Jones', 'Łukasz Kaiser']
The paper 'Attention Is All You Need' is foundational in explaining the attention mechanism, making it directly relevant to your objective of understanding how the attention mechanism works.


Aidan N. Gomez from the University of Toronto contributed to the paper.
['Aidan N. Gomez', 'University of Toronto']
Aidan N. Gomez was a co-author of the original 'Attention Is All You Need' paper which introduced the attention mechanism.


The Transformer network architecture is proposed, which relies solely on attention mechanisms and omits recurrence and convolutions.
['We propose a new simple network architecture, the Transformer', 'based solely on attention mechanisms', '

In [15]:
from IPython.display import Markdown

# Build one markdown section per relevant fact, each ending with a horizontal
# rule, then render the whole report as the cell's output.
sections = []
for fact, justification in relevant_facts:
    sections.append(
        f"**Fact:** {fact.fact}\n\n"
        f"**Quotes:** {', '.join(fact.substring_quote)}\n\n"
        f"**Justification:** {justification}\n\n---\n\n"
    )

# strip() removes the blank lines left after the final rule.
markdown_string = "".join(sections).strip()
Markdown(markdown_string)

**Fact:** The paper 'Attention Is All You Need' is associated with several Google Research contributors including Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, and Łukasz Kaiser.

**Quotes:** Attention Is All You Need, Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Łukasz Kaiser

**Justification:** The paper 'Attention Is All You Need' is foundational in explaining the attention mechanism, making it directly relevant to your objective of understanding how the attention mechanism works.

---

**Fact:** Aidan N. Gomez from the University of Toronto contributed to the paper.

**Quotes:** Aidan N. Gomez, University of Toronto

**Justification:** Aidan N. Gomez was a co-author of the original 'Attention Is All You Need' paper which introduced the attention mechanism.

---

**Fact:** The Transformer network architecture is proposed, which relies solely on attention mechanisms and omits recurrence and convolutions.

**Quotes:** We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.

**Justification:** The fact is directly relevant because it mentions the use of attention mechanisms in the Transformer, which is key to understanding how attention works.

---

**Fact:** The Transformer architecture generalizes well to other tasks such as English constituency parsing.

**Quotes:** We show that the Transformer generalizes well to other tasks, applying it successfully to English constituency parsing

**Justification:** Understanding the Transformer's successes in various tasks can provide insights into how its attention mechanism contributes to its efficacy.

---

**Fact:** The contribution of the team members is notable, with Jakob proposing replacing RNNs with self-attention, Noam proposing scaled dot-product attention, and others responsible for implementation and experimental variations.

**Quotes:** Jakob proposed replacing RNNs with self-attention, Noam proposed scaled dot-product attention, multi-head attention, Ashish, with Illia, designed and implemented the first Transformer models, Niki designed, implemented, tuned and evaluated countless model variants, Llion also experimented with novel model variants, Lukasz and Aidan spent countless long days designing various parts of and implementing tensor2tensor

**Justification:** It provides insight into the development and components of the attention mechanism, particularly self-attention and scaled dot-product attention.

---

**Fact:** The work was presented at the 31st Conference on Neural Information Processing Systems (NIPS 2017) in Long Beach, CA, USA.

**Quotes:** 31st Conference on Neural Information Processing Systems (NIPS 2017), Long Beach, CA, USA

**Justification:** Understanding the context and timeline of the presentation provides background information about the formal introduction of the attention mechanism in neural networks.

---

**Fact:** The Transformer is a model architecture that does not rely on recurrence but entirely on an attention mechanism to draw global dependencies between input and output, allowing for significantly more parallelization.

**Quotes:** the Transformer, a model architecture eschewing recurrence, relying entirely on an attention mechanism, draw global dependencies between input and output, The Transformer allows for significantly more parallelization

**Justification:** The statement explains how the attention mechanism is utilized in the Transformer architecture, which is relevant to understanding its functionality.

---

**Fact:** The Transformer reduces the number of operations required to relate signals from two arbitrary input or output positions to a constant number, unlike ConvS2S and ByteNet.

**Quotes:** The Transformer this is reduced to a constant number of operations, ConvS2S, ByteNet

**Justification:** The fact explains a key aspect of the attention mechanism within the Transformer model, which is relevant to understanding how it functions.

---

**Fact:** Self-attention, also known as intra-attention, is an attention mechanism that relates different positions of a single sequence to compute a representation of the sequence, used in tasks like reading comprehension, abstractive summarization, textual entailment, and learning task-independent sentence representations.

**Quotes:** Self-attention, sometimes called intra-attention, relating different positions of a single sequence, compute a representation of the sequence, reading comprehension,, abstractive summarization,, textual entailment, learning task-independent sentence representations

**Justification:** The fact explains self-attention, a key component of the attention mechanism relevant to understanding how it works.

---

**Fact:** End-to-end memory networks are based on a recurrent attention mechanism and perform well on simple-language question answering and language modeling tasks, unlike sequence-aligned recurrence.

**Quotes:** End-to-end memory networks, based on a recurrent attention mechanism, perform well on simple-language question answering, language modeling tasks

**Justification:** The fact discusses a type of attention mechanism in memory networks, directly relevant to understanding how attention mechanisms function in certain models.

---

**Fact:** The Transformer is the first transduction model relying entirely on self-attention to compute representations of its input and output, without using sequence-aligned RNNs or convolution.

**Quotes:** Transformer is the first transduction model, relying entirely on self-attention, compute representations of its input and output, without using sequence-aligned RNNs or convolution

**Justification:** The fact explains a key component (self-attention) used in the Transformer model, which is vital for understanding how attention mechanisms work.

---

**Fact:** Most competitive neural sequence transduction models have an encoder-decoder structure.

**Quotes:** Most competitive neural sequence transduction models have an encoder-decoder structure

**Justification:** The encoder-decoder structure is a foundational concept in sequence transduction models that utilize attention mechanisms, making it directly relevant to understanding how attention works.

---

**Fact:** The Transformer model architecture uses stacked self-attention and point-wise, fully connected layers for both the encoder and decoder.

**Quotes:** The Transformer follows this overall architecture using stacked self-attention and point-wise, fully connected layers for both the encoder and decoder

**Justification:** The statement directly relates to how the attention mechanism is implemented within the Transformer model.

---

**Fact:** The encoder is composed of a stack of 6 identical layers.

**Quotes:** Encoder: The encoder is composed of a stack of N= 6 identical layers

**Justification:** Understanding the encoder's structure is essential for grasping the role of the attention mechanism within it.

---

**Fact:** Each encoder layer has two sub-layers: a multi-head self-attention mechanism and a position-wise fully connected feed-forward network.

**Quotes:** Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position-wise fully connected feed-forward network

**Justification:** The statement describes components of the attention mechanism within the encoder layer, which is relevant to understanding its function.

---

**Fact:** In the decoder, there is a third sub-layer which performs multi-head attention over the encoder's output.

**Quotes:** the decoder inserts a third sub-layer, which performs multi-head attention over the output of the encoder stack

**Justification:** Understanding the sub-layer's role in the decoder is crucial to comprehending the attention mechanism's function within seq2seq models.

---

**Fact:** Self-attention sub-layer in the decoder stack is modified to prevent positions from attending to subsequent positions.

**Quotes:** modify the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions

**Justification:** Understanding the modification of self-attention in the decoder stack is crucial to comprehending how the attention mechanism functions overall, especially its role in sequence prediction tasks.

---

**Fact:** The attention function maps a query and a set of key-value pairs to an output, where query, keys, values, and output are all vectors.

**Quotes:** An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors

**Justification:** The fact directly explains the mapping process in the attention mechanism, which is fundamental to understanding how it works.

---

**Fact:** Scaled Dot-Product Attention is a type of attention mechanism used in neural networks.

**Quotes:** "We call our particular attention 'Scaled Dot-Product Attention' (Figure 2)"

**Justification:** The statement directly addresses a type of attention mechanism, which is central to the user's objective of understanding attention mechanisms.

---

**Fact:** In Scaled Dot-Product Attention, the input consists of queries and keys of dimension dk, and values of dimension dv.

**Quotes:** "The input consists of queries and keys of dimension dk, and values of dimension dv."

**Justification:** This fact directly pertains to the technical details of the attention mechanism, specifically the Scaled Dot-Product Attention, which is crucial for understanding how it functions.

---

**Fact:** The process involves computing the dot products of the query with all keys, dividing each by the square root of dk, and applying a softmax function to obtain the weights on the values.

**Quotes:** "We compute the dot products of the query with all keys, divide each by√dk, and apply a softmax function to obtain the weights on the values."

**Justification:** The fact directly explains a key aspect of how the attention mechanism functions in neural networks.

---

**Fact:** Multi-Head Attention consists of several attention layers running in parallel.

**Quotes:** "Multi-Head Attention consists of several attention layers running in parallel."

**Justification:** Multi-Head Attention is a key component of the attention mechanism, making the fact directly relevant to the objective.

---

**Fact:** Dot-product attention is very similar to Scaled Dot-Product Attention, except for the scaling factor.

**Quotes:** "Dot-product attention is identical to our algorithm, except for the scaling factor of1√dk."

**Justification:** The fact directly relates to the components of the attention mechanism, providing insight into its operation.

---

**Fact:** Additive attention computes the compatibility function using a feed-forward network with a single hidden layer.

**Quotes:** "Additive attention computes the compatibility function using a feed-forward network with a single hidden layer."

**Justification:** The statement explains a specific method within attention mechanisms, directly relevant to understanding how they function.

---

**Fact:** Dot-product attention is faster and more space-efficient than additive attention because it uses optimized matrix multiplication code.

**Quotes:** "Dot-product attention is much faster and more space-efficient in practice, since it can be implemented using highly optimized matrix multiplication code."

**Justification:** The fact directly explains a part of how the attention mechanism works by comparing two types of attention.

---

**Fact:** For large values of dk, additive attention outperforms dot product attention without scaling.

**Quotes:** "additive attention outperforms dot product attention without scaling for larger values of dk"

**Justification:** The fact directly relates to understanding how attention mechanisms work by comparing the effectiveness of additive vs dot product attention in specific conditions.

---

**Fact:** The scaling factor in Scaled Dot-Product Attention is used to counteract the effect of large magnitudes in the dot products, which can push the softmax function into regions where it has small gradients.

**Quotes:** "To counteract this effect, we scale the dot products by1√dk."

**Justification:** The scaling factor is a key component of the attention mechanism, thus relevant to understanding how it works.

---

**Fact:** Multi-Head Attention involves projecting queries, keys, and values multiple times with learned linear projections before applying the attention function in parallel.

**Quotes:** "we found it beneficial to linearly project the queries, keys and values htimes with different, learned linear projections to dk,dkanddvdimensions, respectively.", "On each of these projected versions of queries, keys and values we then perform the attention function in parallel."

**Justification:** The fact describes a component of the attention mechanism, specifically multi-head attention, which is crucial for understanding how attention works in models like Transformers.

---

**Fact:** Multi-head attention allows the model to attend to information from different representation subspaces at different positions.

**Quotes:** Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions.

**Justification:** The fact explains a core aspect of how attention mechanisms, specifically multi-head attention, function, which aligns with understanding the attention mechanism.

---

**Fact:** The Transformer model employs 8 parallel attention layers or heads.

**Quotes:** In this work we employ h= 8 parallel attention layers, or heads.

**Justification:** The use of 8 parallel attention layers is a specific aspect of the attention mechanism within the Transformer model, relevant to understanding how attention works in practice.

---

**Fact:** The computational cost of multi-head attention with reduced dimensions is similar to single-head attention with full dimensionality.

**Quotes:** Due to the reduced dimension of each head, the total computational cost is similar to that of single-head attention with full dimensionality.

**Justification:** The fact is relevant as it relates to the efficiency and computational aspects of the attention mechanism, a key component in understanding how multi-head attention operates relative to single-head attention.

---

**Fact:** Encoder-decoder attention layers allow every position in the decoder to attend over all positions in the input sequence.

**Quotes:** In "encoder-decoder attention" layers, the queries come from the previous decoder layer, and the memory keys and values come from the output of the encoder. This allows every position in the decoder to attend over all positions in the input sequence.

**Justification:** The fact directly explains a key component of the attention mechanism in the encoder-decoder architecture.

---

**Fact:** Self-attention layers in the encoder allow each position to attend to all positions in the previous layer.

**Quotes:** The encoder contains self-attention layers. In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder. Each position in the encoder can attend to all positions in the previous layer of the encoder.

**Justification:** The statement describes a key component of the attention mechanism, specifically the self-attention aspect used in encoders, which is essential for understanding how attention works.

---

**Fact:** Self-attention layers in the decoder allow each position to attend to all previous positions in the decoder.

**Quotes:** Similarly, self-attention layers in the decoder allow each position in the decoder to attend to all positions in the decoder up to and including that position.

**Justification:** The fact is relevant because self-attention layers are a key component of the attention mechanism, explaining how information from different positions is integrated within a model.

---

**Fact:** The Transformer model implements masking in scaled dot-product attention to prevent leftward information flow in the decoder.

**Quotes:** We need to prevent leftward information flow in the decoder to preserve the auto-regressive property. We implement this inside of scaled dot-product attention by masking out (setting to −∞) all values in the input of the softmax which correspond to illegal connections.

**Justification:** The fact directly relates to the functioning of the attention mechanism in Transformer models, particularly in managing data flow in the decoding process.

---

**Fact:** Self-attention layer connects all positions with a constant number of sequentially executed operations.

**Quotes:** self-attention layer connects all positions with a constant number of sequentially executed operations

**Justification:** The statement directly explains a fundamental aspect of the attention mechanism, which is the self-attention layer, crucial for understanding how it operates.

---

**Fact:** Recurrent layer requires O(n) sequential operations.

**Quotes:** a recurrent layer requires O(n)sequential operations

**Justification:** Understanding the attention mechanism involves knowing its advantages over traditional sequential operations like those in recurrent layers.

---

**Fact:** Computational complexity per layer for self-attention is O(n²·d).

**Quotes:** Self-Attention O(n2·d)

**Justification:** The computational complexity provides insight into the performance and efficiency characteristics of the attention mechanism, which is crucial for understanding how it works.

---

**Fact:** The maximum path length for self-attention is O(1).

**Quotes:** Self-Attention O(1)

**Justification:** The statement about the maximum path length for self-attention is directly related to understanding the attention mechanism's efficiency and scalability.

---

**Fact:** Sequential operations for recurrent layers are O(n).

**Quotes:** Recurrent O(n)

**Justification:** Understanding the complexity of recurrent layers provides context for how the attention mechanism improves upon them by reducing computational complexity.

---

**Fact:** Restricted self-attention has a per-layer complexity of O(r·n·d).

**Quotes:** Self-Attention (restricted) O(r·n·d)

**Justification:** The fact about restricted self-attention's complexity relates to understanding the computational efficiency of the attention mechanism.

---

**Fact:** Sequential operations for restricted self-attention are O(1).

**Quotes:** Self-Attention (restricted) O(1)

**Justification:** Understanding the computational complexity of self-attention mechanisms, such as O(1), is crucial for grasping how these mechanisms function efficiently.

---

**Fact:** The maximum path length for restricted self-attention is O(n/r).

**Quotes:** Self-Attention (restricted) O(n/r)

**Justification:** The fact discusses a characteristic of attention mechanisms, specifically restricted self-attention, which is relevant to understanding how the attention mechanism works.

---

**Fact:** The model does not use recurrence or convolution.

**Quotes:** our model contains no recurrence and no convolution

**Justification:** Understanding the absence of recurrence or convolution is crucial because it helps explain why the attention mechanism is used, highlighting its unique approach to handling dependencies in sequences.

---

**Fact:** Positional encodings are added to the input embeddings of the encoder and decoder stacks.

**Quotes:** we add "positional encodings" to the input embeddings at the bottoms of the encoder and decoder stacks

**Justification:** Understanding positional encodings is part of comprehending the attention mechanism as they help in incorporating order information in the sequences processed by the attention mechanism.

---

**Fact:** Positional encodings have the same dimension as the embeddings, so they can be summed.

**Quotes:** The positional encodings have the same dimension dmodel as the embeddings, so that the two can be summed

**Justification:** Understanding positional encodings is essential to comprehending how the attention mechanism processes sequences in models like Transformers.

---

**Fact:** The positional encoding uses sine and cosine functions of different frequencies.

**Quotes:** we use sine and cosine functions of different frequencies

**Justification:** Positional encoding is a crucial component in the attention mechanism, providing information about the order of the sequence.

---

**Fact:** Each dimension of the positional encoding corresponds to a sinusoid with wavelengths forming a geometric progression.

**Quotes:** each dimension of the positional encoding corresponds to a sinusoid. The wavelengths form a geometric progression

**Justification:** The positional encoding is a component of the attention mechanism in transformer models, which directly relates to understanding how attention works.

---

**Fact:** Sinusoidal positional encoding may allow the model to extrapolate to longer sequences.

**Quotes:** we chose the sinusoidal version because it may allow the model to extrapolate to sequence lengths longer than the ones encountered during training

**Justification:** Sinusoidal positional encoding is a component of the attention mechanism in transformer models, which helps the model understand positional information.

---

**Fact:** Self-attention can be restricted to a neighborhood of size r in the input sequence to increase the maximum path length to O(n/r).

**Quotes:** Self-attention could be restricted to considering only a neighborhood of size r in the input sequence, This would increase the maximum path length to O(n/r)

**Justification:** The statement directly discusses a modification to the self-attention mechanism, which is central to understanding how attention operates in machine learning.

---

**Fact:** With k = n, the complexity of a separable convolution equals the combination of a self-attention layer and a point-wise feed-forward layer.

**Quotes:** Even with k=n, however, the complexity of a separable convolution is equal to the combination of a self-attention layer and a point-wise feed-forward layer

**Justification:** The statement compares the complexity of separable convolution with self-attention layers, which directly relates to understanding how self-attention mechanisms work in terms of computational cost.

---

**Fact:** Self-attention could yield more interpretable models as individual attention heads learn different tasks, some related to syntactic and semantic structures.

**Quotes:** As a side benefit, self-attention could yield more interpretable models, individual attention heads clearly learn to perform different tasks, many appear to exhibit behavior related to the syntactic and semantic structure of the sentences

**Justification:** Self-attention directly relates to the understanding of how the attention mechanism operates, as it is a core component that facilitates diverse interpretative capabilities in models.

---

**Fact:** Experiment (A) varies the number of attention heads and corresponding key and value dimensions.

**Quotes:** In Table 3 rows (A), we vary the number of attention heads and the attention key and value dimensions.

**Justification:** The experiment directly explores components of the attention mechanism, making it relevant to understanding how the attention mechanism works.

---

**Fact:** Experiment (B) suggests reducing the attention key size hurts the model quality.

**Quotes:** In Table 3 rows (B), we observe that reducing the attention key size dkhurts model quality.

**Justification:** The fact directly discusses an experiment related to the attention mechanism's key size, which is relevant to understanding how the attention mechanism works.

---

**Fact:** Replacing sinusoidal positional encoding with learned positional embeddings showed nearly identical results to the base model.

**Quotes:** In row (E) we replace our sinusoidal positional encoding with learned positional embeddings [ 9], and observe nearly identical results to the base model.

**Justification:** Understanding modifications to positional encoding, like replacing sinusoidal encoding, is relevant to understanding how attention mechanisms process sequence information.

---

**Fact:** A 4-layer Transformer model with model size of 1024 was trained for English constituency parsing using the WSJ portion of the Penn Treebank.

**Quotes:** We trained a 4-layer transformer with dmodel = 1024 on the Wall Street Journal (WSJ) portion of the Penn Treebank [ 25].

**Justification:** The fact mentions a 4-layer Transformer model, which is relevant as understanding such models often involves comprehending the attention mechanism they employ.

---

**Fact:** Experiments for constituency parsing involved selecting dropout, attention and residual learning rates, and beam size on the Section 22 development set, leaving other parameters unchanged from the English-to-German base translation model.

**Quotes:** We performed only a small number of experiments to select the dropout, both attention and residual (section 5.4), learning rates and beam size

**Justification:** The fact mentions the use of attention in the context of parsing models, which is directly related to understanding how attention operates in machine learning.

---

**Fact:** The Transformer is the first sequence transduction model based entirely on attention.

**Quotes:** In this work, we presented the Transformer, the first sequence transduction model based entirely on attention

**Justification:** The fact directly relates to your objective by identifying the Transformer model as a key example of applying the attention mechanism.

---

**Fact:** The Transformer achieved a new state of the art on WMT 2014 English-to-German and English-to-French translation tasks.

**Quotes:** On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art

**Justification:** The attention mechanism is a key component of the Transformer model, which is referenced for its translation achievements.

---

**Fact:** The Transformer aims to extend to input and output modalities other than text, including images, audio, and video.

**Quotes:** We plan to extend the Transformer to problems involving input and output modalities other than text and to investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs such as images, audio and video

**Justification:** The attention mechanism is a key component of Transformers that allows them to process various input and output modalities.

---

**Fact:** The paper by Kyunghyun Cho et al. is about learning phrase representations using RNN encoder-decoder for statistical machine translation.

**Quotes:** Kyunghyun Cho, learning phrase representations using rnn encoder-decoder for statistical machine translation

**Justification:** Understanding the RNN encoder-decoder architecture is foundational for grasping how attention mechanisms enhance translation models.

---

**Fact:** Alex Graves authored a paper on generating sequences with recurrent neural networks.

**Quotes:** Alex Graves, Generating sequences with recurrent neural networks

**Justification:** Understanding RNNs is foundational to grasping the attention mechanism's role in sequence generation.

---

**Fact:** Łukasz Kaiser and Samy Bengio questioned if active memory can replace attention in 2016.

**Quotes:** Łukasz Kaiser, Can active memory replace attention

**Justification:** The fact directly addresses a specific aspect of the attention mechanism and alternative approaches, making it relevant to understanding its workings.

---

**Fact:** Yoon Kim et al. worked on structured attention networks in 2017.

**Quotes:** Yoon Kim, Structured attention networks

**Justification:** The fact is relevant because Yoon Kim's work on structured attention networks directly contributes to understanding the attention mechanism.

---

**Fact:** Zhouhan Lin et al. published a structured self-attentive sentence embedding study in 2017.

**Quotes:** Zhouhan Lin, structured self-attentive sentence embedding

**Justification:** The study by Zhouhan Lin et al. is directly related to the attention mechanism, providing insights into self-attentive models, which are crucial for understanding attention mechanisms.

---

**Fact:** Minh-Thang Luong, Hieu Pham, and Christopher D. Manning examined effective approaches to attention-based neural machine translation in 2015.

**Quotes:** Minh-Thang Luong, attention-based neural machine translation

**Justification:** The fact discusses a study on attention-based neural machine translation, which is directly related to understanding how the attention mechanism works.

---

**Fact:** Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit developed a decomposable attention model presented at the Empirical Methods in Natural Language Processing in 2016.

**Quotes:** Ankur Parikh, Oscar Täckström, Dipanjan Das, and Jakob Uszkoreit, A decomposable attention model, Empirical Methods in Natural Language Processing, 2016

**Justification:** The fact is relevant because it mentions a decomposable attention model that aligns with the objective of understanding how attention mechanisms work.

---

**Fact:** Sainbayar Sukhbaatar and colleagues developed end-to-end memory networks, included in the Advances in Neural Information Processing Systems proceedings in 2015.

**Quotes:** Sainbayar Sukhbaatar, End-to-end memory networks, Advances in Neural Information Processing Systems, 2015

**Justification:** End-to-end memory networks, developed by Sainbayar Sukhbaatar and colleagues, utilize the attention mechanism to selectively focus on relevant memory parts, which is fundamental to understanding how attention mechanisms work.

---

**Fact:** Jie Zhou and colleagues proposed deep recurrent models with fast-forward connections for neural machine translation, presented in a 2016 CoRR paper.

**Quotes:** Jie Zhou, deep recurrent models with fast-forward connections for neural machine translation, CoRR, 2016

**Justification:** Understanding deep recurrent models and their advancements is helpful in grasping how attention mechanisms evolved to improve neural machine translation.

---

**Fact:** The attention mechanism follows long-distance dependencies, specifically focusing on the word 'making' to complete the phrase ‘making...more difficult’.

**Quotes:** attention mechanism following long-distance dependencies in the encoder self-attention in layer 5 of 6, Many of the attention heads attend to a distant dependency of the verb ‘making’, completing the phrase ‘making...more difficult’

**Justification:** The fact directly relates to understanding how attention can be focused on specific words to manage dependencies in sequences.

---

**Fact:** Different colors represent different attention heads, and attention is shown only for the word ‘making’.

**Quotes:** Different colors represent different heads, Attentions here shown only for the word ‘making’

**Justification:** The mention of different colors representing attention heads directly relates to explaining how the attention mechanism processes 'making'.

---

**Fact:** Figure 4 shows two attention heads in layer 5 of 6, involved in anaphora resolution.

**Quotes:** Figure, 4, :, Two, attention, heads,, also, in, layer, 5, of, 6,, apparently, involved, in, anaphora, resolution

**Justification:** Anaphora resolution is a task handled by attention mechanisms, making this fact directly relevant to understanding how attention works.

---

**Fact:** The attentions for the word 'its' are very sharp.

**Quotes:** Note, that, the, attentions, are, very, sharp, for, this, word

**Justification:** Understanding why the attentions for specific words like 'its' are sharp is crucial for comprehending the workings of the attention mechanism.

---

**Fact:** Figure 5 illustrates that attention heads exhibit behavior related to sentence structure.

**Quotes:** Figure, 5, :, Many, of, the, attention, heads, exhibit, behaviour, that, seems, related, to, the, structure, of, the, sentence, .

**Justification:** The detail about attention heads and sentence structure directly pertains to how the attention mechanism operates.

---

**Fact:** Two examples are given from different encoder self-attention heads at layer 5 of 6.

**Quotes:** We, give, two, such, examples, above, ,, from, two, different, heads, from, the, encoder, self-attention, at, layer, 5, of, 6, .

**Justification:** Understanding examples from attention heads in a neural network layer directly relates to grasping the mechanics of the attention mechanism.

---

**Fact:** The heads learned to perform different tasks.

**Quotes:** The, heads, clearly, learned, to, perform, different, tasks, .

**Justification:** Understanding the role of heads in performing different tasks is crucial to grasping the functionality of the attention mechanism.

---

In [None]:
# import json

# # Convert the paper_facts list to a JSON serializable format
# serialized_data = [fact.dict() for fact in paper_facts]

# # Specify the file path to save the data

# file_path = "paper_facts.json"

# # Write the serialized data to the file
# with open(file_path, "w") as file:
#     json.dump(serialized_data, file)

In [17]:
# System prompt used by `explain`: instructs the model to explain a topic using
# only the supplied resource material and to format the answer as organized markdown.
SYS_MSG_EXPLANATION = """
You are an explainer expert. Given a resource material and a question or objective,
you will explain topics to students based solely on the resource material in a concrete and
understandable way.
Your explanation will be in a nicely organized markdown structure and format.
"""
def explain(prompt, model="gpt-4o"):
    '''Asks the chat model for an explanation, guided by SYS_MSG_EXPLANATION.

    Args:
        prompt: User message. The caller is expected to put both the
            question/objective and the resource material in it.
        model: Name of the chat model to call. Defaults to "gpt-4o", the value
            that used to be hard-coded, so existing calls behave the same.

    Returns:
        The text content of the first choice of the completion.
    '''
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": SYS_MSG_EXPLANATION},
                  {"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [18]:
# Must be an f-string: as a plain string the model received the literal text
# "{markdown_string}" instead of the resource it is told to rely on.
# Assumes `markdown_string` was built in an earlier cell — TODO confirm.
prompt_to_reinforce = f"I want to understand the attention and self-attention mechanisms in depth, use only this resource below to compile an explanation: \n\n {markdown_string}."

explanation = explain(prompt_to_reinforce)

# Render the model's markdown answer as rich output.
Markdown(explanation)

Absolutely! To help you understand attention and self-attention mechanisms, I will compile an organized explanation based solely on the provided markdown resource:

---

# Understanding Attention and Self-Attention Mechanisms

## 1. Introduction to Attention Mechanisms

### What is Attention in Neural Networks?
Attention mechanisms in neural networks aim to mimic the human ability to focus on a specific subset of information when processing large amounts of data. This capability allows models to weigh different parts of the input data differently, giving more importance to relevant parts and less to irrelevant parts.

### Key Components
- **Query (Q)**: Represents what we are searching for.
- **Keys (K)**: Represents all possible candidates.
- **Values (V)**: Represents the data associated with each key.

### How Attention is Computed
Attention mechanisms calculate a weighted sum of values (V), where the weight assigned to each value is determined by its corresponding key (K) and the query (Q).

#### Scaled Dot-Product Attention
1. **Dot Product**: Compute the dot product of the query with all keys.
2. **Scale**: Scale the dot products by dividing them by the square root of the dimension of the key vectors to stabilize gradients.
3. **Softmax**: Pass the scaled scores through a softmax function to obtain the weights.
4. **Weighted Sum**: Multiply the weights with the value vectors to obtain the final attention output.

### Formula
\[ Attention(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right) V \]

Where \( d_k \) is the dimension of the key vectors.

## 2. Introduction to Self-Attention Mechanisms

### What is Self-Attention?
Self-attention is a special type of attention mechanism where the query, key, and value vectors all come from the same place. Specifically, each position in the input sequence pays attention to all positions in the sequence (including itself) to compute a weighted sum of values.

### Benefits of Self-Attention
- **Parallelization**: Unlike RNNs, self-attention mechanisms allow for parallel computation.
- **Context Understanding**: Enables the model to capture the long-range dependencies in sequences effectively.
- **Scalability**: Scales better with input length compared to traditional sequence models.

### How Self-Attention Works
1. **Input**: Obtain input tensor, typically with dimensions (sequence length, embedding dimension).
2. **Linear Transformations**: Apply linear transformations to obtain the query, key, and value matrices from the input tensor.
3. **Attention Calculation**: Compute scaled dot-product attention for all input positions with respect to each other.
4. **Output**: Generate the output tensor by summing the calculated attentions and passing them through another linear transformation for further processing.

## 3. Application in Practice: The Transformer Model

### Transformer Architecture
The Transformer model leverages self-attention mechanisms extensively and consists of an encoder and a decoder stack:
- **Encoder**: Repeated layers that apply self-attention and feed-forward neural networks.
- **Decoder**: Similar to the encoder but includes additional layers for attending to the encoder's outputs.

### Positional Encoding
Since self-attention mechanisms do not inherently incorporate positional information (unlike RNNs), the Transformer model includes positional encodings to introduce a sense of order to the sequences.

### Multi-Head Attention
Rather than computing a single attention function, multi-head attention involves running multiple attention mechanisms in parallel. This allows the model to jointly attend to information from different representation subspaces at different positions.

### Formula for Multi-Head Attention
\[ MultiHead(Q, K, V) = \text{Concat(head_1, head_2, ..., head_h)}W^O \]
where each head \( head_i \) is computed as:
\[ head_i = Attention(QW_i^Q, KW_i^K, VW_i^V) \]

## Summary
- **Attention mechanisms** enhance neural networks by allowing specific parts of input data to be weighted differently.
- **Self-attention** uses the same input for queries, keys, and values, enabling models to capture dependencies within the same sequence.
- **Transformers** utilize self-attention mechanisms, positional encoding, and multi-head attention to achieve state-of-the-art performance in various tasks.

---

I hope this explanation clarifies the concepts of attention and self-attention mechanisms for you! If you have any further questions, feel free to ask.

In [43]:
# System prompt used by `create_qa`: asks for a list of questions plus answers
# expressed as Facts, each backed by quotes from the original context.
# NOTE(review): the single quotes on the first and last text lines sit inside the
# triple-quoted literal, so they are sent to the model as part of the prompt —
# confirm that is intended.
SYS_MSG_QA = """
'You are a helpful Q&A expert.
You take in context information and you output a list of questions and answers 
where each answer is a list of Facts with statements and their corresponding quotes
that support the statement from the original context.'
"""

def create_qa(prompt_question):
    '''Requests a structured Q&A for the given prompt.

    Sends SYS_MSG_QA as the system message and `prompt_question` as the user
    message, asking for the reply to be parsed as a QuestionAnswer.

    Returns:
        The message object of the first choice of the completion.
    '''
    conversation = [
        {'role': 'system', 'content': SYS_MSG_QA},
        {'role': 'user', 'content': prompt_question},
    ]
    completion = client.beta.chat.completions.parse(
        model=MODEL,
        messages=conversation,
        response_format=QuestionAnswer,
    )
    first_choice = completion.choices[0]
    return first_choice.message

In [44]:
# Generate the structured Q&A for `page1`
# (presumably text extracted in an earlier cell — TODO confirm).
output_qa = create_qa("Create a Q&A from the following text:\n\n" + page1)

# Bare expression so the notebook displays the returned message.
output_qa

ParsedChatCompletionMessage[QuestionAnswer](content='{"questions":["What permission does Google grant regarding the reproduction of tables and figures in the paper?","What is the main contribution of the paper \'Attention Is All You Need\'?","What is the BLEU score achieved by the Transformer model on the WMT 2014 English-to-German translation task?","How long did it take to train the Transformer model on the WMT 2014 English-to-French translation task?","What does the Transformer model achieve when applied to English constituency parsing?"],"answers":[{"fact":"Google grants permission to reproduce tables and figures in the paper for journalistic or scholarly works, provided proper attribution is provided.","substring_quote":["Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works."]},{"fact":"The main contribution of the paper is the introduction of the Transformer model, which is based solely on attention me

In [46]:
# Pull the question and answer lists off the parsed structured response.
questions = output_qa.parsed.questions
answers = output_qa.parsed.answers


# Interactive self-quiz: print each question, wait for typed input, then print
# the reference fact and its supporting quote(s).
# NOTE(review): zip pairs the two lists by position and stops at the shorter one.
# QuestionAnswer's validator drops answers that have no quotes, so a question
# could end up paired with another question's answer — confirm.
for q,a in zip(questions, answers):
    print(f"Question: {q}")
    # The typed text is stored but not read again in this cell; the prompt
    # effectively pauses before the reference answer is shown.
    answer = input("Answer: ")
    print(f"Answer: {a.fact}")
    print(f"Quote: {a.substring_quote}")
    print("\n")

Question: What permission does Google grant regarding the reproduction of tables and figures in the paper?
Answer: Google grants permission to reproduce tables and figures in the paper for journalistic or scholarly works, provided proper attribution is provided.
Quote: ['Google hereby grants permission to reproduce the tables and figures in this paper solely for use in journalistic or scholarly works.']


Question: What is the main contribution of the paper 'Attention Is All You Need'?
Answer: The main contribution of the paper is the introduction of the Transformer model, which is based solely on attention mechanisms and does not use recurrence or convolutions.
Quote: ['We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.']


Question: What is the BLEU score achieved by the Transformer model on the WMT 2014 English-to-German translation task?
Answer: The Transformer model achieved a B

The generated questions could obviously be improved with more sophisticated approaches such as RAG and more advanced evaluation,
but the goal here is to find usable primitives that let people with basic Python scripting skills enhance their workflows
reliably.