In [35]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from jsonlines import jsonlines

from IPython.display import display, Markdown
from pprint import pprint


In [36]:
musique_dir = '../musique/musique/'


In [37]:
import panel as pn
pn.extension()

# Explore

### Data JSONLines

In [38]:
with jsonlines.open(musique_dir + 'data/musique_full_v1.0_train.jsonl') as reader:
    num_lines = sum(1 for _ in reader)

print(f'Number of lines in the file: {num_lines}')


Number of lines in the file: 39876


In [39]:
with jsonlines.open(musique_dir + '/data/musique_full_v1.0_train.jsonl') as reader:
    lines = [reader.read() for _ in range(1000)]
display(Markdown('**Line Example**'), pprint(lines[1]))


{'answer': 'north',
 'answer_aliases': ['North', 'N'],
 'answerable': True,
 'id': '2hop__269805_135710',
 'paragraphs': [{'idx': 0,
                 'is_supporting': False,
                 'paragraph_text': 'Milton F. Pavlic (1909–1942) was a United '
                                   'States Navy officer killed in action '
                                   'during World War II for whom a U.S. Navy '
                                   'high-speed transport was named.',
                 'title': 'Milton F. Pavlic'},
                {'idx': 1,
                 'is_supporting': False,
                 'paragraph_text': 'Osmund Holm-Hansen (also known as Oz '
                                   'Holm-Hansen) is a Norwegian-born American '
                                   'scientist, for whom Mount Holm-Hansen, in '
                                   'Antarctica is named. A plant physiologist '
                                   'by training, from 1962 Holm-Hansen was the '
           

**Line Example**

None

### Answerable/unanswerable lines

In [40]:
unanswerable_lines = [line for line in lines if not line['answerable']]
num_unanswerable_lines = len(unanswerable_lines)
print(f'Number of unanswerable lines: {num_unanswerable_lines}')


Number of unanswerable lines: 494


In [41]:
lines[1]

{'id': '2hop__269805_135710',
 'paragraphs': [{'idx': 0,
   'title': 'Milton F. Pavlic',
   'paragraph_text': 'Milton F. Pavlic (1909–1942) was a United States Navy officer killed in action during World War II for whom a U.S. Navy high-speed transport was named.',
   'is_supporting': False},
  {'idx': 1,
   'title': 'Osmund Holm-Hansen',
   'paragraph_text': 'Osmund Holm-Hansen (also known as Oz Holm-Hansen) is a Norwegian-born American scientist, for whom Mount Holm-Hansen, in Antarctica is named. A plant physiologist by training, from 1962 Holm-Hansen was the head of polar research at the Scripps Institution of Oceanography.',
   'is_supporting': False},
  {'idx': 2,
   'title': 'Sapphire Princess',
   'paragraph_text': '"Sapphire Princess" was built in Japan by Mitsubishi Heavy Industries, the second Princess Cruises ship to be built in a Japanese shipyard. Her only sister ship is "Diamond Princess", with whom she swapped names during construction.',
   'is_supporting': False},
  {'

### Tokenizer

In [42]:
import tiktoken

display(Markdown('**Token models**'), tiktoken.model.MODEL_TO_ENCODING)
tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')
display(Markdown('**Tokenizer we are using**'), tokenizer)


**Token models**

{'gpt-4': 'cl100k_base',
 'gpt-3.5-turbo': 'cl100k_base',
 'gpt-3.5': 'cl100k_base',
 'gpt-35-turbo': 'cl100k_base',
 'davinci-002': 'cl100k_base',
 'babbage-002': 'cl100k_base',
 'text-embedding-ada-002': 'cl100k_base',
 'text-embedding-3-small': 'cl100k_base',
 'text-embedding-3-large': 'cl100k_base',
 'text-davinci-003': 'p50k_base',
 'text-davinci-002': 'p50k_base',
 'text-davinci-001': 'r50k_base',
 'text-curie-001': 'r50k_base',
 'text-babbage-001': 'r50k_base',
 'text-ada-001': 'r50k_base',
 'davinci': 'r50k_base',
 'curie': 'r50k_base',
 'babbage': 'r50k_base',
 'ada': 'r50k_base',
 'code-davinci-002': 'p50k_base',
 'code-davinci-001': 'p50k_base',
 'code-cushman-002': 'p50k_base',
 'code-cushman-001': 'p50k_base',
 'davinci-codex': 'p50k_base',
 'cushman-codex': 'p50k_base',
 'text-davinci-edit-001': 'p50k_edit',
 'code-davinci-edit-001': 'p50k_edit',
 'text-similarity-davinci-001': 'r50k_base',
 'text-similarity-curie-001': 'r50k_base',
 'text-similarity-babbage-001': 'r50k_b

**Tokenizer we are using**

<Encoding 'cl100k_base'>

In [43]:
test_line = lines[1]
test_paragraphs = test_line['paragraphs']

display(Markdown('**Paragraph Example**'), test_paragraphs[0])
test_tokens = tokenizer.encode(test_paragraphs[0]['paragraph_text'])
display(Markdown('**Tokens**'), test_tokens)
display(Markdown('**Number of Tokens**'), len(test_tokens))



**Paragraph Example**

{'idx': 0,
 'title': 'Milton F. Pavlic',
 'paragraph_text': 'Milton F. Pavlic (1909–1942) was a United States Navy officer killed in action during World War II for whom a U.S. Navy high-speed transport was named.',
 'is_supporting': False}

**Tokens**

[44,
 16695,
 435,
 13,
 43856,
 416,
 320,
 7028,
 24,
 4235,
 6393,
 17,
 8,
 574,
 264,
 3723,
 4273,
 19574,
 9640,
 7577,
 304,
 1957,
 2391,
 4435,
 5111,
 8105,
 369,
 8884,
 264,
 549,
 815,
 13,
 19574,
 1579,
 30699,
 7710,
 574,
 7086,
 13]

**Number of Tokens**

39

In [44]:
for paragraph in test_paragraphs:
    paragraph_text = paragraph['paragraph_text']
    paragraph_tokens = tokenizer.encode(paragraph_text)
    print(f'Number of tokens in paragraph: {len(paragraph_tokens)}')

Number of tokens in paragraph: 39
Number of tokens in paragraph: 68
Number of tokens in paragraph: 45
Number of tokens in paragraph: 64
Number of tokens in paragraph: 59
Number of tokens in paragraph: 131
Number of tokens in paragraph: 176
Number of tokens in paragraph: 59
Number of tokens in paragraph: 71
Number of tokens in paragraph: 86
Number of tokens in paragraph: 42
Number of tokens in paragraph: 36
Number of tokens in paragraph: 102
Number of tokens in paragraph: 61
Number of tokens in paragraph: 58
Number of tokens in paragraph: 57
Number of tokens in paragraph: 39
Number of tokens in paragraph: 35
Number of tokens in paragraph: 59
Number of tokens in paragraph: 48


In [45]:
pn.pane.JSON(lines[5])

BokehModel(combine_events=True, render_bundle={'docs_json': {'7099e600-3100-41ea-8fec-bc5a3e19a006': {'version…

### Prediction format

In [46]:
display(Markdown('**Examples of predictions**'))
with jsonlines.open(musique_dir + 'predictions/musique_ans_v1.0_dev_end2end_model_predictions.jsonl', 'r') as file:
    for i in range(5):
        display(pprint(file.read()))

**Examples of predictions**

{'id': '2hop__460946_294723',
 'predicted_answer': 'Jennifer Garner',
 'predicted_answerable': True,
 'predicted_support_idxs': [0, 10]}


None

{'id': '2hop__252311_366220',
 'predicted_answer': 'Steven Spielberg',
 'predicted_answerable': True,
 'predicted_support_idxs': [10, 18]}


None

{'id': '2hop__701895_752697',
 'predicted_answer': 'Cypriot part was merged into the Bank of Cyprus '
                     '(including insured deposits under 100,000 Euro) and the '
                     "'bad' part or legacy entity holds all the overseas "
                     'operations as well as uninsured deposits above 100,000 '
                     'Euro, old shares and bonds. The uninsured depositors '
                     'were subject to a bail-in and became the new '
                     'shareholders of the legacy entity. As at May 2017, the '
                     'legacy entity is one of the largest shareholders of Bank '
                     'of Cyprus with 4.8% but does not hold a board seat. All '
                     'the overseas operations, of the now defunct Cyprus '
                     'Popular Bank, are also held by the legacy entity, until '
                     'they are sold by the Special Administrator, at first Ms '
                     'Andri Antoniadou, who

None

{'id': '2hop__259228_793698',
 'predicted_answer': 'Fairfield, Connecticut. Its main offices are located at '
                     '30 Rockefeller Plaza at Rockefeller Center in New York '
                     'City, known now as the Comcast Building. It was formerly '
                     'known as the GE Building for the prominent GE logo on '
                     "the roof; NBC's headquarters and main studios are also "
                     'located in the building. Through its RCA subsidiary, it '
                     'has been associated with the center since its '
                     'construction in the 1930s. GE moved its corporate '
                     'headquarters from the GE Building on Lexington Avenue to '
                     'Fairfield in 1974. [[PP]] The lander is named after the '
                     'Philae obelisk, which bears a bilingual inscription and '
                     'was used along with the Rosetta Stone to decipher '
                     'Egyptian hie

None

{'id': '2hop__481349_302087',
 'predicted_answer': 'Bombardier Inc. the former CRJ100 and CRJ200 series are '
                     'no longer in production but still in active airline '
                     'service, while the more recent CRJ700, CRJ900 and '
                     'CRJ1000 series are in production and in service. [[PP]] '
                     'Products offered through the Great Value brand are often '
                     'claimed to be as good as national brand offerings, but '
                     'are typically sold at a lower price because of lower '
                     'marketing and advertising expense. As a house or store '
                     'brand, the Great Value line does not consist of goods '
                     'produced by Walmart, but is a labeling system for items '
                     'manufactured and packaged by a number of agricultural '
                     'and food corporations, such as ConAgra, Sara Lee which, '
                     'in add

None

# Doc loading and splitting

In [47]:
lines[-2]

{'id': '2hop__604134_131944',
 'paragraphs': [{'idx': 0,
   'title': 'Commonwealth of the Philippines',
   'paragraph_text': "The Commonwealth of the Philippines (; ) was the administrative body that governed the Philippines from 1935 to 1946, aside from a period of exile in the Second World War from 1942 to 1945 when Japan occupied the country. It replaced the Insular Government, a United States territorial government, and was established by the Tydings–McDuffie Act. The Commonwealth was designed as a transitional administration in preparation for the country's full achievement of independence.",
   'is_supporting': False},
  {'idx': 1,
   'title': 'Lake Oesa',
   'paragraph_text': 'Lake Oesa is a body of water located at an elevation of 2,267m (7438 ft) in the mountains of Yoho National Park, near Field, British Columbia, Canada.',
   'is_supporting': False},
  {'idx': 2,
   'title': 'Arafura Swamp',
   'paragraph_text': 'The Arafura Swamp is a large inland freshwater wetland in Arnh

In [48]:
paragraphs = lines[-2]['paragraphs']
paragraphs




[{'idx': 0,
  'title': 'Commonwealth of the Philippines',
  'paragraph_text': "The Commonwealth of the Philippines (; ) was the administrative body that governed the Philippines from 1935 to 1946, aside from a period of exile in the Second World War from 1942 to 1945 when Japan occupied the country. It replaced the Insular Government, a United States territorial government, and was established by the Tydings–McDuffie Act. The Commonwealth was designed as a transitional administration in preparation for the country's full achievement of independence.",
  'is_supporting': False},
 {'idx': 1,
  'title': 'Lake Oesa',
  'paragraph_text': 'Lake Oesa is a body of water located at an elevation of 2,267m (7438 ft) in the mountains of Yoho National Park, near Field, British Columbia, Canada.',
  'is_supporting': False},
 {'idx': 2,
  'title': 'Arafura Swamp',
  'paragraph_text': 'The Arafura Swamp is a large inland freshwater wetland in Arnhem Land, in the Top End of the Northern Territory of Au

In [62]:
import tiktoken

def token_len(text: str, model: str = "gpt-4") -> int:
    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))

pprint(paragraphs[0]['paragraph_text'])
print('Token length: ', token_len(paragraphs[0]['paragraph_text']))


('The Commonwealth of the Philippines (; ) was the administrative body that '
 'governed the Philippines from 1935 to 1946, aside from a period of exile in '
 'the Second World War from 1942 to 1945 when Japan occupied the country. It '
 'replaced the Insular Government, a United States territorial government, and '
 'was established by the Tydings–McDuffie Act. The Commonwealth was designed '
 "as a transitional administration in preparation for the country's full "
 'achievement of independence.')
Token length:  95


In [70]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=20, chunk_overlap=5, length_function=token_len)
splits0 = text_splitter.split_text(paragraphs[0]['paragraph_text'])
splits0_tups = [('Token length: ' + str(token_len(s)), s) for s in splits0]
splits1 = text_splitter.split_text(paragraphs[1]['paragraph_text'])
splits1_tups = [('Token length: ' + str(token_len(s)), s) for s in splits1]

display(Markdown('**Paragraph 1**'))
pprint(splits0_tups)
display(Markdown('**Paragraph 2**'))
display(splits1_tups)

**Paragraph 1**

[('Token length: 20',
  'The Commonwealth of the Philippines (; ) was the administrative body that '
  'governed the Philippines from 1935 to'),
 ('Token length: 20',
  'from 1935 to 1946, aside from a period of exile in the Second World War'),
 ('Token length: 20',
  'in the Second World War from 1942 to 1945 when Japan occupied the country. '
  'It'),
 ('Token length: 20',
  'occupied the country. It replaced the Insular Government, a United States '
  'territorial government, and was established'),
 ('Token length: 20',
  'government, and was established by the Tydings–McDuffie Act. The '
  'Commonwealth was designed'),
 ('Token length: 19',
  'The Commonwealth was designed as a transitional administration in '
  "preparation for the country's full achievement of independence.")]


**Paragraph 2**

[('Token length: 18',
  'Lake Oesa is a body of water located at an elevation of 2,267m'),
 ('Token length: 19',
  '2,267m (7438 ft) in the mountains of Yoho National Park, near'),
 ('Token length: 11', 'National Park, near Field, British Columbia, Canada.')]

# Prompting

In [149]:
from dotenv import load_dotenv
load_dotenv()

True

In [10]:
from dotenv import load_dotenv
from langchain_anthropic import ChatAnthropic

load_dotenv()

chat_model = ChatAnthropic(model_name='claude-3-haiku-20240307')
joke = chat_model.invoke("Tell me a mid joke about airplanes and horses")
display(joke)
joke.pretty_print()

AIMessage(content="Here's a mildly silly joke about airplanes and horses:\n\nWhy did the horse refuse to get on the airplane? Because it already had a stable flight plan!", response_metadata={'id': 'msg_01K9jCUru7b4TiBBC6eaWRxf', 'model': 'claude-3-haiku-20240307', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 18, 'output_tokens': 39}}, id='run-2cb963b0-3180-432e-95a7-368169c5bef0-0')


Here's a mildly silly joke about airplanes and horses:

Why did the horse refuse to get on the airplane? Because it already had a stable flight plan!


In [72]:
#| code-fold: false

from pydantic import BaseModel, Field
from typing import Dict, List, Union, Tuple, Optional
import json

class Node(BaseModel):
    semantic_id: str = Field(..., description="The unique identifier of the node that is \
                             a reference to create edges between different nodes.")
    category: str = Field(..., description="The category of the node")
    attributes: Optional[Dict[str, Union[str, int, bool]]] = Field(None, description="Additional properties of the node")

class Edge(BaseModel):
    from_node: str = Field(..., description="The id of the node from which the edge originates. Only semantic_ids belong here, nothing else.")
    to_node: str = Field(..., description="The id of the node to which the edge connects. Only semantic_ids belong here, nothing else.")
    category: str = Field(..., description="The type of the relationship")
    attributes: Optional[Dict[str, Union[str, int, bool]]] = Field(None, description="Additional properties of the edge")

class Graph(BaseModel):
    nodes: List[Node] = Field(...,description="A list of nodes in the graph")
    edges: List[Edge] = Field(...,description="A list of edges in the graph")

Graph.model_json_schema()

{'$defs': {'Edge': {'properties': {'from_node': {'description': 'The id of the node from which the edge originates. Only semantic_ids belong here, nothing else.',
     'title': 'From Node',
     'type': 'string'},
    'to_node': {'description': 'The id of the node to which the edge connects. Only semantic_ids belong here, nothing else.',
     'title': 'To Node',
     'type': 'string'},
    'category': {'description': 'The type of the relationship',
     'title': 'Category',
     'type': 'string'},
    'attributes': {'anyOf': [{'additionalProperties': {'anyOf': [{'type': 'string'},
         {'type': 'integer'},
         {'type': 'boolean'}]},
       'type': 'object'},
      {'type': 'null'}],
     'default': None,
     'description': 'Additional properties of the edge',
     'title': 'Attributes'}},
   'required': ['from_node', 'to_node', 'category'],
   'title': 'Edge',
   'type': 'object'},
  'Node': {'properties': {'semantic_id': {'description': 'The unique identifier of the node tha

In [13]:
Graph.model_json_schema()

{'$defs': {'Edge': {'properties': {'from_node': {'description': 'The id of the node from which the edge originates. Only semantic_ids belong here, nothing else.',
     'title': 'From Node',
     'type': 'string'},
    'to_node': {'description': 'The id of the node to which the edge connects. Only semantic_ids belong here, nothing else.',
     'title': 'To Node',
     'type': 'string'},
    'category': {'description': 'The type of the relationship',
     'title': 'Category',
     'type': 'string'},
    'attributes': {'anyOf': [{'additionalProperties': {'anyOf': [{'type': 'string'},
         {'type': 'integer'},
         {'type': 'boolean'}]},
       'type': 'object'},
      {'type': 'null'}],
     'default': None,
     'description': 'Additional properties of the edge',
     'title': 'Attributes'}},
   'required': ['from_node', 'to_node', 'category'],
   'title': 'Edge',
   'type': 'object'},
  'Node': {'properties': {'semantic_id': {'description': 'The unique identifier of the node tha

In [75]:
json.dumps(Graph.model_json_schema())


'{"$defs": {"Edge": {"properties": {"from_node": {"description": "The id of the node from which the edge originates. Only semantic_ids belong here, nothing else.", "title": "From Node", "type": "string"}, "to_node": {"description": "The id of the node to which the edge connects. Only semantic_ids belong here, nothing else.", "title": "To Node", "type": "string"}, "category": {"description": "The type of the relationship", "title": "Category", "type": "string"}, "attributes": {"anyOf": [{"additionalProperties": {"anyOf": [{"type": "string"}, {"type": "integer"}, {"type": "boolean"}]}, "type": "object"}, {"type": "null"}], "default": null, "description": "Additional properties of the edge", "title": "Attributes"}}, "required": ["from_node", "to_node", "category"], "title": "Edge", "type": "object"}, "Node": {"properties": {"semantic_id": {"description": "The unique identifier of the node that is                              a reference to create edges between different nodes.", "title": 

In [84]:
#| code-fold: false

json_rules = \
"""We need to create a JSON object that contains a list of nodes and edges that connect the nodes.
Both, nodes and edges, have optional attributes.
Your goal is to extract as much pertinent information from the passage as possible and create nodes and edges with the extracted information.
If history is provided, it will be in the JSON schema you are given. You may create new connections between the nodes and edges in the history and the new nodes you are producing.
If you wish to change/update any of the node attributes in the provided history based on newly gathered information, simply reuse the semantic_ids of the nodes you wish to change.
If you wish to modify/update the edge attributes in the history, reuse the semantic_ids of the 'from' and 'to' nodes of any edge you wish to change.
Use the following schema and make sure to read the descriptions:
""" 

json_prompt_instructions = \
    json_rules + \
    json.dumps(Graph.model_json_schema()) + \
    "\n-----\n"

pprint(json_prompt_instructions)

('We need to create a JSON object that contains a list of nodes and edges that '
 'connect the nodes.\n'
 'Both, nodes and edges, have optional attributes.\n'
 'Your goal is to extract as much pertinent information from the passage as '
 'possible and create nodes and edges with the extracted information.\n'
 'If history is provided, it will be in the JSON schema you are given. You may '
 'create new connections between the nodes and edges in the history and the '
 'new nodes you are producing.\n'
 'If you wish to change/update any of the node attributes in the provided '
 'history based on newly gathered information, simply reuse the semantic_ids '
 'of the nodes you wish to change.\n'
 'If you wish to modify/update the edge attributes in the history, reuse the '
 "semantic_ids of the 'from' and 'to' nodes of any edge you wish to change.\n"
 'Use the following schema and make sure to read the descriptions:\n'
 '{"$defs": {"Edge": {"properties": {"from_node": {"description": "The id of

In [118]:
#| code-fold: false

graph_creator_content = \
"""You are a brilliant and efficient creator of JSON objects that capture the essence of passages and who follows instructions unbelievably well.
You will be first given instructions and a json schema, then you will be provided a passage to extract the information from.
Your instructions are:
{instructions}
History:
{history}
"""

In [86]:
#| code-fold: false

pass_passage_content = "Below is the passage to extract the values from.\n*****\nPassage:\n{passage}"

In [136]:
#| code-fold: false

from langchain_core.prompts import (
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

graph_analyst_template = SystemMessagePromptTemplate.from_template(template=graph_creator_content,
                                                                   input_variables=['history', 'instructions'])
pass_passage_template = HumanMessagePromptTemplate.from_template(pass_passage_content, input_variables=['passage'])

gen_template = graph_analyst_template + pass_passage_template

gen_template.invoke({'passage': paragraphs[0]['paragraph_text'],
                     'history': '',
                     'instructions': json_prompt_instructions})



ChatPromptValue(messages=[SystemMessage(content='You are a brilliant and efficient creator of JSON objects that capture the essence of passages and who follows instructions unbelievably well.\nYou will be first given instructions and a json schema, then you will be provided a passage to extract the information from.\nYour instructions are:\nWe need to create a JSON object that contains a list of nodes and edges that connect the nodes.\nBoth, nodes and edges, have optional attributes.\nYour goal is to extract as much pertinent information from the passage as possible and create nodes and edges with the extracted information.\nIf history is provided, it will be in the JSON schema you are given. You may create new connections between the nodes and edges in the history and the new nodes you are producing.\nIf you wish to change/update any of the node attributes in the provided history based on newly gathered information, simply reuse the semantic_ids of the nodes you wish to change.\nIf yo

In [128]:
graph_analyst_template = SystemMessagePromptTemplate.from_template(template=graph_creator_content,
                                                    input_variables=['history'],
                                                    partial_variables={'instructions': json_prompt_instructions})
graph_analyst_template.format(history='history')



KeyError: 'instructions'