# What

As discussed in #146 it seems that a searching problem is what this will turn into.
After the creation of #163 this branch and notebook is the base for further work.

Using a vector database and RAG could be the best way for people to search through all of the reports.

Currently **this** notebook has a few things:
- Getting and preparing the datasets which will be used by other notebooks
- A simple test of vector database and RAG (this is now deprecated)

There are other notebooks in this folder which are:
- [#165_basic_safety_issue_rag.ipynb](#165_basic_safety_issue_rag.ipynb) which is to answer #165 and does what the title suggests

## Modules

In [None]:
# from me
from engine.OpenAICaller import openAICaller
from engine.Extract_Analyze import ReportExtracting
import engine.Modes as Modes

# third party

from importlib import reload

import lancedb

import yaml
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()


import torch
from sentence_transformers import SentenceTransformer
import voyageai
from tenacity import retry, stop_after_attempt, wait_random_exponential
from transformers import AutoTokenizer

from dotenv import load_dotenv


# built in
import os

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
import multiprocessing
import random
import regex as re

load_dotenv('../../.env')


# Getting data

The initial idea would be to have all of the important text inside the vector database. This means that the searching could happen with most of the report.

In [None]:
# Root folder that holds one sub-directory per report.
output_folder = '../../output'

# The report ids are the names of the sub-directories; plain files in the folder are ignored.
reports = [
    entry
    for entry in os.listdir(output_folder)
    if os.path.isdir(os.path.join(output_folder, entry))
]

## Text

Since I am going to want to look at all aspects of the report this means that I need to get all of the reports

In [None]:
# Collect the full text of every report that has a `<report_id>.txt` file in its folder.
report_text = []

for report in reports:

    report_text_path = os.path.join(output_folder, report, f'{report}.txt')

    # Reports without a text file are skipped
    if not os.path.exists(report_text_path):
        continue

    with open(report_text_path, 'r') as f:
        text = f.read()

    if text is None:
        continue

    report_text.append({
        'report_id': report,
        'text': text
    })

report_text_df = pd.DataFrame(report_text)

report_text_df

## Important text

In [None]:
# load all important text

with open('../../config.yaml', 'r') as f:
    config = yaml.safe_load(f)

important_text_extractor = ReportExtracting.ReportExtractingProcessor(output_folder, config['engine']['output']['reports'])

important_texts = []

for report in reports:

    text = important_text_extractor.get_important_text(report, False)

    # Reports for which the extractor returns nothing are left out of the dataframe
    if text is None:
        continue

    important_texts.append({
        'report_id': report,
        'important_text': text
    })

important_texts_df = pd.DataFrame(important_texts)

important_texts_df

### Token count

In [None]:
# Token count of every important text; `openAICaller.get_tokens` is called once per report with a single element list.
important_texts_df['gpt-3.5_tokens'] = important_texts_df['important_text'].apply(lambda x: openAICaller.get_tokens([x])[0])

In [None]:
# Distribution of the token counts
important_texts_df.hist(column='gpt-3.5_tokens')

# Check what token cutoff is reasonable

# NOTE(review): presumably a 16k token limit divided by a 1.2 safety margin - confirm
max_tokens = 16_000/1.2

# Share of reports strictly under the cutoff, plus the total number of tokens across all reports
print(f'{important_texts_df[important_texts_df["gpt-3.5_tokens"] < max_tokens].shape[0] / important_texts_df.shape[0] * 100:.2f}% are under {max_tokens:.2f} tokens. With a total sum of {important_texts_df["gpt-3.5_tokens"].sum()} tokens.')

# The reports that are over the cutoff
important_texts_df[important_texts_df['gpt-3.5_tokens'] > max_tokens]



## Safety issues

In [None]:
# Only the report id and the issue text are needed; the issue column is shortened to `si`.
safety_issues_df = pd.read_csv('data/safety_issues.csv')
safety_issues_df = safety_issues_df[['report_id', 'safety_issue']]
safety_issues_df = safety_issues_df.rename(columns={'safety_issue': 'si'})

# Remove the surrounding whitespace of each safety issue
safety_issues_df['si'] = safety_issues_df['si'].apply(lambda issue: issue.strip())

safety_issues_df

# Preparing data

In [None]:
all_reports_df = pd.read_pickle('data/reports_extracted_sections.pkl')
all_reports_df

In [None]:
all_reports_df.query('report_id == "2020_103"').loc[111, 'extracted_sections']

## Splitting document

I think something quite useful about the vector databases will be not just finding the reports but also the specific report sections. For this I want to get a dataset that has the report represented as a structured document with sections and such. Then I could embed each of these and have much more fine-grained searching.

I am going to ask TAIC directly to see if they have a dataset of word documents. I have heard back and it sounds like they won't have that dataset available. I will have to work with the report extractor and see how well that goes. I think it shouldn't be too complex.


In [None]:
# Re-import the module so that edits to it are picked up without restarting the kernel
reload(ReportExtracting)

# For every report build a `ReportExtractor` from its text and id and store what `extract_contents_section()` returns
report_text_df['content_section'] = report_text_df.apply(lambda row: ReportExtracting.ReportExtractor(row['text'], row['report_id']).extract_contents_section(), axis=1)

report_text_df

In [None]:
def get_potential_section():
    """
    Build every candidate paragraph number of the form "<section>.<subsection>.<paragraph>".

    Sections run from 1 to 14, subsections and paragraphs both run from 1 to 99.

    Returns a nested list indexed as [section][subsection][paragraph], for example the
    first element `result[0][0][0]` is '1.1.1'.
    """
    candidates = []

    for section_number in range(1, 15):
        section_candidates = []

        for subsection_number in range(1, 100):
            subsection = f'{section_number}.{subsection_number}'
            section_candidates.append(
                [f'{subsection}.{paragraph_number}' for paragraph_number in range(1, 100)]
            )

        candidates.append(section_candidates)

    return candidates

In [None]:
# Re-import the module so that edits to it are picked up without restarting the kernel
reload(ReportExtracting)

# Computed once at module level; `extract_sections` below iterates over it
all_potential_sections = get_potential_section()

def extract_sections(num_sections, report_text, debug = False):
    """
    Extract the numbered parts of a report by probing every candidate section number.

    The candidates come from the module level `all_potential_sections`. For each subsection the
    paragraphs (e.g. 4.2.1, 4.2.2, ...) are tried first:
      - if the first paragraph is missing the subsection is treated as having no paragraphs
      - otherwise one missing paragraph is skipped over and the second one ends the search
    When no paragraph was found the subsection itself (e.g. 4.2) is tried instead. The first missing
    subsection of a section is skipped over and the second one ends the search of that section.

    Args:
        num_sections: Passed on unchanged as the second argument of `ReportSectionExtractor`.
            NOTE(review): the manual inspection cell of this notebook passes a report id in that
            position - confirm which one the extractor expects.
        report_text: The full text of the report.
        debug: Print which candidate is being looked at.
    Returns a dataframe with one row per extracted paragraph/subsection and the columns `section`
    and `section_text`. It is empty when nothing could be extracted.
    """
    extractor = ReportExtracting.ReportSectionExtractor(report_text, num_sections)

    sections = []

    for section in all_potential_sections:
        if debug: print(f"Looking at section {section[0][0].split('.')[0]}")

        subsection_missing_count = 0
        for sub_section in section:
            # '4.2.1' -> '4.2'
            sub_section_str = sub_section[0].rsplit('.', 1)[0]
            if debug: print(f" Looking at subsection {sub_section_str}")

            paragraph_missing_count = 0

            paragraphs = []
            for paragraph in sub_section:
                if debug: print(f"  Looking for paragraph {paragraph}")

                paragraph_text = extractor.extract_section(paragraph, useLLM = False)

                if paragraph_text is None:
                    # Compare the whole last component so that x.y.11, x.y.21, ... are not
                    # mistaken for the first paragraph.
                    is_first_paragraph = paragraph.rsplit('.', 1)[-1] == '1'

                    if is_first_paragraph or paragraph_missing_count > 0:
                        break

                    paragraph_missing_count += 1
                    continue

                paragraphs.append({'section': paragraph, 'section_text': paragraph_text})

            if len(paragraphs) == 0:
                if debug: print(f" No paragraphs found ")
                sub_section_text = extractor.extract_section(sub_section_str, useLLM = False)

                if sub_section_text is None and subsection_missing_count > 0:
                    if debug: print(f" No subsection found")
                    break
                elif sub_section_text is None:
                    subsection_missing_count += 1
                    continue

                sections.append({'section': sub_section_str, 'section_text': sub_section_text})
            else:
                sections.extend(paragraphs)

    df = pd.DataFrame(sections)

    if debug and df.empty: print(f"No sections extracted")

    return df

# Parallel apply function
def parallel_apply(df, func, num_sections, debug=False):
    """
    Call `func(num_sections, text, debug)` for the `text` of every row of `df` in a process pool.

    One worker process is used per cpu. The results are returned as a list in the same order
    as the rows of `df`, with a progress bar shown while they are collected.
    """
    worker_count = multiprocessing.cpu_count()

    with ProcessPoolExecutor(max_workers=worker_count) as pool:
        pending = []
        for _, row in df.iterrows():
            pending.append(pool.submit(func, num_sections, row['text'], debug))

        # Wait on the futures in submission order so the results line up with the rows
        collected = [job.result() for job in tqdm(pending, total=len(pending))]

    return collected

# Fixed 30 report sample (seeded); it is not used by the rest of this cell
test_df = report_text_df.sample(30, random_state=42)

# Extract the sections of every report; each element of the new column is a dataframe of sections
report_text_df['extracted_sections'] = parallel_apply(report_text_df, extract_sections, 15, False)

report_text_df

Here is the calculation of how many missing sections there are.

For example if it has found 4.3 and 4.5 but not 4.4 then that counts as a missing section

In [None]:
def find_missing_sections(df):
    """
    Find the subsections that look like they were skipped by the section extraction.

    A subsection `a.b` is reported when it lies in a gap between two consecutive rows of `df`
    and both of its direct neighbours are themselves values of the `section` column. Subsections
    are assumed to run from .1 to .9, so the neighbours of `a.1` and `a.9` are `(a-1).9` and
    `(a+1).1`.

    Args:
        df: Dataframe of extracted sections with a `section` column of dotted numbers such as
            "4.3" or "4.3.1". Only the first two numbers of each value are used to find gaps.
    Returns a list of the missing subsections as strings, e.g. ["4.4"]. Empty list for an empty dataframe.
    """

    if df.empty: return []

    def section_to_str(section_tuple):
        return '.'.join(map(str, section_tuple))

    # Parse sections into a list of tuples of integers
    sections = [tuple(map(int, name.split('.'))) for name in df['section']]

    # Built once so that each neighbour lookup below does not have to scan the whole column
    extracted = set(df['section'])

    # Every (main section, subsection) that falls in a gap between two consecutive rows
    candidates = []
    for current, following in zip(sections, sections[1:]):
        if current[0] == following[0]:
            # Gap inside one main section
            candidates.extend((current[0], sub) for sub in range(current[1] + 1, following[1]))
            continue

        # Rest of the current main section, assuming subsections go up to .9
        candidates.extend((current[0], sub) for sub in range(current[1] + 1, 10))

        # All subsections of the main sections that were jumped over
        for main_section in range(current[0] + 1, following[0]):
            candidates.extend((main_section, sub) for sub in range(1, 10))

        # Start of the next main section up to the found subsection
        candidates.extend((following[0], sub) for sub in range(1, following[1]))

    # Keep only the candidates that have both the previous and the next subsection extracted
    missing_sections = []
    for main_section, sub in candidates:
        previous = (main_section, sub - 1) if sub > 1 else (main_section - 1, 9)
        upcoming = (main_section, sub + 1) if sub < 9 else (main_section + 1, 1)

        if section_to_str(previous) in extracted and section_to_str(upcoming) in extracted:
            missing_sections.append(section_to_str((main_section, sub)))

    return missing_sections

# Sections of the first report, kept around for manual inspection; not used by the rest of this cell
dev_df = report_text_df.loc[0, 'extracted_sections']

report_text_df['missing_sections'] = report_text_df['extracted_sections'].apply(lambda df: find_missing_sections(df))

# Fraction (0 to 1) of missing sections out of extracted + missing; 0 when nothing was extracted
report_text_df['percent_missing'] = report_text_df.apply(lambda row: len(row['missing_sections']) / (len(row['extracted_sections']) + len(row['missing_sections'])) if not row['extracted_sections'].empty else 0, axis = 1)

print(f"There are {(report_text_df['percent_missing'] > 0).sum() } number of reports with missing sections")
# Zeros are replaced with NaN so that the mean only covers the reports that have missing sections
print(f"On average {report_text_df['percent_missing'].replace(0, np.nan).mean()*100:.2f}% of sections are missing when a report is missing sections")
print(f"Total number of missing sections {sum(report_text_df['missing_sections'].apply(len))} which is {sum(report_text_df['missing_sections'].apply(len)) / sum(report_text_df['extracted_sections'].apply(len)) *100:.2f}% of total sections")


In [None]:
# This cell is here for manually inspecting the process of section extraction.


reload(ReportExtracting)

report_id = '2014_004'

# Read the report from the shared output folder (same layout as used when loading all of the
# report texts) rather than from a path that only exists on one machine.
with open(os.path.join(output_folder, report_id, f'{report_id}.txt'), 'r') as f:
    text = f.read()

sectionExtractor = ReportExtracting.ReportSectionExtractor(text, report_id)

section_str = '4.2.5'

# Element 1 of the returned regexs is what the search bounds are computed from
section_regexs = sectionExtractor._get_section_start_end_regexs(section_str)[1]

print(section_regexs[2])

# Show the slice of the report text that the section is searched in
search_bounds = sectionExtractor._get_section_search_bounds(section_str, section_regexs)
display(text[search_bounds[0]:search_bounds[1]])

sectionExtractor.extract_section(section_str, useLLM= False)


This extraction is entirely implemented in the `ReportExtracting` module. I will list some of the problems that have been identified:

| problem | examples |
| --- | ---- | 
| Missing the ending of the section and getting multiple sections | 2013_107 and section 3.3 has all of the 3.4 and it is missing 3.4. |
| References to paragraphs in previous paragraphs | 2014_005 4.6.11. |
| Final paragraph with following appendices | 2011_203 with 8.2. It doesn't know when the end of the section is. |
| Missing page in PDF conversion and thus missing many subsequent section, subsections etc. | 2019_204 3.18, 2019_204 2.10, 2018_001 3.16 and 2.5, 2018_005 3.37 and 2.8 |
| End of the reports, last section | 2018_001  8.2, 2010_201 8.4, 2011_203 8.2|


## Merge important text and sections together

In [None]:
# Only reports that have both a text and an important text are kept
all_reports_df = report_text_df.merge(important_texts_df, on='report_id', how='inner')

# Reports without any extracted section are removed. They are found with a mask first so that
# the dataframe is not modified while it is being iterated over.
has_no_sections = all_reports_df['extracted_sections'].apply(lambda sections: sections.empty).astype(bool)

for dropped_report_id in all_reports_df.loc[has_no_sections, 'report_id']:
    print(f"Deleting report {dropped_report_id}")

all_reports_df = all_reports_df[~has_no_sections].reset_index(drop=True)

all_reports_df.to_pickle('data/reports_extracted_sections.pkl')

all_reports_df

In [None]:
# Stack the per report section dataframes into one long dataframe, adding the report id to every row
all_sections_long_df = pd.concat(all_reports_df.apply(lambda x: x['extracted_sections'].assign(report_id=x['report_id']), axis=1).tolist(), ignore_index=True)

all_sections_long_df

In [None]:
all_sections_long_df.query('report_id == "2020_103"')

# Embedding



In [None]:
def embed_documents(df, embedding_function, document_column_name, embedding_column_name, batch_size=100, max_workers=10, debug = False):
    """
    Given a dataframe with at least the document column and the embedding column it will generate
    embeddings for all of the documents that don't have an embedding yet (embedding is NaN/None).
    It does this by calling the embedding_function on batches of the documents.
    There is multithreading to speed up the process.

    Note that `df` is updated in place as well as being returned.

    Args:
        df: The dataframe
        embedding_function: The function that will be called to generate embeddings for documents. It must be `f([document]) -> [embedding]` and return one embedding per document, in the same order.
        document_column_name: The name of the column that contains the documents
        embedding_column_name: The name of the column that will contain the embeddings
        batch_size: How many documents are given to each call of embedding_function
        max_workers: How many threads are calling embedding_function at once
        debug: Print/display extra information about the batches and the resulting embeddings
    Returns the dataframe with the missing embeddings filled in.
    Raises:
        ValueError: If batch_size is less than 1, or if embedding_function returns a different
            number of embeddings than the number of documents it was given.
    """

    # Only the rows without an embedding need to be computed

    missing_embeddings = df[df[embedding_column_name].isna()]

    if len(missing_embeddings) == 0:
        if debug: print(f"No missing embeddings")
        return df

    # A negative batch size would otherwise produce no batches and silently embed nothing
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1 but was {batch_size}")

    section_texts = missing_embeddings[document_column_name].tolist()

    print(f"There are {len(section_texts)} missing embeddings with {len(df)} number of documents")

    # Split section_texts into batches of size `batch_size`
    batches = [section_texts[i:i+batch_size] for i in range(0, len(section_texts), batch_size)]

    if debug: 
        for batch in batches: print(len(batch))

    embeddings = [None] * len(missing_embeddings)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(embedding_function, batch): i for i, batch in enumerate(batches)}

        # Batches finish in any order, so each one is placed by its batch index
        for future in tqdm(as_completed(futures), total=len(futures)):
            batch_embeddings = future.result()
            batch_index = futures[future]
            if debug:
                print(f'Looking at batch {batch_index}, with return length {len(batch_embeddings)}')

            # A batch of the wrong length would shift every later embedding onto the wrong document
            expected_length = len(batches[batch_index])
            if len(batch_embeddings) != expected_length:
                raise ValueError(
                    f"embedding_function returned {len(batch_embeddings)} embeddings for batch {batch_index} which has {expected_length} documents"
                )

            # Place embeddings in the correct positions
            start_index = batch_index * batch_size
            end_index = start_index + len(batch_embeddings)
            if debug: print(f"Setting embeddings {start_index} to {end_index}")
            embeddings[start_index:end_index] = batch_embeddings

    # Re-attach the original row labels so the assignment below lines up with `df`
    embeddings = pd.Series(embeddings, index=missing_embeddings.index)

    if debug:
        display(df.loc[missing_embeddings.index, embedding_column_name])
        print(f'Finished embedding the documents there are {len(embeddings)} embeddings, which look like:')
        display(embeddings)
        print(f'Each embedding is in {len(embeddings[missing_embeddings.index[0]])} dimensions')

    # Update the dataframe with the computed embeddings
    df.loc[missing_embeddings.index, embedding_column_name] = embeddings

    return df

## Using sentence transformers

I have started looking at this but using sentence transformers won't work locally with models of a large enough input size to handle these large documents.

There are two options.

Firstly using Hugging Face's dedicated inference API which has been done for the safety theme generation.  
Secondly splitting the document into smaller parts so that only specific sections are extracted.

In [None]:
def embed_sections_sentence_transformers(df, model):
    """
    Embed the `section_text` of every row with a sentence transformers model.

    Args:
        df: Dataframe of sections with a `section_text` column.
        model: The model, its `encode` method is called with all of the section texts at once.
    Returns a copy of `df` with the extra column `extracted_section_embeddings` that has one
    embedding per row. `df` itself is not changed.
    """
    torch.cuda.empty_cache()

    # Hand over a plain list: `encode` looks sentences up by position, which on a pandas Series
    # is only correct when the index happens to be 0..n-1.
    section_texts = df['section_text'].tolist()

    embeddings = model.encode(section_texts, show_progress_bar=True)

    # One entry per row instead of a single 2D array
    list_of_embeddings = list(embeddings)

    return df.assign(extracted_section_embeddings = list_of_embeddings)

### Alibaba-NLP/gte-large-en-v1.5

This has been chosen as it has the highest retrieval score on MTEB. However as they use custom code it was too difficult to get set up so it will be put on hold.

In [None]:
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True)

In [None]:
all_reports_df['extracted_sections'] = all_reports_df['extracted_sections'].progress_apply(embed_sections_sentence_transformers, args = (model,))


## Using voyageai

Voyage AI seems to have the largest embedding model that can take the input up to 16,000 tokens. This is much better than offered by openai embeddings models. With this extra size it allows me to vector search the whole document and not have to split it up into chunks. Alternatively I could start experimenting with Anthropic and their model offerings.

However I have an idea of splitting the documents up into chunks and using it to provide more accurate and useful information to the LLM prompt.

In [None]:
vo = voyageai.Client(max_retries = 10)

# Random exponential wait between attempts. The number of attempts is capped so that an error
# that can never succeed (e.g. an invalid request) is raised instead of being retried forever.
@retry(
    wait=wait_random_exponential(multiplier=1, min = 1, max=60),
    stop=stop_after_attempt(6),
    reraise=True,
)
def embed_with_backoff(**kwargs):
    """
    Call `vo.embed` with the given keyword arguments, retrying when it raises.

    After the last failed attempt the original exception is re-raised.
    """
    print('trying')
    return vo.embed(**kwargs)


def embed_batch(batch):
    """
    Embed one batch of texts as documents with the "voyage-large-2-instruct" model.

    Truncation is turned off in the request. Returns the list of embeddings of the response.
    """
    response = vo.embed(
        texts = batch,
        model="voyage-large-2-instruct",
        input_type="document",
        truncation=False,
    )
    return response.embeddings

# Tokenizer used further down to count the tokens of the section texts and the important texts
tokenizer = AutoTokenizer.from_pretrained('voyageai/voyage')

### Embeddings on long form dataframe of individual sections

In [None]:
section_embeddings_file_name = os.path.join('embeddings', 'voyageai_section_embeddings.pkl')

In [None]:
all_sections_long_df['num_tokens'] = all_sections_long_df['section_text'].apply(lambda x: len(tokenizer.tokenize(x)))

In [None]:
all_sections_long_df = all_sections_long_df.query('num_tokens < 15000')

In [None]:
voyageai_loaded = pd.read_pickle(section_embeddings_file_name)
voyageai_loaded

In [None]:
merged_section_embeddings = pd.merge(voyageai_loaded, all_sections_long_df, on=['report_id', 'section', 'section_text', 'num_tokens'], how='right')
merged_section_embeddings

In [None]:
voyageai_embeddings = embed_documents(merged_section_embeddings, embed_batch, 'section_text', 'section_text_embedding', batch_size=50, debug=False)

voyageai_embeddings

In [None]:
voyageai_embeddings['year'] = voyageai_embeddings['report_id'].apply(lambda x: int(x[0:4]))
voyageai_embeddings['mode'] = voyageai_embeddings['report_id'].apply(lambda x: Modes.get_report_mode_from_id(x).value)

In [None]:
voyageai_embeddings.to_pickle(section_embeddings_file_name)
voyageai_embeddings

### Embedding of important texts

In [None]:
important_text_embedding_file_name = os.path.join('embeddings', 'voyageai_important_text_embeddings.pkl')

In [None]:
# Load the embeddings that were saved by a previous run. When the file does not exist yet start
# from an empty dataframe with the expected columns; any other error is left to surface.
try:
    important_text_voyageai_embedding_loaded = pd.read_pickle(important_text_embedding_file_name)
except FileNotFoundError:
    important_text_voyageai_embedding_loaded = pd.DataFrame(columns=['report_id', 'important_text', 'important_text_embedding'])

important_text_voyageai_embedding_loaded

In [None]:
merged_important_text_embeddings = important_text_voyageai_embedding_loaded.merge(all_reports_df[['report_id', 'important_text']], on=['report_id','important_text'], how='right')
merged_important_text_embeddings

In [None]:
# Number of tokens of each important text according to the voyage tokenizer
merged_important_text_embeddings['important_text_token_length'] = merged_important_text_embeddings['important_text'].apply(lambda text: len(tokenizer.tokenize(text)))
print("These are the rpeorts that will be truncated. I will wait for larger embedding models to aleviate this problem")
      
# Reports that are longer than 16,000 tokens
display(merged_important_text_embeddings[merged_important_text_embeddings['important_text_token_length'] > 16000])

print(f"There are a total of {merged_important_text_embeddings['important_text_token_length'].sum()} token in these reports")

In [None]:
def embed_important_text(important_text_batch):
    """
    Embed one batch of important texts as documents with the "voyage-large-2" model.

    Truncation is turned on in the request. Returns the list of embeddings of the response.
    """
    response = vo.embed(
        important_text_batch,
        model="voyage-large-2",
        input_type="document",
        truncation=True,
    )
    return response.embeddings

max_tokens_per_batch = 120_000

average_token_length = merged_important_text_embeddings['important_text_token_length'].mean()

# Size the batches from the average length with a 1.5x margin. The size is never allowed below 1
# (very long texts), and an empty dataframe (mean is NaN) falls back to 1 as well.
if pd.isna(average_token_length) or average_token_length <= 0:
    batch_size = 1
else:
    batch_size = max(1, int(max_tokens_per_batch // (average_token_length*1.5)))

print(f"Batch size will be {batch_size}")

voyageai_embeddings_important_text = embed_documents(merged_important_text_embeddings, embed_important_text, 'important_text', 'important_text_embedding', batch_size, debug=True)
voyageai_embeddings_important_text

In [None]:
voyageai_embeddings_important_text.to_pickle(important_text_embedding_file_name)
voyageai_embeddings_important_text

### Embedding safety issues

In [None]:
embeddings_file_path = 'embeddings/voyageai_si_embeddings.pkl'

# Reuse the embeddings saved by a previous run, otherwise start from an empty dataframe
if os.path.exists(embeddings_file_path):
    embeddings = pd.read_pickle(embeddings_file_path)
else:
    embeddings = pd.DataFrame(columns=['report_id', 'si', 'si_embedding'])

display(embeddings)


# Outer merge: rows that are only in the saved file are kept alongside the new safety issues
embeddings = embeddings.merge(safety_issues_df, on=['report_id', 'si'], how ='outer')
# NOTE(review): presumably for saved files that still name the embedding column `vector` - confirm
embeddings.rename(columns={'vector': 'si_embedding'}, inplace=True)

embeddings


In [None]:
# Embed the safety issues that don't have an embedding yet
embeddings = embed_documents(embeddings, embed_batch, 'si', 'si_embedding', debug = True)

# Add the mode of each report and save everything back to the same file that was loaded
embeddings['mode'] = embeddings.report_id.apply(lambda x: Modes.get_report_mode_from_id(x).value)
embeddings.to_pickle(embeddings_file_path)
embeddings

# Vector database

I will now test out importing the embeddings into a vector database.



## Getting dataset ready

There are three things that we need for this data set:

- ids
- payload
- embeddings

The ids will be converted from the report_ids

payload will include all the data we are interested in:

Embeddings have been calculated above using voyageAI.

### Getting embeddings and ids

In [None]:
# NOTE(review): this reads 'embeddings.pkl' from the working directory, which is not one of the
# files under 'embeddings/' that are written by the cells above - confirm it is the intended file
embeddings = pd.read_pickle('embeddings.pkl')

# Integer id made from the report id with the underscores removed
embeddings['id'] = embeddings['report_id'].apply(lambda x: int(x.replace('_','')))

embeddings

### Getting payload data

I will do this by using the search function from the viewer to turn the whole output folder into a dataframe.

Then I can join it in with the embeddings.

In [None]:
# I am just going to load it in from using the webapp and export button.

search_results = pd.read_csv('search_results.csv')

# Columns of the export that are not wanted in the payload
unwanted_columns = ['NoMatches', 'linksVisual', 'ErrorMessage']
payload_columns = search_results.columns.drop(unwanted_columns)

# Same key name as the other dataframes so that they can be joined
payload_df = search_results[payload_columns].rename(columns={'ReportID': 'report_id'})

# Add the important text of each report
important_texts_by_report = important_texts_df.set_index('report_id')
payload_df = payload_df.join(important_texts_by_report, on='report_id')

payload_df

### Merging payload with embeddings

In [None]:
# Left merge: every embedding is kept, with or without a matching payload row
merged_df = pd.merge(embeddings, payload_df, on='report_id', how='left')

# `vector` is the column name used for the embedding from here on
merged_df.rename(columns={'embedding': 'vector'}, inplace=True)

# Mode is stored via `Modes.Mode.as_char`, the year is the first four characters of the report id
merged_df['mode'] = merged_df['report_id'].apply(lambda x: Modes.Mode.as_char(Modes.get_report_mode_from_id(x)))
merged_df['year'] = merged_df['report_id'].apply(lambda x: int(x[0:4]))

# The csv is written before the rows without text are dropped, so it still contains them
merged_df.to_csv('merged_df.csv')

merged_df.dropna(subset=['text'], inplace=True)

merged_df

## Creating vector database

In [None]:
# Setting up lanceDB

!poetry add lancedb

In [None]:
# Create db

uri = 'vector_db'

db = lancedb.connect(uri)

In [None]:
# Create new table
tbl_name = 'taic-reports'

# Drop the table left over from a previous run so that it can be created again from scratch.
# `table_names` is called without arguments as its first parameter is a pagination token, not a
# name filter.
if tbl_name in db.table_names():
    db.drop_table(tbl_name)

tbl = db.create_table(
    tbl_name,
    data = merged_df
) 


In [None]:
vo = voyageai.Client()

def search_vectorDB(text, tbl, results = 10):
    """
    Search a table of the vector database for the rows closest to a piece of text.

    Args:
        text: The search text.
        tbl: The table to search, it must have been built from "voyage-large-2" embeddings.
        results: The maximum number of rows to return.
    Returns a dataframe of the closest rows.
    """
    # The text is a search query, so it is embedded as a query rather than as a document.
    # It is sent as a single element list and the one embedding that comes back is used.
    search_text_embedded = vo.embed([text], model="voyage-large-2", input_type="query").embeddings[0]

    return tbl.search(search_text_embedded) \
    .limit(results) \
    .to_pandas()












**Example below**

In [None]:
search_vectorDB('fire onboard a boat', tbl, results = 10)

# Rag

To set up a RAG I need the returned documents to be smaller than the context limit of 128k.

Given each report is about 10k tokens this only allows for about 10 reports in the context. It might be better to have smaller chunks that then link to the original one.

In [None]:
!poetry add --group dev langchain langchain-community langchainhub langchain-openai langchain_voyageai

In [None]:
# NOTE(review): the cells below use `hub`, `StrOutputParser`, `RunnablePassthrough`, `LanceDB`,
# `DataFrameLoader`, `ChatOpenAI`, `OpenAIEmbeddings` and `VoyageAIEmbeddings`. While these imports
# are commented out none of those names are brought in by the imports at the top of the notebook.
# from langchain import hub
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough
# from langchain.vectorstores import LanceDB
# from langchain_community.document_loaders import DataFrameLoader
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# from langchain_voyageai import VoyageAIEmbeddings

In [None]:

llm = ChatOpenAI(model="gpt-4-turbo")

In [None]:
openai_embedder = OpenAIEmbeddings(model="text-embedding-3-large")

voyageai_embedder = VoyageAIEmbeddings(model="voyage-large-2", truncation=True)

In [None]:
# Take an explicit copy so that overwriting `text` below is done on an independent dataframe and
# not on a selection of `merged_df`.
df_for_vector_store = merged_df[['report_id', 'CompleteThemeSummary', 'text']].copy()

# Put the report id in front of the text so that it is part of what gets embedded
df_for_vector_store['text'] = df_for_vector_store.apply(
    lambda x: 
    f"Report {x['report_id']} \n\n{x['text']}", axis=1)

In [None]:
loader = DataFrameLoader(df_for_vector_store)
docs = loader.load()

docs


In [None]:
vectorstore = LanceDB.from_documents(
    documents = docs,
    embedding = voyageai_embedder
)

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [None]:
retriever.invoke("Fire onboard marine vessel")

I need to know that it can at least retrieve relevant reports. I think that this will involve and require more work so that specific reports can be queried. Then I can work on the RAG and see how that goes.

In [None]:
# NOTE(review): `RunnableParallel` is used further down in this cell while its import is commented out
# from langchain_core.runnables import RunnableParallel

# Fetch the "rlm/rag-prompt" prompt through `hub`
prompt = hub.pull("rlm/rag-prompt")

def format_docs(docs):
    """Join the `page_content` of every document into one string, separated by blank lines."""
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)


# The retrieved documents are joined by `format_docs` into the context and the question is passed
# through unchanged; the result goes through the prompt, the llm and the string output parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Same inputs, but the retrieved documents are kept under "context" and the output of `rag_chain`
# is added under "answer".
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain)


In [None]:
answer = rag_chain_with_source.invoke("What are some examples of safety issues that came from fire onboard a marine vessel?")