### Implement an HTML Splitter

In [1]:
from typing import List, Dict
import os
import requests
from bs4 import BeautifulSoup, Tag, ResultSet
from markdownify import MarkdownConverter
import chromadb
import chromadb.utils.embedding_functions as embedding_functions
from langchain.text_splitter import RecursiveCharacterTextSplitter
from operator import itemgetter
from langchain_core.runnables import RunnableLambda
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser, JsonOutputFunctionsParser
from langchain.schema.output_parser import StrOutputParser
from langchain_openai.chat_models import ChatOpenAI
import pandas as pd
import mlflow



In [2]:
# Cleaner methods
from bs4 import ResultSet, Tag
def remove_links_from_p(tag:Tag) -> None:
    for link in tag.find_all('a'):
        link.replace_with(link.text)

def remove_citations_from_p(tag:Tag) -> None:
    for citation in tag.find_all(class_="reference"):
        citation.decompose()

def md(soup:str, **options):
    return MarkdownConverter(**options).convert_soup(soup)

def format_paragraph(element):
    remove_links_from_p(element)
    remove_citations_from_p(element)
    return md(element)

In [3]:
def load_html_from_wiki_url(url:str) -> Tag:
    res = requests.get(url, headers={})
    if res.status_code == 200:
        soup=BeautifulSoup(res.content,'html.parser')
        return soup
    else:
        raise Exception(f"Error loading page: {res.status_code}")
    
def group_paragraphs_by_section(soup:Tag) -> Dict[str, List[Tag]]:
    current_subtitle = "Overview"
    grouped_content = {}
    elements = soup.find_all(['h2', 'h3', 'p'])

    for element in elements:
        if element.name in ['h2', 'h3']:
            current_subtitle = element.get_text(strip=True)
            grouped_content[current_subtitle] = []
        elif element.name == 'p':
            grouped_content.setdefault(current_subtitle, []).append(element)

    for subtitle, paragraphs in grouped_content.items():
        grouped_content[subtitle] = [format_paragraph(p) for p in paragraphs]

    return grouped_content

def load_and_clean_from_wiki_url(url:str):
    soup = load_html_from_wiki_url(url)
    main = soup.find(id="mw-content-text")
    return group_paragraphs_by_section(main)

In [4]:
from langchain_core.documents import Document

def split_wiki_pars(paragraph_content, splitter) -> List[Document]:
    docs = []
    for subtitle, paragraphs in paragraph_content.items():
        if paragraphs is None:
            continue
        split_paragraphs = splitter.create_documents(
            texts=paragraphs,
            metadatas=[{"subtitle": subtitle, "type": "paragraph"} for _ in paragraphs]
        )
        docs.extend(split_paragraphs)
    return docs

In [5]:
def fill_collection_from_wiki_url(wiki_url, collection, splitter, tags={}):
    grouped_paragraphs = load_and_clean_from_wiki_url(wiki_url)
    docs = split_wiki_pars(grouped_paragraphs, splitter)
    collection.add(
        documents=[doc.page_content for doc in docs],
        ids=[str(i) for i in range(len(docs))],
        metadatas=[doc.metadata | tags for doc in docs]
    )
    

In [6]:
from chromadb import Collection

def get_embeddings_for_question(x):
    question = x['question']
    collection:Collection = x['collection']
    metadata_filters = x.get("metadata_filters", {})
    document_filters = x.get("document_filters", {})
    sim_docs = collection.query(query_texts=[question], n_results=3, where=metadata_filters, where_document=document_filters)['documents'][0]

    #Combining the documents
    full_context = "\n".join([doc for doc in sim_docs])

    return {"input": full_context}

def combine_docs(x):
    """combines list of docs"""
    docs = x['input']
    full_context = " ".join([doc.page_content for doc in docs])
    return {"input": full_context}

def invoke_model_with_question(x):
    model = ChatOpenAI(temperature=0.0)
    return model.invoke(x['input'].messages)


In [7]:
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer."""

prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", "{input}")
])

"""
Person_QA_Wiki_Chain:
inputs:
    question: Single question for the LLM over a single context that is to be queried in the collection
    collection: collection in which to query context for 
    metadata_filters (optional): Filter based on metadata 
    document_filters (optional): Filter based on documents (supports operator actions)
"""
person_qa_wiki_chain = (
    {
        "input": get_embeddings_for_question | prompt,
    }
    | RunnableLambda(invoke_model_with_question) 
    | StrOutputParser()
)

### Perform Evaluation

In [8]:
import functools

def model(input_df:pd.DataFrame, **kwargs):
    answer = []
    for index, row in input_df.iterrows():
        answer.append(person_qa_wiki_chain.invoke({"question": row["questions"]} | kwargs))
    return answer

def model_factory(**kwargs):
    def wrapped_model(input_df):
        return model(input_df, **kwargs)
    return wrapped_model

In [9]:
# Metrics
from mlflow.metrics.genai import relevance, faithfulness, EvaluationExample

relevance_metric = relevance(
    model="openai:/gpt-4"
)

faithfulness_metric = faithfulness(
    model="openai:/gpt-4"
)

#### Create and Reset Initial Collection

In [11]:

client = chromadb.Client()

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
            api_key=os.environ["OPENAI_API_KEY"],
            model_name="text-embedding-ada-002"
)

#client.delete_collection("wiki_data_bs4_mlflow")
wiki_collection = client.create_collection("wiki_data_bs4_mlflow", embedding_function=openai_ef)


In [12]:
# Initialize Data Inside of Collection
DICKINSON_URL = "https://en.wikipedia.org/wiki/John_Dickinson"
default_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)


fill_collection_from_wiki_url(
    wiki_url=DICKINSON_URL,
    collection=wiki_collection,
    splitter=default_splitter,
    tags={"subject": "John Dickinson"}
)

#### Narrow Collection to Specific Context

#### Validate Basic Chain

In [13]:
eval_df = pd.DataFrame(
    {
        "questions": [
            "Did John Dickinson support independence?",
            "When was the John Dickinson born?",
            "How many siblings did John Dickinson have?"
        ],
    }
)

In [14]:
model_configs = {
    "collection": wiki_collection,
    "metadata_filters": {"subject": "John Dickinson"}
}

basic_model = model_factory(**model_configs)

results = basic_model(eval_df)

In [15]:
for question, answer in zip(eval_df['questions'], results):
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")

Question: Did John Dickinson support independence?
Answer: Based on the given context, John Dickinson did not sign the Declaration of Independence.

Question: When was the John Dickinson born?
Answer: John Dickinson was a Founding Father of the United States, known as the "Penman of the Revolution" for his writings advocating for the rights of the American colonies. He served as president of Delaware from 1781 to 1783 and president of Pennsylvania from 1782 to 1785. However, he did not actually resign as president of Delaware when he was elected president of Pennsylvania, causing controversy and leading to his formal resignation in January 1783.

Question: How many siblings did John Dickinson have?
Answer: The Dickinson family had a total of 16 children.



#### Perform Evaluation

In [16]:
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("llm_wiki_rag")
os.environ["OPENAI_API_KEY"]='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'

In [20]:
model_configs = {
    "collection": wiki_collection,
    "metadata_filters": {"subject": "John Dickinson"}
}

results = mlflow.evaluate(
    model_factory(**model_configs),
    eval_df,
    #model_type="question_answering",
    #extra_metrics=[faithfulness_metric, relevance_metric, mlflow.metrics.ari_grade_level(), mlflow.metrics.flesch_kincaid_grade_level()],
    extra_metrics=[mlflow.metrics.flesch_kincaid_grade_level(), mlflow.metrics.ari_grade_level()],
    evaluator_config={
        "col_mapping": {
            "inputs": "questions",
            "context": "source_documents",
        }
    },
)

print(results.metrics)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
2024/02/03 20:48:00 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


2024/02/03 20:48:00 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.
2024/02/03 20:48:08 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/02/03 20:48:08 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: flesch_kincaid_grade_level
2024/02/03 20:48:08 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: ari_grade_level


{'flesch_kincaid_grade_level/v1/mean': 9.966666666666667, 'flesch_kincaid_grade_level/v1/variance': 2.668888888888889, 'flesch_kincaid_grade_level/v1/p90': 11.58, 'ari_grade_level/v1/mean': 9.266666666666667, 'ari_grade_level/v1/variance': 14.675555555555556, 'ari_grade_level/v1/p90': 12.56}


In [18]:
results.tables["eval_results_table"]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,questions,outputs,flesch_kincaid_grade_level/v1/score
0,Did John Dickinson support independence?,"Based on the given context, John Dickinson did...",9.9
1,When was the John Dickinson born?,John Dickinson was a Founding Father of the Un...,12.0
2,How many siblings did John Dickinson have?,The Dickinson family had a total of 16 children.,8.0


In [21]:
experiment_description = (
    "This is a test experiment to see how well the LLM can answer questions about John Dickinson."
)

experiment_tags = {
    "subject": "John Dickinson",
    "source": "Wikipedia",
    "type": "RAG",
    "mlflow.note.content": experiment_description
}

llm_rag_experiment = mlflow.create_experiment(
    name="llm_wiki_rag_test_dickinson",
    tags=experiment_tags
)

mlflow.set_experiment("llm_wiki_rag_test_dickinson")

<Experiment: artifact_location='mlflow-artifacts:/480502914628502779', creation_time=1707015245531, experiment_id='480502914628502779', last_update_time=1707015245531, lifecycle_stage='active', name='llm_wiki_rag_test_dickinson', tags={'mlflow.note.content': 'This is a test experiment to see how well the LLM can '
                        'answer questions about John Dickinson.',
 'source': 'Wikipedia',
 'subject': 'John Dickinson',
 'type': 'RAG'}>

In [37]:
import openai

model_configs = {
    "collection": wiki_collection,
    "metadata_filters": {"subject": "John Dickinson"}
}

with mlflow.start_run(run_name="test_1") as run:
    # logged_model_info = mlflow.openai.log_model(
    #     model="text-embedding-ada-002",
    #     task=openai.ChatCompletion,
    #     artifact_path="model",
    #     messages=[
    #         {"role": "system", "content": prompt},
    #         {"role": "user", "content": "{question}"},
    #     ],
    # )
    mlflow.set_tags({
        "prompt_template": template
    })
    results = mlflow.evaluate(
        model_factory(**model_configs),
        eval_df,
        #model_type="question_answering",
        #extra_metrics=[faithfulness_metric, relevance_metric, mlflow.metrics.ari_grade_level(), mlflow.metrics.flesch_kincaid_grade_level()],
        extra_metrics=[mlflow.metrics.flesch_kincaid_grade_level(), mlflow.metrics.ari_grade_level()],
        evaluator_config={
            "col_mapping": {
                "inputs": "questions",
                "context": "source_documents",
            }
        },


    )

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  data = data.applymap(_hash_array_like_element_as_bytes)
2024/02/03 21:20:25 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
2024/02/03 21:20:25 INFO mlflow.models.evaluation.default_evaluator: Computing model predictions.


2024/02/03 21:20:33 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
2024/02/03 21:20:33 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: flesch_kincaid_grade_level
2024/02/03 21:20:33 INFO mlflow.models.evaluation.default_evaluator: Evaluating metrics: ari_grade_level


In [34]:
eval_table = results.tables["eval_results_table"]
eval_table

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,questions,outputs,flesch_kincaid_grade_level/v1/score,ari_grade_level/v1/score
0,Did John Dickinson support independence?,"Based on the given context, John Dickinson did...",9.9,10.8
1,When was the John Dickinson born?,John Dickinson served as the president of whic...,4.8,7.1
2,How many siblings did John Dickinson have?,The Dickinson family had a total of 16 children.,8.0,4.0
