In [None]:
import getpass
import os
import pprint
from typing import Any, Dict, TypedDict

import tensorflow_hub as tensor_hub
from datasets import load_dataset
from langchain import hub
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from langgraph.graph import END, StateGraph
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer, util

In [None]:
# Never commit API keys to a notebook: anything stored in the file is visible to
# everyone with access to the repository. Read the key from the environment and
# only fall back to an interactive, non-echoing prompt when it is missing.
# NOTE(review): any key that was ever stored in this cell should be treated as
# leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Load every PDF matched by the recursive glob below the data/ directory.
loader = DirectoryLoader(
    'data/',
    glob="**/*.pdf",
    show_progress=True,
    loader_cls=UnstructuredFileLoader,
)
documents = loader.load()

# Split the loaded documents into small, overlapping chunks before embedding.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = text_splitter.split_documents(documents)

# Embed the chunks with a sentence-transformers model and index them in FAISS.
embeddings = SentenceTransformerEmbeddings(
    model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
)
db = FAISS.from_documents(docs, embeddings)

# Retriever used by the `retrieve` graph node.
retriever = db.as_retriever()

In [None]:
class GraphState(TypedDict):
    """
    Represents the state of our graph.

    Attributes:
        keys: A dictionary where each key is a string. The node functions in
            this notebook store "question", "documents", "generation" and
            "transform_attempts" under it.
    """

    # `typing.Any`, not the builtin function `any`: the values are heterogeneous
    # (strings, lists of documents, counters).
    keys: Dict[str, Any]

In [None]:
### Nodes ###


def retrieve(state):
    """
    Retrieve documents for the current question.

    Args:
        state (dict): The current graph state; ``state["keys"]`` must contain
            "question" and may contain "transform_attempts".

    Returns:
        state (dict): ``{"keys": {...}}`` with "documents" (the retrieved
        documents), "question" and "transform_attempts" (carried over, or 0
        when the state does not have it yet).
    """
    print("---RETRIEVE---")
    state_dict = state["keys"]
    question = state_dict["question"]

    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in langchain-core.
    documents = retriever.invoke(question)

    # Keep the counter across re-retrievals; start at 0 on the first pass.
    transform_attempts = state_dict.get("transform_attempts", 0)
    return {"keys": {"documents": documents, "question": question, "transform_attempts": transform_attempts}}



def generate(state):
    """
    Generate an answer from the question and the documents in the state.

    Args:
        state (dict): The current graph state; ``state["keys"]`` must contain
            "question", "documents" and "transform_attempts".

    Returns:
        state (dict): ``{"keys": {...}}`` with the incoming "documents",
        "question" and "transform_attempts" plus the new "generation" string.
    """
    print("---GENERATE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    transform_attempts = state_dict["transform_attempts"]

    # Prompt
    prompt = hub.pull("rlm/rag-prompt")

    # LLM
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    # Post-processing: join the page contents so the prompt receives plain text
    # instead of the repr of a list of Document objects.
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Chain
    rag_chain = prompt | llm | StrOutputParser()

    # Run
    generation = rag_chain.invoke({"context": format_docs(documents), "question": question})
    return {
        "keys": {"documents": documents, "question": question, "generation": generation, "transform_attempts": transform_attempts}
    }


def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Each document is graded individually by the LLM; only documents graded as
    relevant are kept.

    Args:
        state (dict): The current graph state; ``state["keys"]`` must contain
            "question", "documents" and "transform_attempts".

    Returns:
        state (dict): ``{"keys": {...}}`` where "documents" holds only the
        relevant documents; "question" and "transform_attempts" are unchanged.
    """

    print("---CHECK RELEVANCE---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    transform_attempts = state_dict["transform_attempts"]

    # LLM
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    prompt = PromptTemplate(
        template="""Du er en vurderer som vurderer relevansen til et hentet dokument for et brukerspørsmål. \n
        Her er det hentede dokumentet: \n\n {context} \n\n
        Her er brukerspørsmålet: {question} \n
        Hvis dokumentet inneholder nøkkelord relatert til brukerspørsmålet, vurder det som relevant. \n
        Det trenger ikke å være en streng test. Målet er å filtrere ut feilaktige hentinger. \n
        Gi en binær score 'ja' eller 'nei' for å indikere om dokumentet er relevant for spørsmålet. \n
        Gi den binære scoren som en JSON med en enkelt nøkkel 'score' og ingen innledning eller forklaring.""",
        input_variables=["question", "context"],
    )

    chain = prompt | llm | JsonOutputParser()

    # The prompt asks for 'ja'/'nei'. Accept the English 'yes' as well in case
    # the model answers in English, and ignore case and surrounding whitespace.
    relevant_grades = {"ja", "yes"}

    # Score
    filtered_docs = []
    for d in documents:
        score = chain.invoke(
            {
                "question": question,
                "context": d.page_content,
            }
        )
        # A missing or malformed 'score' is treated as "not relevant" rather
        # than aborting the whole run.
        raw_grade = score.get("score", "") if isinstance(score, dict) else ""
        grade = str(raw_grade).strip().lower()
        if grade in relevant_grades:
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")

    return {
        "keys": {
            "documents": filtered_docs,
            "question": question,
            "transform_attempts": transform_attempts
        }
    }

def transform_query(state):
    """
    Transform the query to produce a better question.

    Args:
        state (dict): The current graph state; ``state["keys"]`` must contain
            "question" and "documents" and may contain "transform_attempts".

    Returns:
        state (dict): ``{"keys": {...}}`` with the re-phrased "question", the
        unchanged "documents" and "transform_attempts" incremented by one.
    """

    print("---TRANSFORM QUERY---")
    state_dict = state["keys"]
    question = state_dict["question"]
    documents = state_dict["documents"]
    # Default to 0, like the other nodes, so a missing counter means
    # "no attempts so far".
    transform_attempts = state_dict.get("transform_attempts", 0)

    # Create a prompt template with format instructions and the query
    prompt = PromptTemplate(
        template="""Du genererer spørsmål som er godt optimalisert for gjenfinning. \n
        Se på inndataene og prøv å resonnere om den underliggende semantiske hensikten / betydningen. \n
        Her er det opprinnelige spørsmålet: \n
        ------- \n
        {question} \n
        ------- \n
        Gi et forbedret spørsmål uten noen innledning, bare svar med det oppdaterte spørsmålet:  """,
        input_variables=["question"],
    )

    # LLM
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    # Prompt
    chain = prompt | llm | StrOutputParser()
    better_question = chain.invoke({"question": question})

    # The updated counter travels in the returned state only; the incoming
    # state dict is left untouched.
    return {
        "keys": {"documents": documents, "question": better_question, "transform_attempts": transform_attempts + 1}
    }




# Upper bound on query rewrites before an answer is generated anyway.
MAX_TRANSFORM_ATTEMPTS = 10


def decide_to_generate(state):
    """
    Determines whether to generate an answer or retry retrieval with a transformed query.

    Args:
        state (dict): The current state of the agent; ``state["keys"]`` must
            contain "documents" and may contain "transform_attempts"
            (treated as 0 when absent).

    Returns:
        str: Next node to call: "transform_query" when there are no relevant
        documents and fewer than MAX_TRANSFORM_ATTEMPTS rewrites were made,
        otherwise "generate".
    """

    print("---DECIDE TO GENERATE---")
    state_dict = state["keys"]
    filtered_documents = state_dict["documents"]
    # Single lookup with a default, so a state without the counter does not
    # raise KeyError.
    transform_attempts = state_dict.get("transform_attempts", 0)

    if len(filtered_documents) == 0:
        # No relevant documents found
        if transform_attempts < MAX_TRANSFORM_ATTEMPTS:
            # Retry retrieval with a transformed query
            print(f"---TRANSFORM ATTEMPTS SO FAR: {transform_attempts}---")
            print("---DECISION: RETRY RETRIEVAL WITH TRANSFORMED QUERY---")
            return "transform_query"
        # Max attempts reached, generate answer
        print("---DECISION: MAX ATTEMPTS REACHED, GENERATE ANSWER---")
        return "generate"

    # We have relevant documents, so generate answer
    print("---DECISION: GENERATE---")
    return "generate"


In [None]:
workflow = StateGraph(GraphState)

# Define the nodes
workflow.add_node("retrieve", retrieve)             # Retrieve
workflow.add_node("grade_documents", grade_documents)  # Grade documents
workflow.add_node("generate", generate)             # Generate
workflow.add_node("transform_query", transform_query)   # Transform query

# Build graph
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")

# Conditional edges based on relevance of documents
workflow.add_conditional_edges(
    "grade_documents",
    decide_to_generate,
    {
        "transform_query": "transform_query",  # If no relevant documents found, transform query
        "generate": "generate",                # If relevant documents found, generate answer
    },
)

# Retry retrieval with the transformed query. The edge must go back to
# "retrieve": routing straight to "grade_documents" would re-grade the same
# (already empty) document list, so the rewritten question would never be used.
workflow.add_edge("transform_query", "retrieve")

# Generating an answer ends the run
workflow.add_edge("generate", END)

# Compile
app = workflow.compile()


In [None]:
# Path of the question/answer CSV. The default is the original author's local
# path; set the QA_DATASET_CSV environment variable to run the notebook on
# another machine without editing this cell.
dataset_csv_path = os.environ.get(
    "QA_DATASET_CSV",
    r'/Users/adrianfolge/Documents/lokal:skole/Master/data/synthetic_data/question_with_answers.csv',
)

# Only the first 50 rows of the train split are loaded.
dataset = load_dataset('csv', data_files=dataset_csv_path, split="train[:50]")

In [None]:
list_of_answers = []

# Iterate over the questions that were actually loaded instead of a hard-coded
# range(50); the column is also read once rather than on every iteration.
for question in dataset["Question"]:
    inputs = {
        "keys": {
            "question": question,
        }
    }

    # Track the state emitted by the last node explicitly, so a stale value
    # from a previous question can never be reused.
    final_state = None
    for output in app.stream(inputs):
        for key, value in output.items():
            # Node
            pprint.pprint(f"Node '{key}':")
            final_state = value
        pprint.pprint("\n---\n")

    if final_state is None:
        raise RuntimeError(f"The graph produced no output for question: {question!r}")

    # Final generation
    generation = final_state["keys"]["generation"]
    pprint.pprint(generation)
    list_of_answers.append(generation)

In [None]:
def embed(input, model):
    """Call ``model`` on ``input`` and return whatever the call produces."""
    result = model(input)
    return result

def SAS(preds, refs, model):
    """
    Average pairwise cosine similarity between predictions and references.

    Args:
        preds: Sequence of predicted texts.
        refs: Sequence of reference texts, aligned index-by-index with ``preds``.
        model: Object with an ``encode`` method that returns one embedding per
            input text.

    Returns:
        float: Mean of the cosine similarities of the (prediction, reference) pairs.

    Raises:
        ValueError: If ``preds`` and ``refs`` differ in length or are empty.
    """
    # Fail early with a clear message instead of an IndexError, a silent
    # truncation, or a ZeroDivisionError further down.
    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length, got {len(preds)} and {len(refs)}"
        )
    if len(preds) == 0:
        raise ValueError("preds and refs must not be empty")

    embeddings_preds = model.encode(preds)
    embeddings_refs = model.encode(refs)

    # pytorch_cos_sim returns a matrix; for a single pair the score is at [0][0].
    similarities = [
        util.pytorch_cos_sim(pred_embedding, ref_embedding)[0][0].item()
        for pred_embedding, ref_embedding in zip(embeddings_preds, embeddings_refs)
    ]
    return sum(similarities) / len(similarities)

def evaluate_predictions(references, predictions):
    """
    Score predictions against references with two semantic-similarity metrics.

    Args:
        references: Sequence of reference answers.
        predictions: Sequence of predicted answers, aligned index-by-index
            with ``references``.

    Returns:
        dict: "Metric" and "Score" describe the sentence-transformer based
        score (as before); "Average SAS encoder Score" additionally carries
        the encoder based score, which was previously computed and discarded.

    Raises:
        ValueError: If the two sequences differ in length or are empty.
    """
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions and references must have the same length, "
            f"got {len(predictions)} and {len(references)}"
        )
    if len(predictions) == 0:
        raise ValueError("predictions and references must not be empty")

    ## SAS encoder score
    module_url = "https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2"
    encoder_model = tensor_hub.load(module_url)

    list_of_similarity_scores = []
    for prediction, reference in zip(predictions, references):
        # distance.cosine is a distance, so 1 - distance is the similarity.
        similarity_score = 1 - distance.cosine(
            embed([prediction], encoder_model)[0, :],
            embed([reference], encoder_model)[0, :],
        )
        list_of_similarity_scores.append(similarity_score)
    average_score = sum(list_of_similarity_scores) / len(list_of_similarity_scores)

    ## SAS transformer score
    transformer_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    data = {
        "Metric": "Average SAS transformer Score",
        "Score":  SAS(predictions, references, transformer_model),
        # Extra key: existing consumers of "Metric"/"Score" are unaffected.
        "Average SAS encoder Score": average_score,
    }
    return data

In [None]:
references = dataset["Answer"]

content_list = []

# Choose the LLM that judges the answers
llm = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Define the prompt template
prompt_template = PromptTemplate(
    template="""Task: Answer Evaluation
You are given a reference answer and a predicted answer. Your task is to determine whether the predicted answer matches the reference answer correctly. It does not have to be an exact match, but it should be somewhat the same.
- The reference answer is the correct answer.
- The predicted answer is the answer generated by a model or provided by a user.
Your response should indicate whether the predicted answer is correct or not.
Reference answer: {reference}
Predicted answer: {prediction}
Is the predicted answer correct? [Yes/No]
agent_scratchpad: This is the scratchpad where you can store intermediate information.""",
    input_variables=["prediction", "reference"]
)
chain = prompt_template | llm

# Judge every (reference, prediction) pair that exists instead of a hard-coded
# range(50); a length mismatch means the answer-generation cell is out of sync
# with the dataset and is reported explicitly.
if len(references) != len(list_of_answers):
    raise ValueError(
        f"Expected one answer per reference, got {len(list_of_answers)} answers "
        f"for {len(references)} references"
    )

for reference, prediction in zip(references, list_of_answers):
    score = chain.invoke(
        {
            "reference": reference,
            "prediction": prediction,
        }
    )
    content_list.append(score.content)



In [None]:
# The judge is a chat model and does not always answer with the bare token:
# "Yes.", "yes" or "Yes, the answers match" are all plausible. Classify each
# verdict by its first word, ignoring case and surrounding punctuation, rather
# than requiring an exact match.
first_words = [
    (verdict.strip().split() or [""])[0].strip(".,:;!?'\"[]()*").lower()
    for verdict in content_list
]
count_yes = first_words.count("yes")
count_no = first_words.count("no")
count_unparsed = len(first_words) - count_yes - count_no

# Displaying the counts
print("Number of 'Yes':", count_yes)
print("Number of 'No':", count_no)
if count_unparsed:
    # Surface verdicts that are neither yes nor no instead of dropping them silently.
    print("Number of unparsed verdicts:", count_unparsed)