In [None]:
# Installs only langchain-pinecone into the kernel's environment. The other
# packages imported by this notebook (langchain, langchain-groq,
# langchain-text-splitters, pandas, tqdm, crewai, crewai-tools,
# langchain-openai, ...) are assumed to be installed already — TODO confirm,
# or add them here with pinned versions.
%pip install langchain-pinecone

In [None]:
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
import os
import pandas as pd
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate

import getpass


def _load_secret(env_var):
    """Return the secret held in environment variable ``env_var``.

    When the variable is unset or empty, the user is prompted for the value
    (input is not echoed) and the answer is exported to ``os.environ`` under
    the same name so that later cells can read it from there as well.
    """
    value = os.environ.get(env_var)
    if not value:
        value = getpass.getpass(f"{env_var}: ")
        os.environ[env_var] = value
    return value


# SECURITY: credentials must never be written into the notebook. The keys that
# used to be hardcoded here have been exposed and should be rotated.
pinecone_api_key = _load_secret("PINECONE_API_KEY")
groq_api_key = _load_secret("GROQ_API_KEY")

In [None]:
# Path of the JSON export, relative to the notebook's working directory.
relative_path = os.path.join('.', 'final_results.json')

# Splitter shared with the chunking cell below.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

# Load the JSON file into a DataFrame.
data = pd.read_json(relative_path)

# Fail early with a clear message instead of a KeyError deep inside a loop.
missing_columns = {'article_text', 'title'} - set(data.columns)
if missing_columns:
    raise KeyError(
        f"{relative_path} is missing required column(s): {sorted(missing_columns)}"
    )

# One Document per article. Rows without text are skipped and a missing title
# becomes "" so that this list follows the same rules as the chunking cell.
documents = [
    Document(
        page_content=row['article_text'],
        metadata={
            'title': row['title'] if pd.notnull(row['title']) else "",
        }
    )
    for _, row in data.iterrows()
    if isinstance(row['article_text'], str) and row['article_text'].strip()
]

In [None]:
# Chunked documents that will be embedded and uploaded to the vector store.
split_documents = []

# Split every article into overlapping chunks; each chunk keeps the article title.
for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Splitting documents"):
    article_text = row['article_text']
    # Rows with missing/non-string or blank text cannot be split; skip them
    # instead of letting one bad row abort the whole run.
    if not isinstance(article_text, str) or not article_text.strip():
        continue

    title = row['title'] if pd.notnull(row['title']) else ""
    split_documents.extend(
        Document(page_content=chunk, metadata={'title': title})
        for chunk in text_splitter.split_text(article_text)
    )

In [None]:
# Initialize the Pinecone client.
pc = Pinecone(api_key=pinecone_api_key)

# Connect to the Pinecone index.
index_name = "elrond-index"
index = pc.Index(index_name)

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Uploading on every run would embed all chunks again and store duplicate
# vectors, so only upload when the index is empty. Set force_reindex = True to
# upload regardless (clear the index first to avoid duplicates).
force_reindex = False
existing_vector_count = index.describe_index_stats().total_vector_count

if force_reindex or existing_vector_count == 0:
    vectorstore = PineconeVectorStore.from_documents(
        split_documents,
        embeddings,
        pinecone_api_key=pinecone_api_key,
        index_name=index_name,
    )
else:
    # Attach to the vectors that are already stored. The API key is taken from
    # the PINECONE_API_KEY environment variable exported in the setup cell.
    vectorstore = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [None]:
# Sample question used to exercise retrieval and, further down, the RAG chain.
query = "What is U-boot?"

# Similarity search returning the 6 closest chunks for a query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)

# Run the retriever once on its own so the matched chunks can be inspected.
retrieved_docs = retriever.invoke(query)

In [None]:
# Prompt text with two placeholders: {question} and {context}.
prompt = """
You are an assistant for question-answering tasks specifically on Embedded Linux and its components like U-boot, Linux kernel, hardware and software stack. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. Answer the questions with a maximum of 512 words.

Question: {question} 

Context: {context} 

Answer:
"""
custom_rag_prompt = PromptTemplate.from_template(prompt)

# Groq-hosted chat model; temperature 0 is used for the answers.
llm = ChatGroq(
    model="mixtral-8x7b-32768",
    groq_api_key=groq_api_key,
    temperature=0,
)

In [None]:

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# The incoming question is used twice: passed through unchanged as "question",
# and sent to the retriever whose results are flattened into "context".
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# inputs -> prompt -> chat model -> plain string
rag_chain = chain_inputs | custom_rag_prompt | llm | StrOutputParser()

# Print the answer piece by piece as it is streamed back.
for piece in rag_chain.stream(query):
    print(piece, end="", flush=True)

## Guide for Agentic Evaluation 

In [None]:
import os
from crewai import Agent, Task, Crew, Process
from crewai_tools import SerperDevTool
from langchain_openai import ChatOpenAI

# You can choose to use a local model through Ollama for example. See https://docs.crewai.com/how-to/LLM-Connections/ for more information.
os.environ["OPENAI_MODEL_NAME"] = 'gpt-4-0125-preview'

# Resolve the OpenAI key without writing it into the notebook. Order:
#   1. an API_KEY_OPENAI variable, if some earlier cell defined one,
#   2. the OPENAI_API_KEY environment variable,
#   3. an interactive prompt (input is not echoed).
# Referencing API_KEY_OPENAI directly raised NameError on a fresh kernel.
openai_api_key = globals().get("API_KEY_OPENAI") or os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    import getpass

    openai_api_key = getpass.getpass("OPENAI_API_KEY: ")
os.environ["OPENAI_API_KEY"] = openai_api_key

# Web-search tool instance; none of the agents defined in this cell receive it.
search_tool = SerperDevTool()

# Define your agents with roles and goals


def _reviewer_llm():
    """Build the chat model used by a reviewer agent (one instance per agent)."""
    return ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0)


# Flags shared by every reviewer agent.
_REVIEWER_FLAGS = dict(verbose=True, allow_delegation=False)

# Role and backstory shared by the two customer-persona agents.
_CUSTOMER_ROLE = "Toradex customer evaluating and comparing AI and human responses on community forums"
_CUSTOMER_BACKSTORY = """You are a Toradex customer who regularly uses products like the Apalis, Colibri, and Verdin modules and engages with the Torizon platform. You rely on the Toradex community forum for solving technical issues and are accustomed to evaluating the quality and effectiveness of the solutions provided."""

# Expert persona: scores the AI answer from an engineer's point of view.
Engineer_Reviewer = Agent(
  role="Embedded Linux Software engineer evaluating AI responses",
  goal="""Give a score from 0 to 1 to the AI answer, defining how it compares to a human answer in terms of clarity, completeness, accuracy, and relevance to embedded Linux topics. 
  The score 0 means that the AI answer is very different or unrelated compared to the human answer. 
  The score 1 means that the AI answer fully accomplishes the goal and is very similar to the human answer, exhibiting attributes like precision, technical correctness, and 
  effective communication of concepts pertinent to Embedded Linux development.""",
  backstory="""You are a senior Embedded Linux Software engineer for Toradex, specialized in Linux kernel, U-boot, Firmware, Real-time, Yocto Project.
  You possess in-depth knowledge of Toradex offerings, such as Torizon and the hardware families (Colibri, Apalis, and Verdin). 
  You have extensive experience in assessing technical responses and are accustomed to evaluating clarity and accuracy in technical communication within the field.""",
  llm=_reviewer_llm(),
  **_REVIEWER_FLAGS
)

# Customer persona: scores the AI answer from a forum user's point of view.
Customer_Reviewer = Agent(
  role=_CUSTOMER_ROLE,
  goal="""Assess the AI-generated response against a corresponding human response from the community forum. Evaluate both responses based on how well they solve the problem presented, their clarity, directness, and relevance. Assign a score from 0 to 1, where 0 indicates that the AI response is completely divergent from the effective human response, and 1 indicates that the AI response is equivalent to or exceeds the human response in solving the query.""",
  backstory=_CUSTOMER_BACKSTORY,
  llm=_reviewer_llm(),
  **_REVIEWER_FLAGS
)

# Aggregating persona: its goal is to average the two scores.
Reviewer = Agent(
  role=_CUSTOMER_ROLE,
  goal="""Consider both scores and return the average of the two values""",
  backstory=_CUSTOMER_BACKSTORY,
  llm=_reviewer_llm(),
  **_REVIEWER_FLAGS
)
# Create tasks for your agents
#
# Both task descriptions contain the placeholders {human_answer} and
# {ai_answer}; the evaluation loop supplies them through the `inputs` dict
# passed to `crew.kickoff`. Each task asks for a bare number between 0.0 and
# 1.0 as its output.
# NOTE(review): the prompt text contains typos ("simillar", "extremaly"); they
# are part of the text sent to the model and are left untouched here.

# task1: Engineer_Reviewer compares the AI answer with the human answer on
# clarity, completeness, accuracy and relevance.
task1 = Task(
  description="""Evaluate the given AI-generated response to a technical query related to Embedded Linux development. 
  Consider factors such as clarity, completeness, accuracy, and relevance to the topic in your evaluation. 
  You will take into consideration the following aspects
  1) Compare the AI response to what a well-informed human expert in Embedded Linux Software Engineering would provide.
  2) Compare if the answers have simillar contents and solves the problem in simillar ways 
  3) The answer is clear and direct

  Be extremaly critical and severe on your comparison!
  
  You will compare the answers:
  Human answer: {human_answer}

  AI answer: {ai_answer}

  """,
  expected_output="""A numeric score ranging from 0 to 1, where 0 indicates that the AI-generated response is very different or unrelated compared to what a knowledgeable human expert would provide, 
  and 1 indicates that the AI-generated response is on par with a human expert in terms of clarity, completeness, accuracy, and relevance to Embedded Linux development.
  The answer MUST be in the form below, having only a number between 0.0 and 1.0:
  
  [score]
  """,
  agent=Engineer_Reviewer,
  #async_execution=True
)

# task2: Customer_Reviewer compares the AI answer with the human forum answer
# on clarity, directness, relevance and problem-solving effectiveness.
# NOTE(review): the description hardcodes one scenario ("Yocto Project on the
# Verdin development board") although the evaluation loop sends every sample
# through this task — confirm this wording is intended for all questions.
task2 = Task(
  description="""Analyze and compare an AI-generated response with a human-provided response to a query posted on the community.toradex.com forum regarding an issue with using the Yocto Project on the Verdin development board. 
  Evaluate both responses based on clarity, directness, relevance, and problem-solving effectiveness. Consider how closely the AI-generated response aligns with the quality of the human response.
  Be extremaly critical and severe on your comparison!
  
  You will compare the answers:
  Human answer: {human_answer}

  AI answer: {ai_answer}
  """,
  expected_output="""A numeric score from 0 to 1, where 0 means the AI response is vastly inferior or unrelated compared to the human response, 
  and 1 means the AI response matches or surpasses the human response in addressing the query effectively. 
  The answer MUST be in the form below, having only a number between 0.0 and 1.0:
  
  [score]
  """,
  agent=Customer_Reviewer,
  #async_execution=True
)

# Instantiate your crew with a sequential process.
# Only the engineer and customer reviewers take part, one task each.
evaluation_agents = [Engineer_Reviewer, Customer_Reviewer]
evaluation_tasks = [task1, task2]

crew = Crew(
  agents=evaluation_agents,
  tasks=evaluation_tasks,
  full_output=True,  # the evaluation loop reads the per-task outputs from the result
  verbose=False,  # You can set it to 1 or 2 to different logging levels
)

In [None]:
import random
import re


def _parse_score(raw_output):
    """Convert a task's raw output into a float score.

    A plain number ("0.8") is parsed directly. Otherwise the first number found
    in the text is used, so replies such as "[0.8]" or "Score: 0.8" still work.

    Raises:
        ValueError: if the output contains no number at all.
    """
    text = str(raw_output).strip()
    try:
        return float(text)
    except ValueError:
        match = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)", text)
        if match is None:
            raise ValueError(f"no numeric score found in task output: {text!r}") from None
        return float(match.group())


# Evaluate the first 600 samples, or fewer when data_test is shorter.
# Alternative: a random subset, e.g. random.sample(range(1, 599), 100).
# NOTE(review): data_test is not defined in the visible cells; it is assumed
# to be an indexable sequence of records with 'answer' and 'generated_answer'
# keys — TODO confirm.
res = list(range(min(600, len(data_test))))

average_scores_list = []
skipped_samples = []

# The loop variable is deliberately not called `index`: that name holds the
# Pinecone index handle created earlier in the notebook.
for sample_idx in tqdm(res, desc="Evaluating answers"):
    row = data_test[sample_idx]
    inputs = {
        'human_answer': row['answer'],
        'ai_answer': row['generated_answer']
    }
    # Kickoff the crew with the correctly structured inputs
    result = crew.kickoff(inputs=inputs)

    # Extract one score per task. A reply that cannot be parsed skips this
    # sample instead of aborting the whole run.
    try:
        scores = [_parse_score(task.exported_output) for task in result['tasks_outputs']]
    except ValueError as err:
        skipped_samples.append(sample_idx)
        print(f"Skipping sample {sample_idx}: {err}")
        continue
    if not scores:
        skipped_samples.append(sample_idx)
        print(f"Skipping sample {sample_idx}: no task outputs returned")
        continue

    # Average of the reviewers' scores for this sample
    average_scores_list.append(round(sum(scores) / len(scores), 2))

if skipped_samples:
    print(f"Skipped {len(skipped_samples)} of {len(res)} samples: {skipped_samples}")

if average_scores_list:
    average_score = round(sum(average_scores_list) / len(average_scores_list), 2)
    print("Overall Average Score:", average_score)
else:
    print("No samples could be scored.")