In [1]:
import os
from langchain_community.vectorstores import FAISS
from langchain.chat_models import init_chat_model
from langchain_huggingface import HuggingFaceEmbeddings
# Send traces of every LLM / graph call to LangSmith.
os.environ["LANGSMITH_TRACING"] = "true"


def _require_secret(var_name):
    """Make sure the environment variable ``var_name`` holds a non-empty value.

    If the variable is already set it is left untouched; otherwise the user is
    prompted for it without echoing the input. Secrets must never be written
    into the notebook itself: any key that was previously committed here has
    to be treated as compromised and rotated with the provider.
    """
    import getpass  # local import: only needed for this interactive fallback

    if not os.environ.get(var_name):
        os.environ[var_name] = getpass.getpass(f"Enter {var_name}: ")


_require_secret("LANGSMITH_API_KEY")
_require_secret("GROQ_API_KEY")

# Chat model used for answer generation (served by Groq).
llm = init_chat_model("llama3-8b-8192", model_provider="groq")

# Embedding model used to index and query the documents. Set
# EMBEDDING_MODEL_NAME to a local snapshot directory to load from disk;
# by default the Hugging Face hub id is used so the notebook does not depend
# on one user's home directory.
EMBEDDING_MODEL_NAME = os.environ.get(
    "EMBEDDING_MODEL_NAME", "sentence-transformers/all-mpnet-base-v2"
)
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

INDEXING PART


Loading a CSV file

In [2]:
from langchain_community.document_loaders import CSVLoader
from langchain_core.documents import Document

# Load the CSV. The parsing below relies on each loaded Document holding one
# "column: value" line per CSV column and a 'row' entry in its metadata.
loader = CSVLoader(file_path="./students.csv")
data = loader.load()


def _student_sentence(doc):
    """Render one loaded CSV row as a single English sentence.

    The first seven "column: value" lines of ``doc.page_content`` are read
    positionally, in the order the sentence template uses them: ID, first
    name, last name, birth date, nationality, gender, enrollment date
    (field meanings taken from the sentence wording; the CSV header itself is
    not visible here).

    Raises:
        IndexError: if one of those lines contains no ": " separator.
        ValueError: if the row has fewer than seven lines.
    """
    lines = doc.page_content.splitlines()[:7]
    # maxsplit=1 keeps a value intact even when it contains ": " itself.
    student_id, first_name, last_name, birth_date, nationality, gender, enrolled_on = (
        line.split(": ", 1)[1] for line in lines
    )
    return (
        f"{doc.metadata['row']}: {first_name}, {last_name}, "
        f"with ID {student_id}, is a {gender} student "
        f"born in {birth_date}, with {nationality} nationality "
        f"and the enrollment in the university was done in {enrolled_on}."
    )


# Turn every row into a structured sentence, keeping the loader's metadata.
documents = [
    Document(page_content=_student_sentence(doc), metadata=doc.metadata)
    for doc in data
]

# Print the document at index 4 (the fifth row) as a sanity check.
print(documents[4])


page_content='4: Elena, Gonzalez, with ID 5, is a Female student born in 1999-09-05, with Spanish nationality and the enrollment in the university was done in 2022-02-14.' metadata={'source': './students.csv', 'row': 4}


Vector store

In [3]:
# Build the FAISS vector store straight from the documents; the embeddings
# are computed by LangChain through `embedding_model` during this call.
vector_store = FAISS.from_documents(documents, embedding_model)

# Show the mapping from FAISS index position to docstore id.
print(vector_store.index_to_docstore_id)
print("FAISS vector store created successfully!")


{0: '3e03f8c7-e485-430a-8e83-498766c318d2', 1: 'a24d840f-6c5e-4a30-929c-68ad29990eed', 2: '86485dce-b1f3-410a-b2d7-33c0f663c4bf', 3: '2f8911e7-e830-4f57-81b5-40a8dc14aa31', 4: '9abca1ad-d710-4ea0-97b6-79851f1d8622'}
FAISS vector store created successfully!


Convert each CSV row into a natural-language sentence so that the data is easier for the LLM to understand.

RETRIEVAL AND GENERATION PART

In [4]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document
from langchain import hub
# Define prompt for question-answering: pull the "rlm/rag-prompt" template
# from the LangChain hub. It is later invoked with the keys "question" and
# "context" (see `generate`).
prompt = hub.pull("rlm/rag-prompt")
# Define state for application
class State(TypedDict):
    """State dictionary passed between the steps of the RAG graph."""

    # The user's question; supplied when the graph is invoked.
    question: str
    # Documents returned by the `retrieve` step.
    context: List[Document]
    # Text of the LLM response, set by the `generate` step.
    answer: str
# Define application steps
def retrieve(state: State):
    """Look up the documents most similar to the question.

    Runs a similarity search on the module-level `vector_store` for
    ``state["question"]``, asking for 5 results, and returns them as a
    partial state update under the key "context".
    """
    query = state["question"]
    matches = vector_store.similarity_search(query, k=5)
    return {"context": matches}


def generate(state: State):
    """Answer the question from the retrieved context.

    Joins the page contents of ``state["context"]`` with blank lines, fills
    the module-level `prompt` with that text and ``state["question"]``, sends
    the result to `llm`, and returns the response text as a partial state
    update under the key "answer".
    """
    passages = [doc.page_content for doc in state["context"]]
    prompt_input = {
        "question": state["question"],
        "context": "\n\n".join(passages),
    }
    messages = prompt.invoke(prompt_input)
    return {"answer": llm.invoke(messages).content}

Control flow: Finally, we compile our application into a single graph object. In this case, we are just connecting the retrieval and generation steps into a single sequence.

In [5]:
# Assemble the graph: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])  # adds both nodes, chained in order
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [6]:
# Test the RAG application: every question is run through the graph in turn
# and the generated answer is printed.
questions = [
    "When did Giulia Rossi enroll and how did you get the information?",
    "How many students are enrolled?",
    # Aggregation question of the kind SQL rules would handle; the original
    # note records the LLM answering: "I don't know. The given context does
    # not provide information about the number of students enrolled in a
    # specific year."
    "In which year the large number of students enrolled? Try to count the number of students enrolled per year",
    "Where Elena Gonzales comes from?",
    "Which is the youngest student?",
]

for question in questions:
    result = graph.invoke({"question": question})
    print(result["answer"])

Giulia Rossi enrolled in the university on 2022-09-01. This information was obtained from the provided context, specifically from the student's record with ID 1.
There are 5 students enrolled in total.
Based on the provided context, the year with the large number of students enrolled is 2022, with three students (Giulia, Marco, and Elena) enrolling in the university that year.
Elena Gonzales comes from Spain, as indicated by her Spanish nationality.
The youngest student is Sophie, Durand, with a birthdate of 2000-07-30.
