In [3]:
from dotenv import load_dotenv
import os

def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty value of the variable.

    Raises:
        EnvironmentError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(f"{name} not found in .env")
    return value


# Load the variables declared in the local .env file into the process environment.
load_dotenv()

# Fail fast, in a fixed order, if a credential the notebook relies on is missing.
# The values are read straight from os.environ, so writing them back to
# os.environ (as the previous version did) is unnecessary.
openai_api_key = _require_env("OPENAI_API_KEY")
langsmith_api_key = _require_env("LANGSMITH_API_KEY")
google_api_key = _require_env("GOOGLE_API_KEY")
# NOTE(review): boto3 reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the
# environment, not AWS_ACCESS_KEY. Confirm how the S3 client actually gets its
# credentials (profile, role, or differently named variables).
aws_access_key = _require_env("AWS_ACCESS_KEY")

In [8]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Page to ingest. Only the element with id="contenu" is handed to the parser.
ARTICLE_URL = "https://www.polemia.com/reveil-des-nations-orban-appelle-a-suivre-lexemple-de-trump/"

bs4_strainer = bs4.SoupStrainer(id="contenu")
loader = WebBaseLoader(
    web_paths=(ARTICLE_URL,),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Report the size of the first loaded document as a quick sanity check.
total_characters = len(docs[0].page_content)
print(f"Total characters: {total_characters}")

Total characters: 3997


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings; sizes are expressed in characters.
splitter_options = {
    "chunk_size": 1000,  # chunk size (characters)
    "chunk_overlap": 200,  # chunk overlap (characters)
    "add_start_index": True,  # track index in original document
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(docs)

split_count = len(all_splits)
print(f"Split blog post into {split_count} sub-documents.")

Split blog post into 7 sub-documents.


In [7]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Embedding model used to build the index; it is passed again when the saved
# index is reloaded further down.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
)

# Building the FAISS index requires the faiss-cpu package.
faiss_vector_store = FAISS.from_documents(
    all_splits,
    embedding_model,
)

### For sync operations: use an S3 bucket

In [13]:
import boto3

# S3 client bound to the eu-north-1 region; used for the index upload and
# download steps.
s3 = boto3.client(
    "s3",
    region_name="eu-north-1",
)

### Save to local index

In [20]:
# Persist the in-memory FAISS index under ./vectorstore; the index.faiss and
# index.pkl files in that folder are what gets uploaded to the bucket.
faiss_vector_store.save_local(folder_path="./vectorstore")

### Synchronize with bucket

In [23]:
# Push both local index files to the bucket under the vectorstores/ prefix.
for index_file in ("index.faiss", "index.pkl"):
    s3.upload_file(
        f"./vectorstore/{index_file}",
        "rag-faiss-index-bucket",
        f"vectorstores/{index_file}",
    )

### Now read from bucket

In [25]:
# Make sure the target folder exists, then pull both index files back from
# the bucket into it.
os.makedirs("./vectorstore", exist_ok=True)
for index_file in ("index.faiss", "index.pkl"):
    s3.download_file(
        "rag-faiss-index-bucket",
        f"vectorstores/{index_file}",
        f"./vectorstore/{index_file}",
    )

### Load vector store

In [30]:
# SECURITY NOTE: allow_dangerous_deserialization=True permits loading the
# pickled index.pkl, which in this notebook has just been downloaded from S3.
# Unpickling can execute arbitrary code, so only load from a bucket whose
# contents you fully trust.
vector_store = FAISS.load_local(
    "./vectorstore",
    embedding_model,
    allow_dangerous_deserialization=True,
)

In [31]:
from langchain_core.prompts import PromptTemplate


# Prompt text with two placeholders: {question} and {context}.
RAG_TEMPLATE = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise. 
Question: {question} 
Context: {context} 
Answer:"""

prompt = PromptTemplate.from_template(RAG_TEMPLATE)

# Render the prompt once with dummy values to preview the final text.
placeholder_values = {
    "context": "(context goes here)",
    "question": "(question goes here)",
}
example_messages = prompt.invoke(placeholder_values).to_messages()

assert len(example_messages) == 1
print(example_messages[0].content)

You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise. 
Question: (question goes here) 
Context: (context goes here) 
Answer:


In [32]:
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langchain.chat_models import init_chat_model


class State(TypedDict):
    """Shared state passed between the graph steps (retrieve, then generate)."""
    # Question text; read by both retrieve and generate.
    question: str
    # Documents returned by retrieve; generate joins their page_content.
    context: List[Document]
    # Content of the model response, as returned by generate.
    answer: str

def retrieve(state: State):
    """Search the loaded vector store for documents similar to the question.

    Args:
        state: Graph state; only ``state["question"]`` is read.

    Returns:
        A partial state update with the matching documents under ``"context"``.
    """
    question = state["question"]
    matches = vector_store.similarity_search(question)
    return {"context": matches}


from functools import lru_cache


@lru_cache(maxsize=1)
def _get_llm():
    """Build the chat model on first use and reuse it on later calls."""
    return init_chat_model("gemini-2.5-flash", model_provider="google_genai")


def generate(state: State):
    """Answer the question from the retrieved context.

    Args:
        state: Graph state; reads ``state["question"]`` and ``state["context"]``.

    Returns:
        A partial state update with the model response content under ``"answer"``.
    """
    # Concatenate the retrieved passages, separated by blank lines.
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])
    messages = prompt.invoke({"question": state["question"], "context": docs_content})
    # The model is created once via _get_llm() instead of on every invocation.
    response = _get_llm().invoke(messages)
    return {"answer": response.content}

In [33]:
from langgraph.graph import START, StateGraph

# Wire the two steps in order (retrieve, then generate), entering at "retrieve".
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile()

In [34]:
# Sample question, in French: "What gave Trump his victory?"
demo_question = "Qu'est-ce qui a donné sa victoire à Trump ?"
result = graph.invoke({"question": demo_question})

# Show the retrieved documents first, then the generated answer.
context_docs, answer_text = result["context"], result["answer"]
print(f"Context: {context_docs}\n\n")
print(f"Answer: {answer_text}")

Context: [Document(id='53124be4-5c8a-458a-81b4-d0affcf533eb', metadata={'source': 'https://www.polemia.com/reveil-des-nations-orban-appelle-a-suivre-lexemple-de-trump/', 'start_index': 3256}, page_content='La paix : pas de nouveau front à l’est !\nLa souveraineté : pas de centralisation bruxelloise\nLes libertés : que les citoyens soient libres de penser et d’agir\nLa culture enracinée dans les traditions contre l’idéologie nihiliste\n\nComment imposer cette option ? Par la victoire des forces nationales dans chaque pays. Les nations sont le levier du changement européen. Tout commence et se décide à la base, par le succès politique de terrain. «\xa0Fight! Fight! Fight!\xa0» : ce qui a donné à Trump sa victoire.\nLa victoire de Trump est un atout en ce sens qu’il liquide en Amérique l’État profond, et que cette libération doit se propager pleinement en Europe.\nThibaud Gibelin\n29/05/2025\nSource : Fil Twitter de Thibaud Gibelin\nCrédit image : IA'), Document(id='c2f1b95d-2e61-4f09-a49