In [6]:
import requests
from bs4 import BeautifulSoup
import re

## scrap websites

In [7]:
urls = [
    "https://www.formula1.com/en/latest.html",
    # "https://www.motorsport.com/f1/news/",
]

In [8]:
def scrape_url(url):
    """Récupère et nettoie le texte d'une page web."""
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, "html.parser")
        paragraphs = soup.find_all("p")  # On récupère les paragraphes
        text = " ".join([p.get_text() for p in paragraphs])
        return text
    return ""

In [9]:
documents = {url: scrape_url(url) for url in urls}


In [10]:
len(documents["https://www.formula1.com/en/latest.html"])

1639

## chunk

In [11]:
def split_text(text, chunk_size=500):
    """Découpe un texte en morceaux de taille définie."""
    sentences = re.split(r'(?<=[.!?])\s+', text)  # On coupe aux points
    chunks, chunk = [], ""

    for sentence in sentences:
        if len(chunk) + len(sentence) < chunk_size:
            chunk += sentence + " "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + " "
    
    if chunk:  # Ajouter le dernier morceau
        chunks.append(chunk.strip())
    
    return chunks

In [15]:
# Découper les documents en chunks
text_chunks = {url: split_text(text) for url, text in documents.items()}

In [13]:
text_chunks

{'https://www.formula1.com/en/latest.html': ['',
  "MUST-SEE: Behind the scenes of how the driving sequences for Apple Original Films' 'F1' movie were shot PADDOCK INSIDER: Tsunoda's Red Bull drive is the chance he believed he was ready for – and he'll want to seize it Russell admits surprise at pace of Mercedes’ car as he predicts ‘interesting test’ for team in Japan 'He’s an extremely fast driver’ – Gasly assesses whether former team mate Tsunoda can succeed in ‘complicated’ Red Bull opportunity ‘It’s been quite a challenge ‘ – Sainz gives his honest reflections on first Williams outings BEYOND THE GRID: Jaime Alguersuari on his record-breaking debut to high-pressure exit WEEKEND WARM-UP: McLaren search for a third straight win as Tsunoda gets set for Red Bull debut on home soil  EXCLUSIVE: Perez reveals talks with ‘a few’ teams as he hints at possible F1 return ANALYSIS: Why Red Bull decided swift action was needed as Tsunoda is promoted in place of Lawson STRATEGY GUIDE: What are t

## Convertir le texte en embeddings et stocker dans FAISS

In [21]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Charger un modèle d'embedding
model = SentenceTransformer("all-MiniLM-L6-v2")

# Transformer les chunks en embeddings
all_chunks = [chunk for chunks in text_chunks.values() for chunk in chunks]
embeddings = np.array(model.encode(all_chunks))

# Création de l'index FAISS
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 = distance euclidienne
index.add(embeddings)

# Sauvegarde de l’index
faiss.write_index(index, "faiss_index.bin")
np.save("chunks.npy", np.array(all_chunks))  # Sauvegarde des textes


## Rechercher les chunks pertinents lorsqu'une question est posée

In [17]:
def search_question(question, top_k=3):
    """Recherche les chunks les plus proches d'une question dans l'index FAISS."""
    index = faiss.read_index("faiss_index.bin")
    all_chunks = np.load("chunks.npy", allow_pickle=True)
    
    question_embedding = np.array(model.encode([question]))  # Embedding de la question
    _, indices = index.search(question_embedding, top_k)  # Recherche des plus proches voisins
    
    return [all_chunks[i] for i in indices[0]]

In [22]:
question = "Qui a gagné le championnat du monde de F1 en 2024 ?"
relevant_chunks = search_question(question)
print(relevant_chunks)

['PALMER: Piastri was too hard on himself after his slip on Sunday – his race in Australia was one of his best yet TECH WEEKLY: Why Kick Sauber are performing much better than their pre-season testing form suggested TECH WEEKLY: The key reasons behind Leclerc and Hamilton’s disqualifications – but do Ferrari have a headache with the SF-25? TECH WEEKLY: Has McLaren’s secret weapon for the 2025 season been revealed?', 'Formula 2 racer Victor Martins joins Williams Driver Academy  F1 ACADEMY: Untouchable Pin sees off Weug to take Shanghai Race 2 victory McLaren Mercedes Red Bull Racing Ferrari © 2003-2025 Formula One World Championship Limited', "MUST-SEE: Behind the scenes of how the driving sequences for Apple Original Films' 'F1' movie were shot PADDOCK INSIDER: Tsunoda's Red Bull drive is the chance he believed he was ready for – and he'll want to seize it Russell admits surprise at pace of Mercedes’ car as he predicts ‘interesting test’ for team in Japan 'He’s an extremely fast drive

In [25]:
relevant_chunks

['PALMER: Piastri was too hard on himself after his slip on Sunday – his race in Australia was one of his best yet TECH WEEKLY: Why Kick Sauber are performing much better than their pre-season testing form suggested TECH WEEKLY: The key reasons behind Leclerc and Hamilton’s disqualifications – but do Ferrari have a headache with the SF-25? TECH WEEKLY: Has McLaren’s secret weapon for the 2025 season been revealed?',
 'Formula 2 racer Victor Martins joins Williams Driver Academy  F1 ACADEMY: Untouchable Pin sees off Weug to take Shanghai Race 2 victory McLaren Mercedes Red Bull Racing Ferrari © 2003-2025 Formula One World Championship Limited',
 "MUST-SEE: Behind the scenes of how the driving sequences for Apple Original Films' 'F1' movie were shot PADDOCK INSIDER: Tsunoda's Red Bull drive is the chance he believed he was ready for – and he'll want to seize it Russell admits surprise at pace of Mercedes’ car as he predicts ‘interesting test’ for team in Japan 'He’s an extremely fast dri

## Générer une réponse avec un modèle GPT

In [None]:
import openai

openai.api_key = "TA_CLE_API_OPENAI"  # Remplace avec ta clé OpenAI

def generate_answer(question, context):
    """Génère une réponse basée sur le contexte extrait."""
    prompt = f"Réponds à la question suivante en te basant uniquement sur ces informations :\n\n{context}\n\nQuestion : {question}\nRéponse :"
    
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    return response["choices"][0]["message"]["content"]

# Utiliser les chunks trouvés pour générer une réponse
context = " ".join(relevant_chunks)
answer = generate_answer(question, context)
print("Réponse :", answer)
