# Data & Ingestion 

## Web Scraping --> Get raw data

In [None]:
import sys
from pathlib import Path

# Append the resolved parent of the current working directory to the module
# search path, so that code living one level above can be imported.
sys.path.append(str(Path('..').resolve()))

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import hashlib

def _filename_for_url(url, max_length=50):
    """Build a short ``.txt`` file name from the path (and query) of *url*.

    Slashes become underscores; an empty path maps to ``home``. Names longer
    than *max_length* are truncated and suffixed with an 8-character MD5
    digest. URLs carrying a query string always get a digest so that pages
    differing only by their query do not overwrite each other.
    """
    parsed = urlparse(url)
    name = parsed.path.replace("/", "_").strip("_") or "home"
    # Without a query the digest input is the path-derived name alone, which
    # keeps the file names identical to the previous naming scheme.
    key = f"{name}?{parsed.query}" if parsed.query else name
    if len(name) > max_length or parsed.query:
        digest = hashlib.md5(key.encode()).hexdigest()[:8]
        name = f"{name[:max_length]}_{digest}"
    return f"{name}.txt"


def scrape_esilv_website(start_url="https://www.esilv.fr/", output_dir="data/raw/web/", timeout=10):
    """Crawl a website breadth-first and save each HTML page as plain text.

    Starting from *start_url*, every internal link is followed except those
    whose path contains ``/en/`` (English pages) or starts with
    ``/download/``. The visible text of each page (scripts and styles
    removed, blank lines dropped) is written to one UTF-8 file in
    *output_dir*. Progress and errors are printed; a failing page never
    aborts the crawl.

    Args:
        start_url: Page where the crawl starts; its host defines what counts
            as an internal link.
        output_dir: Directory receiving the ``.txt`` files (created if
            missing).
        timeout: Seconds to wait for each HTTP request before giving up.

    NOTE(review): the default *output_dir* is relative to the current working
    directory, whereas the processing cells read from ``../data/raw/web/`` --
    confirm both resolve to the same folder.
    """
    from collections import deque
    from urllib.parse import urldefrag

    os.makedirs(output_dir, exist_ok=True)
    site_netloc = urlparse(start_url).netloc

    # `seen` holds every URL ever queued, giving O(1) duplicate checks.
    seen = {start_url}
    to_visit = deque([start_url])

    while to_visit:
        url = to_visit.popleft()

        # Skip English pages
        if "/en/" in urlparse(url).path:
            continue

        try:
            # A timeout prevents one stalled connection from hanging the crawl.
            response = requests.get(url, timeout=timeout)
            if response.status_code != 200:
                continue
            # Same-domain links may point at PDFs or images; only HTML is parsed.
            content_type = response.headers.get("Content-Type", "text/html")
            if "html" not in content_type.lower():
                continue
            soup = BeautifulSoup(response.text, "html.parser")

            # Clean the text: drop scripts/styles, then blank lines
            for tag in soup(["script", "style"]):
                tag.decompose()
            text = soup.get_text(separator="\n")
            text = "\n".join(line.strip() for line in text.splitlines() if line.strip())

            with open(os.path.join(output_dir, _filename_for_url(url)), "w", encoding="utf-8") as f:
                f.write(text)

            # Collect internal links
            for a_tag in soup.find_all("a", href=True):
                # Resolve against the current page (not the start page) so that
                # relative links are correct, and drop "#fragment" so the same
                # page is not crawled once per anchor.
                link, _fragment = urldefrag(urljoin(url, a_tag["href"]))
                parsed = urlparse(link)

                # Keep internal links only
                if parsed.netloc != site_netloc:
                    continue
                # Skip download links and English pages
                if parsed.path.startswith("/download/") or "/en/" in parsed.path:
                    continue
                if link not in seen:
                    seen.add(link)
                    to_visit.append(link)

            print(f"[OK] {url}")
        except Exception as e:
            # Deliberately broad: the crawl is best-effort and one bad page
            # (network error, parse error, write error) must not stop it.
            print(f"[Erreur] {url} -> {e}")


In [None]:
# Run the crawl; pages are written under data/raw/web/ relative to the current working directory.
scrape_esilv_website()

## Processed Data

### Clean up the text

Remove:
- Multiple spaces
- Unnecessary line breaks
- Recurring ads/menus
- Special characters
- Overly short content

In [None]:
import re

def clean_text(text, min_line_length=20):
    """Normalize whitespace and drop short lines from *text*.

    Runs of horizontal whitespace (spaces, tabs, carriage returns,
    non-breaking spaces) are collapsed to a single space while line breaks
    are preserved, so that the per-line length filter can actually operate.
    Lines whose stripped length is not greater than *min_line_length*
    (typically menu entries or stray labels) are removed.

    Args:
        text: Raw text, possibly spanning many lines.
        min_line_length: A line is kept only if its stripped length is
            strictly greater than this value.

    Returns:
        The surviving lines, stripped and joined with ``"\\n"``; an empty
        string if no line is long enough.
    """
    # [^\S\n] matches any whitespace character except a newline. Collapsing
    # newlines here (as a plain \s+ would) leaves a single line and turns the
    # filter below into an all-or-nothing check on the whole document.
    text = re.sub(r"[^\S\n]+", " ", text)

    kept = [
        line.strip()
        for line in text.split("\n")
        if len(line.strip()) > min_line_length
    ]

    return "\n".join(kept)

### Chunking

- Split the text into chunks of 300 to 600 words for RAG (the code below uses 300-word chunks)

In [None]:
def chunk_text(text, max_words=300):
    """Split *text* into consecutive chunks of at most *max_words* words.

    Words are obtained by splitting on whitespace; each chunk is its words
    re-joined with single spaces. The last chunk may be shorter than the
    others, and an empty or whitespace-only text yields an empty list.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    return [" ".join(tokens[start:start + max_words]) for start in starts]

### Complete pipeline for scraped files
- Loads all .txt files from data/raw/web
- Cleans
- Chops into chunks
- Saves to data/processed/web

In [None]:
import os

# Clean and chunk every scraped page, writing one file per chunk.
# NOTE(review): the scraper's default output is "data/raw/web/" (no leading
# "../") -- confirm both paths resolve to the same folder.
raw_dir = "../data/raw/web/"
processed_dir = "../data/processed/web/"
os.makedirs(processed_dir, exist_ok=True)

# Sorted so that runs process (and log) files in a reproducible order.
for filename in sorted(os.listdir(raw_dir)):
    if not filename.endswith(".txt"):
        continue

    with open(os.path.join(raw_dir, filename), "r", encoding="utf-8") as f:
        raw_text = f.read()

    cleaned = clean_text(raw_text)
    chunks = chunk_text(cleaned, max_words=300)

    # Remove only the trailing extension: str.replace would also delete any
    # ".txt" inside the name (e.g. a page saved as "robots.txt.txt").
    stem = os.path.splitext(filename)[0]

    # Save each chunk
    for i, chunk in enumerate(chunks):
        out_path = os.path.join(processed_dir, f"{stem}_chunk{i}.txt")
        with open(out_path, "w", encoding="utf-8") as out:
            out.write(chunk)

    print(f"[OK] {filename} → {len(chunks)} chunks")

### PDF Processing (Brochures)

In [None]:
import pdfplumber

# Extract the text of every PDF brochure, then clean and chunk it exactly
# like the scraped web pages.
raw_pdf_dir = "../data/raw/brochures/"
processed_pdf_dir = "../data/processed/brochures/"
os.makedirs(processed_pdf_dir, exist_ok=True)

for filename in sorted(os.listdir(raw_pdf_dir)):
    # Case-insensitive so that "BROCHURE.PDF" is not silently ignored.
    if not filename.lower().endswith(".pdf"):
        continue

    with pdfplumber.open(os.path.join(raw_pdf_dir, filename)) as pdf:
        # extract_text() may yield None for a page without a text layer
        # (e.g. a scanned image) on some pdfplumber versions; treat that as
        # empty text instead of crashing on `None + "\n"`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    text = "".join(f"{page_text}\n" for page_text in page_texts)

    cleaned = clean_text(text)
    chunks = chunk_text(cleaned)

    # Remove only the trailing extension, not every ".pdf" in the name.
    stem = os.path.splitext(filename)[0]
    for i, chunk in enumerate(chunks):
        out_path = os.path.join(processed_pdf_dir, f"{stem}_chunk{i}.txt")
        with open(out_path, "w", encoding="utf-8") as out:
            out.write(chunk)

    print(f"[OK] {filename} → {len(chunks)} chunks")

### Combine everything into a final dataset

In [None]:
# Gather the web and brochure chunks into a single folder for embedding.
combined_dir = "../data/processed/combined/"
os.makedirs(combined_dir, exist_ok=True)

# Maps each copied file name to the folder it came from, to detect clashes.
copied_from = {}
for folder in ["../data/processed/web/", "../data/processed/brochures/"]:
    for f in os.listdir(folder):
        src = os.path.join(folder, f)
        # Only chunk files are relevant; sub-directories or stray non-text
        # files would otherwise make the copy fail.
        if not (f.endswith(".txt") and os.path.isfile(src)):
            continue
        # Same name in both folders: the later copy replaces the earlier one,
        # so report it rather than losing data silently.
        if f in copied_from:
            print(f"[WARN] {f} from {folder} overwrites the copy from {copied_from[f]}")
        copied_from[f] = folder

        dst = os.path.join(combined_dir, f)
        with open(src, "r", encoding="utf-8") as infile, open(dst, "w", encoding="utf-8") as outfile:
            outfile.write(infile.read())

## Embeddings

In [None]:
import os
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from tqdm import tqdm

# Embed every chunk of the combined dataset and store the vectors in a FAISS
# index, together with a JSON list describing where each vector comes from.
data_dir = Path("../data/processed/combined/")
model_name = "all-MiniLM-L6-v2"  # fast and good enough for RAG
batch_size = 64
index_path = Path("embeddings/faiss_index.idx")
meta_path = Path("embeddings/metadata.json")

files = sorted(data_dir.glob("*.txt"))
texts = []
metas = []
for p in files:
    txt = p.read_text(encoding="utf-8").strip()
    # Skip chunks too short to carry useful content.
    if len(txt) < 50:
        continue
    # The id is the position of this text in `texts`, i.e. the row its vector
    # will occupy in the index. Numbering all files (including skipped ones)
    # would make ids drift away from the index rows.
    metas.append({"id": len(texts), "source": p.name})
    texts.append(txt)

if not texts:
    # Fail early with a clear message rather than an opaque np.vstack error.
    raise ValueError(f"No usable .txt chunks found in {data_dir}")

model = SentenceTransformer(model_name)
dim = model.get_sentence_embedding_dimension()

# Encode batch by batch so that tqdm can report progress.
embeddings = []
for start in tqdm(range(0, len(texts), batch_size)):
    batch = texts[start:start + batch_size]
    emb = model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    embeddings.append(emb)
embeddings = np.vstack(embeddings).astype("float32")

index = faiss.IndexFlatL2(dim)
index.add(embeddings)

os.makedirs(index_path.parent, exist_ok=True)
faiss.write_index(index, str(index_path))
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(metas, f, ensure_ascii=False, indent=2)

print(f"Saved {len(texts)} embeddings, dim={dim}")

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 1047/1047 [1:27:54<00:00,  5.04s/it]


Saved 66986 embeddings, dim=384


### Querying the FAISS index

In [None]:
import faiss
import numpy as np
import json
from sentence_transformers import SentenceTransformer

# Load the saved index and its metadata, then print the k nearest chunks for
# an example question.
index = faiss.read_index("embeddings/faiss_index.idx")
with open("embeddings/metadata.json", "r", encoding="utf-8") as f:
    metas = json.load(f)

model = SentenceTransformer("all-MiniLM-L6-v2")
q = "Quelle est la mission de l'école ?"  # example question
q_emb = model.encode([q], convert_to_numpy=True).astype("float32")

k = 5
D, I = index.search(q_emb, k)
# NOTE(review): the index is created as IndexFlatL2 in the embeddings cell, so
# the printed "score" is presumably a distance (lower means closer) -- confirm.
for rank, (dist, idx) in enumerate(zip(D[0], I[0]), start=1):
    # FAISS fills missing neighbours with label -1 when the index holds fewer
    # than k vectors; metas[-1] would silently print the last entry instead.
    if idx < 0:
        continue
    print(rank, "score:", float(dist), "source:", metas[idx]["source"])

1 score: 0.7864742279052734 source: actuariat-best-job-2015-double-diplome-isup_chunk4.txt
2 score: 0.78709876537323 source: bachelor-en-ecole-dingenieurs-quels-stages-pour-qu_0f77f7c5_chunk0.txt
3 score: 0.8098337054252625 source: plaquette_apprentissage_esilv_chunk13.txt
4 score: 0.8124736547470093 source: lingenieur-dans-le-monde-quels-diplomes-et-niveaux_69fb9fa3_chunk5.txt
5 score: 0.8165364265441895 source: jeremie-diplome-2008-en-ingenierie-financiere-est-_890ba857_chunk3.txt
