# INTRODUCTION TO A RAG-IMPLEMENTATION

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Charger un modèle open-source
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [5]:
# Ajouter un pad_token_id si nécessaire
tokenizer.pad_token = tokenizer.eos_token

# Prompt simple
prompt = "Who's Charles Darwin"

# Tokenisation avec attention_mask et padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Génération
output = model.generate(
    inputs.input_ids,
    max_length=100,  # Longueur max de la génération
    num_return_sequences=1,  # Nombre de textes générés
    temperature=0.7,  # Contrôle de la créativité
    do_sample=True,  # Active le sampling
    pad_token_id=tokenizer.eos_token_id  # Définit un token de padding
)

# Décodage du texte généré
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Texte généré :")
print(generated_text)

Texte généré :
Who's Charles Darwin?"

"I don't know."

"Not exactly. But I do know that if you wanted to be a man, you'd have to learn the language. But you'd be the most difficult man to learn. So you're going to learn to speak English."

"English?"

"Yes. And I'm going to learn how to read, how to write, how to read and write."

"Why are you so interested in


# PDF READER


In [6]:
from PyPDF2 import PdfReader

# Charger le PDF
pdf_path = "/data/health_systems/Psychology_is_improving_brain_health_and_aging.pdf"
reader = PdfReader(pdf_path)

# Extraire tout le texte
document_text = ""
for page in reader.pages:
    document_text += page.extract_text()

# Afficher un aperçu du texte extrait
print("Texte extrait (aperçu) :", document_text[:500])

# Afficher le nombre total de caractères
print("Nombre total de caractères :", len(document_text))

def split_text(text, max_length=100):
    sentences = text.split(". ")
    chunks, chunk = [], []
    current_length = 0

    for sentence in sentences:
        chunk.append(sentence)
        current_length += len(sentence.split())
        if current_length >= max_length:
            chunks.append(". ".join(chunk))
            chunk, current_length = [], 0

    if chunk:
        chunks.append(". ".join(chunk))
    return chunks

segments = split_text(document_text)
print("Nombre de segments :", len(segments))


FileNotFoundError: [Errno 2] No such file or directory: '/data/health_systems/Psychology_is_improving_brain_health_and_aging.pdf'

In [None]:

def read_text_file(file_path):
    """Lire le contenu d'un fichier texte"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def split_text_by_characters(text, max_length=1000):
    """Diviser le texte en segments de longueur maximale (en caractères)"""
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        start = end
    return chunks

# Exemple d'utilisation avec un fichier texte
file_path = "/data/Darwin_data/darwin"  # Remplacez par le chemin de votre fichier
document_text = read_text_file(file_path)

# Diviser le texte en segments (par exemple, 1000 caractères par segment)
segments = split_text_by_characters(document_text, max_length=1000)

# Afficher le nombre de segments créés et un extrait du premier segment
print(f"Nombre de segments créés : {len(segments)}")
print("Extrait du premier segment :", segments[0][:200])  # Afficher un extrait de 200 caractères du premier segment


# WIKIPEDIA READER

In [8]:
import re

def clean_wikipedia_text(text):
    """
    Nettoie un texte Wikipédia :
    - Supprime les balises <ref>, les modèles {{...}} et les balises HTML.
    - Supprime les doubles crochets [[...]] tout en conservant leur contenu.
    - Réduit les espaces multiples.
    """
    text = re.sub(r"<ref[^>]*>.*?</ref>", "", text, flags=re.DOTALL)  # Supprimer les balises <ref>
    text = re.sub(r"{{[^}]*}}", "", text)  # Supprimer les modèles {{...}}
    text = re.sub(r"<[^>]*>", "", text)  # Supprimer les balises HTML
    text = re.sub(r"\[\[([^|\]]+\|)?([^\]]+)\]\]", r"\2", text)  # Supprimer les [[]] tout en conservant le contenu
    text = re.sub(r"\s+", " ", text).strip()  # Réduire les espaces multiples
    return text

def split_text(text, max_length=1000):
    """
    Divise le texte en segments d'une longueur maximale donnée.
    """
    segments = []
    while len(text) > max_length:
        split_index = text[:max_length].rfind(".")
        if split_index == -1:  # Si aucun point trouvé, couper brutalement
            split_index = max_length
        segments.append(text[:split_index+1].strip())
        text = text[split_index+1:].strip()
    if text:  # Ajouter le dernier segment
        segments.append(text)
    return segments

# Exemple d'utilisation
input_file_path = "/Users/enzosebiane/PycharmProjects/BigDataProject/Darwin_data/darwin"  # Chemin vers votre fichier texte

# Lire et nettoyer le texte
with open(input_file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)
segments = split_text(cleaned_text, max_length=500)

# Afficher le nombre de segments créés et un extrait du premier segment
print(f"Nombre de segments créés : {len(segments)}")
print("Extrait du premier segment :", segments[0][:200])  # Afficher un extrait de 200 caractères du premier segment


Nombre de segments créés : 165
Extrait du premier segment : | image = Charles Darwin seated crop.jpg | alt = Three quarter length studio photo showing Darwin's characteristic large forehead and bushy eyebrows with deep set eyes, pug nose and mouth set in a det


# Context (RAG)

In [9]:
import faiss
from sentence_transformers import SentenceTransformer

# Charger le modèle d'embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Convertir les segments en embeddings
embeddings = embedder.encode(segments)

# Créer un index FAISS
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("Index FAISS créé avec", index.ntotal, "documents")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Index FAISS créé avec 165 documents


In [10]:
# Question utilisateur
query =  "Who's Charles Darwin"

query_embedding = embedder.encode([query])

# Recherche dans l'index
k = 2 # Nombre de résultats à récupérer
distances, indices = index.search(query_embedding, k)

# Récupérer les documents pertinents
retrieved_segments = [segments[i] for i in indices[0]]
print("Segments pertinents :", retrieved_segments)


Segments pertinents : ["| caption = Darwin, , when he was preparing ''On the Origin of Species'' | birth_name = Charles Robert Darwin | birth_date = | birth_place = Shrewsbury, Shropshire, England | death_date = | death_place = Down House, Down, Kent, England | resting_place = Westminster Abbey | alma_mater = | known_for = Natural selection | spouse = | children = 10, including William, Henrietta, George, Francis, Leonard and Horace | parents = | family = Darwin–Wedgwood | awards = ; 12 February 1809&nbsp;– 19 April 188", "==Biography== ===Early life and education=== Darwin was born in Shrewsbury, Shropshire, on 12 February 1809, at his family's home, The Mount. He was the fifth of six children of wealthy society doctor and financier Robert Darwin and Susannah Darwin (née Wedgwood). His grandfathers Erasmus Darwin and Josiah Wedgwood were both prominent abolitionists."]


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Charger le modèle LLM
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Définir le token de remplissage
tokenizer.pad_token = tokenizer.eos_token

# Préparer le contexte pour la génération
context = " ".join(retrieved_segments)
prompt = f"Contexte : {context}\n\nQuestion : {query}\n\nRéponse :"

# Tokenisation avec padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

output = model.generate(
    inputs.input_ids,
    max_length = 300,  # Longueur max de la génération
    num_return_sequences=1,  # Nombre de textes générés
    temperature=0.7,  # Contrôle de la créativité
    do_sample=True,  # Active le sampling
    pad_token_id=tokenizer.eos_token_id  # Définit un token de padding
)


# Afficher la réponse générée
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Réponse générée :")
print(response)

Réponse générée :
Contexte : | caption = Darwin,, when he was preparing ''On the Origin of Species'' | birth_name = Charles Robert Darwin | birth_date = | birth_place = Shrewsbury, Shropshire, England | death_date = | death_place = Down House, Down, Kent, England | resting_place = Westminster Abbey | alma_mater = | known_for = Natural selection | spouse = | children = 10, including William, Henrietta, George, Francis, Leonard and Horace | parents = | family = Darwin–Wedgwood | awards = ; 12 February 1809&nbsp;– 19 April 188 ==Biography== ===Early life and education=== Darwin was born in Shrewsbury, Shropshire, on 12 February 1809, at his family's home, The Mount. He was the fifth of six children of wealthy society doctor and financier Robert Darwin and Susannah Darwin (née Wedgwood). His grandfathers Erasmus Darwin and Josiah Wedgwood were both prominent abolitionists.

Question : Who's Charles Darwin

Réponse : Darwin had two sons: Charles, who was born on 13 April 1809; and Charles, 