# INTRODUCTION TO A RAG-IMPLEMENTATION

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load an open-source model
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


In [5]:
# Add a pad_token_id if necessary
tokenizer.pad_token = tokenizer.eos_token

# Simple prompt
prompt = "Who's Charles Darwin"

# Tokenization with attention_mask and padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Text generation
output = model.generate(
    inputs.input_ids,
    max_length=100,  # Maximum length of the generated text
    num_return_sequences=1,  # Number of text sequences generated
    temperature=0.7,  # Controls the creativity of the model
    do_sample=True,  # Enables sampling
    pad_token_id=tokenizer.eos_token_id  # Defines a padding token
)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated text:")
print(generated_text)


Texte généré :
Who's Charles Darwin?"

"I don't know."

"Not exactly. But I do know that if you wanted to be a man, you'd have to learn the language. But you'd be the most difficult man to learn. So you're going to learn to speak English."

"English?"

"Yes. And I'm going to learn how to read, how to write, how to read and write."

"Why are you so interested in


# PDF READER


In [6]:
from PyPDF2 import PdfReader

# Load the PDF
pdf_path = "/data/health_systems/Psychology_is_improving_brain_health_and_aging.pdf"
reader = PdfReader(pdf_path)

# Extract all the text
document_text = ""
for page in reader.pages:
    document_text += page.extract_text()

# Preview the extracted text
print("Extracted text (preview):", document_text[:500])

# Print the total character count
print("Total character count:", len(document_text))

def split_text(text, max_length=100):
    sentences = text.split(". ")
    chunks, chunk = [], []
    current_length = 0

    for sentence in sentences:
        chunk.append(sentence)
        current_length += len(sentence.split())
        if current_length >= max_length:
            chunks.append(". ".join(chunk))
            chunk, current_length = [], 0

    if chunk:
        chunks.append(". ".join(chunk))
    return chunks

segments = split_text(document_text)
print("Number of segments:", len(segments))


FileNotFoundError: [Errno 2] No such file or directory: '/data/health_systems/Psychology_is_improving_brain_health_and_aging.pdf'

In [None]:
def read_text_file(file_path):
    """Read the content of a text file"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def split_text_by_characters(text, max_length=1000):
    """Split the text into chunks of maximum length (in characters)"""
    chunks = []
    start = 0
    while start < len(text):
        end = start + max_length
        chunks.append(text[start:end])
        start = end
    return chunks

# Example usage with a text file
file_path = "/data/Darwin_data/darwin"  # Replace with the path to your file
document_text = read_text_file(file_path)

# Split the text into chunks (e.g., 1000 characters per chunk)
segments = split_text_by_characters(document_text, max_length=1000)

# Display the number of segments created and a preview of the first segment
print(f"Number of segments created: {len(segments)}")
print("Preview of the first segment:", segments[0][:200])  # Display a preview of 200 characters from the first segment


# WIKIPEDIA READER

In [8]:
import re

def clean_wikipedia_text(text):
    """
    Cleans Wikipedia text:
    - Removes <ref> tags, {{...}} templates, and HTML tags.
    - Removes [[...]] while keeping their content.
    - Reduces multiple spaces.
    """
    text = re.sub(r"<ref[^>]*>.*?</ref>", "", text, flags=re.DOTALL)  # Remove <ref> tags
    text = re.sub(r"{{[^}]*}}", "", text)  # Remove {{...}} templates
    text = re.sub(r"<[^>]*>", "", text)  # Remove HTML tags
    text = re.sub(r"\[\[([^|\]]+\|)?([^\]]+)\]\]", r"\2", text)  # Remove [[]] while keeping the content
    text = re.sub(r"\s+", " ", text).strip()  # Reduce multiple spaces
    return text

def split_text(text, max_length=1000):
    """
    Splits the text into segments with a given maximum length.
    """
    segments = []
    while len(text) > max_length:
        split_index = text[:max_length].rfind(".")
        if split_index == -1:  # If no period is found, cut abruptly
            split_index = max_length
        segments.append(text[:split_index+1].strip())
        text = text[split_index+1:].strip()
    if text:  # Add the last segment
        segments.append(text)
    return segments

# Example usage
input_file_path = "/Users/enzosebiane/PycharmProjects/BigDataProject/Darwin_data/darwin"  # Path to your text file

# Read and clean the text
with open(input_file_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

cleaned_text = clean_wikipedia_text(raw_text)
segments = split_text(cleaned_text, max_length=500)

# Display the number of segments created and a preview of the first segment
print(f"Number of segments created: {len(segments)}")
print("Preview of the first segment:", segments[0][:200])  # Display a preview of 200 characters from the first segment


Nombre de segments créés : 165
Extrait du premier segment : | image = Charles Darwin seated crop.jpg | alt = Three quarter length studio photo showing Darwin's characteristic large forehead and bushy eyebrows with deep set eyes, pug nose and mouth set in a det


# Context (RAG)

In [9]:
import faiss
from sentence_transformers import SentenceTransformer

# Load the embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Convert the segments into embeddings
embeddings = embedder.encode(segments)

# Create a FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # Using L2 distance (Euclidean distance) for similarity
index.add(embeddings)

print("FAISS index created with", index.ntotal, "documents")  # Display the number of documents in the index



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Index FAISS créé avec 165 documents


In [10]:
# User query
query = "Who's Charles Darwin"

# Convert the query into an embedding
query_embedding = embedder.encode([query])

# Perform the search in the index
k = 2  # Number of results to retrieve
distances, indices = index.search(query_embedding, k)

# Retrieve the relevant segments based on the indices
retrieved_segments = [segments[i] for i in indices[0]]
print("Relevant segments:", retrieved_segments)  # Display the retrieved relevant segments



Segments pertinents : ["| caption = Darwin, , when he was preparing ''On the Origin of Species'' | birth_name = Charles Robert Darwin | birth_date = | birth_place = Shrewsbury, Shropshire, England | death_date = | death_place = Down House, Down, Kent, England | resting_place = Westminster Abbey | alma_mater = | known_for = Natural selection | spouse = | children = 10, including William, Henrietta, George, Francis, Leonard and Horace | parents = | family = Darwin–Wedgwood | awards = ; 12 February 1809&nbsp;– 19 April 188", "==Biography== ===Early life and education=== Darwin was born in Shrewsbury, Shropshire, on 12 February 1809, at his family's home, The Mount. He was the fifth of six children of wealthy society doctor and financier Robert Darwin and Susannah Darwin (née Wedgwood). His grandfathers Erasmus Darwin and Josiah Wedgwood were both prominent abolitionists."]


In [12]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the LLM model
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define the padding token
tokenizer.pad_token = tokenizer.eos_token

# Prepare the context for generation
context = " ".join(retrieved_segments)
prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

# Tokenize with padding
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

output = model.generate(
    inputs.input_ids,
    max_length=300,  # Maximum length of the generated output
    num_return_sequences=1,  # Number of generated texts
    temperature=0.7,  # Controls the creativity of the output
    do_sample=True,  # Enables sampling
    pad_token_id=tokenizer.eos_token_id  # Define the padding token ID
)

# Display the generated answer
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated answer:")
print(response)


Réponse générée :
Contexte : | caption = Darwin,, when he was preparing ''On the Origin of Species'' | birth_name = Charles Robert Darwin | birth_date = | birth_place = Shrewsbury, Shropshire, England | death_date = | death_place = Down House, Down, Kent, England | resting_place = Westminster Abbey | alma_mater = | known_for = Natural selection | spouse = | children = 10, including William, Henrietta, George, Francis, Leonard and Horace | parents = | family = Darwin–Wedgwood | awards = ; 12 February 1809&nbsp;– 19 April 188 ==Biography== ===Early life and education=== Darwin was born in Shrewsbury, Shropshire, on 12 February 1809, at his family's home, The Mount. He was the fifth of six children of wealthy society doctor and financier Robert Darwin and Susannah Darwin (née Wedgwood). His grandfathers Erasmus Darwin and Josiah Wedgwood were both prominent abolitionists.

Question : Who's Charles Darwin

Réponse : Darwin had two sons: Charles, who was born on 13 April 1809; and Charles, 