<a href="https://colab.research.google.com/github/Daria-Mir/dream_islands_RAG/blob/main/RAG_Dream_Islands_ENG_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step 1: Install and import all necessary libraries**

In [None]:
!pip install PyMuPDF

In [None]:
!pip install transformers sentence-transformers faiss-cpu

In [None]:
import numpy as np #for numerical operations
import os, os.path as osp #for loading knowledge base files
import pymupdf as pf #for operations with pdf files

#Libraries for data cleaning
import re
import string
import nltk
from nltk.corpus import stopwords

#Libraries for using Hugging Face (chunking, embeddings, vector database, RAG pipeline)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

#Libraries for tokenization and embeddings
from sentence_transformers import SentenceTransformer, util
from transformers import T5Tokenizer
from transformers import AutoTokenizer, pipeline

#Libraries for the Generation part (Gemini)
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai

# **Step 2: Get access to the GitHub repository where all files for the company RAG knowledge base are saved**

In [None]:
# Clone the GitHub repository and check out its 'data' branch.
# git creates the folder `dream_islands_RAG` inside the current working directory;
# the following cells expect to find it at /content/dream_islands_RAG.
!git clone --branch data https://github.com/Daria-Mir/dream_islands_RAG

In [None]:
# Folder that holds the cloned repository (checked out on the 'data' branch)
data_dir = '/content/dream_islands_RAG'
# List its contents; as the last expression of the cell, the result is displayed
os.listdir(data_dir)

# **Step 3: Check that files of all formats (.txt, .csv, .pdf) are accessible in Colab**

In [None]:
# Read a .txt file.
# The encoding is stated explicitly so the result does not depend on the
# default encoding of the machine the notebook runs on.
txt_file_path = os.path.join(data_dir, 'islands.txt')
with open(txt_file_path, 'r', encoding='utf-8') as file:
    text_content = file.read()

print(text_content[:500])  # Display the first 500 characters

In [None]:
# Read a .pdf file
pdf_file_path = os.path.join(data_dir, 'dream_islands_intro.pdf')

# Open the document in a `with` block so it is closed again after the text is read
with pf.open(pdf_file_path) as doc:
    # Page indices are zero-based: doc[1] is the SECOND page of the PDF
    pdf_text = doc[1].get_text()

print(pdf_text[:1500])  # Display the first 1500 characters of the text

# **Step 4: Initialize the necessary tools**


*   Text Splitter for chunking
*   Tokenizer
*   Embeddings with Hugging Face






In [None]:
# Text splitter for chunking.
# clean_text() collapses every whitespace run (including newlines) into a single
# space, so the splitter's default "\n\n" separator would never match and each
# document would stay one oversized chunk. Splitting on spaces lets chunk_size
# and chunk_overlap (both measured in characters) actually take effect.
text_splitter = CharacterTextSplitter(separator=" ", chunk_size=500, chunk_overlap=50)

# Tokenizer used to store the chunks as token IDs and to decode them back to text
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Sentence-embedding model wrapper used to embed the chunk texts
huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


# **Step 5: Create the knowledge base**

In [None]:
# Knowledge base paths: the folder with the source files and the file names,
# grouped by format. The names are joined with data_dir when the files are opened.
data_dir = '/content/dream_islands_RAG'

# Plain-text sources
txt_files = [
    'di_welness_art.txt',
    'islands.txt',
    'top_20_reatreats.txt',
    'di_fasting_art.txt',
    'di_workshops_description.txt',
    'schedule_tanya.txt',
]

# PDF sources
pdf_files = [
    'activities.pdf',
    'dream_islands_intro.pdf',
    'tanya.pdf',
]

In [None]:
# Create an empty knowledge base.
# The processing cell further down fills it with one list of token IDs
# (the output of tokenizer.encode) per text chunk.
knowledge_base = []

# **Step 6: Pre-processing of files for the knowledge base**

1.   Cleaning
2.   Chunking
3.   Tokenization
4.   Adding all files in different formats to the knowledge base








In [None]:
# Function to clean the text data
nltk.download('stopwords')

# Built once at cell level: the stop-word list and the punctuation table never
# change, so there is no reason to rebuild them on every clean_text() call.
_STOP_WORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def clean_text(text):
    """Normalize raw document text before it is chunked.

    The steps run in this order: lowercase, remove HTML-like tags, delete ASCII
    punctuation, collapse whitespace runs into single spaces, trim, and finally
    drop English stop words.

    Args:
        text: Raw text of one document.

    Returns:
        The cleaned text as one string of space-separated words.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.translate(_PUNCTUATION_TABLE)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = text.strip()  # Strip leading/trailing spaces

    # Remove stop words
    return ' '.join(word for word in text.split() if word not in _STOP_WORDS)

In [None]:
# Clean a document and cut the cleaned text into chunks
def preprocess_and_chunk(doc_text):
    """Run clean_text() on the document and split the result with text_splitter.

    Args:
        doc_text: Raw text of one document.

    Returns:
        The value returned by text_splitter.split_text() for the cleaned text.
    """
    return text_splitter.split_text(clean_text(doc_text))

In [None]:
# Function to tokenize the chunks
def tokenize_and_chunk(doc):
    """Clean and chunk a document, then encode every chunk into token IDs.

    Cleaning and chunking are delegated to preprocess_and_chunk(), so this
    helper follows the same rules as the cell that builds the knowledge base.

    Args:
        doc: Raw text of one document.

    Returns:
        A list with one entry per chunk. Each entry is a single-element list
        that wraps the chunk's token IDs (encoded without special tokens).
    """
    chunks = preprocess_and_chunk(doc)  # Clean the text and split it into chunks
    return [[tokenizer.encode(chunk, add_special_tokens=False)] for chunk in chunks]  # Tokenize each chunk

In [None]:
# Start from an empty list so that re-running this cell rebuilds the knowledge
# base instead of appending every chunk a second time
knowledge_base = []

# Read and process each txt file
for file_name in txt_files:
    # Explicit encoding: do not depend on the platform's default
    with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as txt_file:
        doc_text = txt_file.read()
    chunks = preprocess_and_chunk(doc_text)
    knowledge_base.extend(tokenizer.encode(chunk) for chunk in chunks)

# Read and process each pdf file
for pdf_file in pdf_files:
    # The `with` block closes the PDF once the text of all pages has been read
    with pf.open(os.path.join(data_dir, pdf_file)) as pdf_doc:
        pdf_text = "\n".join(page.get_text() for page in pdf_doc)
    chunks = preprocess_and_chunk(pdf_text)
    knowledge_base.extend(tokenizer.encode(chunk) for chunk in chunks)

In [None]:
# Check the first few entries of the knowledge base.
# Each entry is the list of token IDs returned by tokenizer.encode() for one
# chunk, so the output is lists of integers rather than readable text.
print(knowledge_base[:5])  # Display the first 5 entries

# **Step 7: Embedding with Hugging Face**

In [None]:
# Generate embeddings for the tokenized chunks

# Turn every token-ID list back into its chunk text
texts = [tokenizer.decode(token_ids) for token_ids in knowledge_base]

# Embed all chunk texts with the HuggingFaceEmbeddings object
embeddings = huggingface_embeddings.embed_documents(texts)

# Pair each text with its embedding
text_embedding_pairs = list(zip(texts, embeddings))

In [None]:
# Create a FAISS vector store from the precomputed (text, embedding) pairs.
# The embeddings object is passed as well -- presumably so the store can embed
# queries later (TODO confirm against the LangChain FAISS documentation).
# NOTE(review): the RAGSystem class defined below runs its own cosine-similarity
# search and does not reference faiss_index.
faiss_index = FAISS.from_embeddings(text_embedding_pairs, huggingface_embeddings)

# **Step 8: Creating RAG Pipeline with Gemini Generation**

In [None]:
# Initialize Google AI Studio API
# The key is never written into the notebook: it is read from the
# GOOGLE_API_KEY environment variable and, if that is not set, requested
# interactively without being echoed to the output.
import getpass

api_key = os.environ.get("GOOGLE_API_KEY", "").strip()
if not api_key:
    api_key = getpass.getpass("Enter your Google AI Studio API key: ").strip()
if not api_key:
    raise ValueError("Please set GOOGLE_API_KEY environment variable")

In [None]:
class RAGSystem:
    """Minimal retrieval-augmented generation pipeline.

    Retrieval embeds the knowledge-base chunks with a SentenceTransformer model
    and ranks them by cosine similarity to the query. Generation sends the
    best-matching chunks together with the query to a Gemini model.
    """

    def __init__(self, api_key, model_name="gemini-1.5-flash", sentences=None):
        """Load the embedding model, embed the chunks and configure Gemini.

        Args:
            api_key: Google AI Studio API key, passed to genai.configure().
            model_name: Name of the Gemini model used for generation.
            sentences: Tokenized chunks (lists of token IDs) to search in.
                Defaults to the notebook-level ``knowledge_base``.
        """
        print("Initializing RAG system...")
        # Retrieval configuration
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.sentences = knowledge_base if sentences is None else sentences
        # Decode the token IDs once and keep the texts, so retrieve() does not
        # have to decode the same chunks again for every query
        self.texts = [tokenizer.decode(chunk) for chunk in self.sentences]
        self.embeddings = self.model.encode(self.texts, convert_to_tensor=True)

        # Gemini configuration
        genai.configure(api_key=api_key)

        # Generation configurations
        generation_config = {
            "temperature": 0.7,
            "max_output_tokens": 2048,
        }

        # Safety configurations
        safety_settings = [
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
        ]

        # Gemini model initialization
        self.llm = genai.GenerativeModel(
            model_name=model_name,
            generation_config=generation_config,
            safety_settings=safety_settings
        )

        print("RAG system successfully initialized!")

    def retrieve(self, query, k=3):
        """Retrieve the top k most relevant documents.

        Args:
            query: The user's question.
            k: Maximum number of chunks to return.

        Returns:
            A list of dicts with the keys 'text' (chunk text) and 'similarity'
            (cosine similarity to the query), best match first. The list is
            empty when k is smaller than 1 or there are no chunks.
        """
        # With k == 0 the slice [-k:] below would select EVERY chunk, and a
        # negative k would select an arbitrary tail, so stop here instead
        if k < 1 or not self.texts:
            return []

        query_embedding = self.model.encode([query], convert_to_tensor=True)
        # util.cos_sim works directly on the tensors. sklearn's
        # cosine_similarity needs NumPy input and fails when the embeddings
        # are stored on a GPU.
        similarities = util.cos_sim(query_embedding, self.embeddings)[0].cpu().numpy()
        # argsort is ascending: take the last k indices and reverse them
        top_k_indices = similarities.argsort()[-k:][::-1]

        return [
            {'text': self.texts[idx], 'similarity': similarities[idx]}
            for idx in top_k_indices
        ]

    def generate(self, query, retrieved_docs):
        """Generate a response using Gemini.

        Args:
            query: The user's question.
            retrieved_docs: Dicts with a 'text' key, as returned by retrieve().

        Returns:
            The text of the model's reply, or a short notice when the reply
            contains no text.
        """
        # Build the prompt with the retrieved context
        context = "\n".join(doc['text'] for doc in retrieved_docs)

        prompt = f"""
CONTEXT:
{context}

USER QUERY:
{query}

INSTRUCTIONS:
Generate a response to the user's question using ONLY the information provided in the context above.
If some information is not present in the context, do not invent it.
Provide a clear and well-structured response.
"""

        # Generate the response
        response = self.llm.generate_content(prompt)
        try:
            return response.text
        except ValueError:
            # The `text` accessor raises ValueError when the reply holds no
            # text part, e.g. because the safety settings blocked it
            return "No response was generated (the request may have been blocked by the safety settings)."

    def query(self, user_query, k=3):
        """Complete RAG process: retrieval + generation.

        Args:
            user_query: The user's question.
            k: Number of chunks to retrieve as context.

        Returns:
            A dict with the keys 'response' (generated answer) and
            'retrieved_docs' (the chunks used as context).
        """
        print("1. Retrieving relevant documents...")
        retrieved_docs = self.retrieve(user_query, k)

        print("2. Generating response...")
        response = self.generate(user_query, retrieved_docs)

        return {
            'response': response,
            'retrieved_docs': retrieved_docs
        }

def print_full_results(results):
    """Print the retrieved documents with their scores, then the generated answer.

    Args:
        results: Dict with the keys 'retrieved_docs' (dicts holding 'text' and
            'similarity') and 'response', as returned by RAGSystem.query().
    """
    divider = "-" * 80

    print("\nRetrieved Documents:")
    print(divider)
    rank = 0
    for doc in results['retrieved_docs']:
        rank += 1
        print(f"{rank}. [Score: {doc['similarity']:.3f}] {doc['text']}")

    print("\nGenerated Response:")
    print(divider)
    print(results['response'])

# **Step 9: Use the RAG**

In [None]:
# Initialize the system: RAGSystem.__init__ loads the sentence-embedding model,
# embeds the decoded knowledge-base chunks and configures the Gemini model
rag = RAGSystem(api_key)

In [None]:
# Ask questions interactively until the user types 'exit'
while True:
    # Strip surrounding whitespace so that e.g. 'exit ' is still recognised
    user_query = input("Please ask a question (or type 'exit' to stop): ").strip()

    # Ignore blank input instead of sending an empty question to the model
    if not user_query:
        continue

    if user_query.lower() == 'exit':
        print("Exiting the system...")
        break

    print("-" * 60)
    # rag.query() returns a dict with the answer and the retrieved chunks
    response = rag.query(user_query)
    print_full_results(response)