# RAG System

In [1]:
import pandas
import numpy
import torch
import torchvision
import transformers
import faiss
import fitz
import sentence_transformers

print(pandas.__version__)
print(numpy.__version__)
print(torch.__version__)
print(torchvision.__version__)
print(transformers.__version__)
print(faiss.__version__)
print(fitz.__version__)
print(sentence_transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


2.2.3
2.2.3
2.6.0+cpu
0.21.0+cpu
4.48.3
1.10.0
1.25.3
3.4.1


## Data Preparation

### Process CSV data

Load CSV data and convert it into chunks

In [4]:
import pandas as pd

df = pd.read_csv('synthetic_data.csv')

# Combine relevant columns into a single string for each row (representing a "document")
df['data_str'] = df.apply(lambda row: f"transaction_id: {row['transaction_id']}, username: {row['username']}, email: {row['email']}, amount_spent: {row['amount_spent']}, transaction_status: {row['transaction_status']}", axis=1)

# Convert the 'data_str' column into a list for embedding
csv_data = df['data_str'].tolist()


In [5]:
df.head(5)

Unnamed: 0,transaction_id,username,email,transaction_date,payment_method,amount_spent,transaction_status,delivery_address,product_category,discount_applied,data_str
0,1,Christina Richards,thood@example.org,2025-01-14,VISA 16 digit,34203042,Failed,"02442 Richard Causeway Apt. 776\nGarzaport, NJ...",Electronics,51,"transaction_id: 1, username: Christina Richard..."
1,2,Jennifer Joseph,anne04@example.org,2025-02-14,JCB 15 digit,358924705,Pending,"439 Daniel Freeway\nTamimouth, SC 02498",Electronics,8,"transaction_id: 2, username: Jennifer Joseph, ..."
2,3,Colin Figueroa,alyssaphelps@example.com,2025-01-20,VISA 13 digit,537,Pending,"054 Barrett Mission Apt. 026\nLake Steven, OR ...",Groceries,6,"transaction_id: 3, username: Colin Figueroa, e..."
3,4,James Wilson,laura71@example.net,2025-02-08,JCB 15 digit,108,Failed,"82063 Dixon Manors Apt. 675\nDavidmouth, WA 91132",Groceries,27,"transaction_id: 4, username: James Wilson, ema..."
4,5,Briana Brooks,charles58@example.org,2025-01-31,JCB 16 digit,741,Pending,"98713 Wanda Port\nParkerton, LA 20424",Electronics,26,"transaction_id: 5, username: Briana Brooks, em..."


In [6]:
csv_data

['transaction_id: 1, username: Christina Richards, email: thood@example.org, amount_spent: 34203042, transaction_status: Failed',
 'transaction_id: 2, username: Jennifer Joseph, email: anne04@example.org, amount_spent: 358924705, transaction_status: Pending',
 'transaction_id: 3, username: Colin Figueroa, email: alyssaphelps@example.com, amount_spent: 537, transaction_status: Pending',
 'transaction_id: 4, username: James Wilson, email: laura71@example.net, amount_spent: 108, transaction_status: Failed',
 'transaction_id: 5, username: Briana Brooks, email: charles58@example.org, amount_spent: 741, transaction_status: Pending',
 'transaction_id: 6, username: Samantha Hartman, email: turnerdanielle@example.com, amount_spent: 711827381, transaction_status: Completed',
 'transaction_id: 7, username: Cassie Le, email: leejenna@example.com, amount_spent: 30, transaction_status: Completed',
 'transaction_id: 8, username: Matthew Smith, email: amandastafford@example.net, amount_spent: 234, tra

### Process PDF data

In [7]:
# Extract text from the PDF
pdf_path = 'Fitria Zusni Farida_synthetic_data.pdf'
doc = fitz.open(pdf_path)
pdf_text = ""

for page in doc:
    pdf_text += page.get_text("text")  # Extract text from each page

# Split the PDF text into chunks (for example, splitting by paragraphs)
pdf_chunks = pdf_text.split('\n\n')  # This splits by blank lines (adjust as needed)


In [8]:
pdf_text

' \n \nBuku Petunjuk Penggunaan \nRoboGen X2000 \nTeknologi terkini untuk rumah tangga yang lebih efisien \n \n \n \n \n \n \n \n \n \n1 \nDAFTAR ISI \n \nBagian 1: Pengenalan \nBagian 2: Fitur \nBagian 3: Spesifikasi \nBagian 4: Panduan instalasi \nBagian 5: Petunjuk penggunaan \nBagian 6: Perawatan dan pembersihan \nBagian 7: Petunjuk keselamatan \nBagian 8: Pertanyaan dan Jawaban \nBagian 9: Penyelesaian masalah \nBagian 10: Garansi dan dukungan \n \n \nPengenalan \n \nRoboGen X2000 adalah robot otomatis \ncanggih yang dirancang untuk memberikan solusi \npraktis dan efisien dalam kehidupan sehari-hari \nAnda. Dengan teknologi terbaru yang dipadukan \ndengan kecerdasan buatan, RoboGen X2000 \nmampu melakukan berbagai tugas rumah tangga \nsecara mandiri, mulai dari pembersihan rumah \nhingga \npengawasan \nkeamanan \ndengan \nkemampuan yang tidak hanya cepat, tetapi juga \nakurat.  \n \nRoboGen X2000 dilengkapi dengan sistem \nnavigasi canggih yang memungkinkan robot ini \nuntuk berge

## Vectorization (Embeddings)

### Create Embeddings

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "bert-base-uncased" 
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):

    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

csv_embeddings = [get_embedding(row) for row in csv_data]
pdf_embeddings = [get_embedding(chunk) for chunk in pdf_chunks]

print("Embeddings generated for CSV and PDF data.")


Embeddings generated for CSV and PDF data.


In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained embedding model (you can replace with Llama or Gemma 9B)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed CSV data (rows from the transaction log)
csv_embeddings = model.encode(csv_data)

# Embed PDF data (chunks from the PDF text)
pdf_embeddings = model.encode(pdf_chunks)

print("Embeddings generated for CSV and PDF data.")


### Store embeddings in FAISS

In [None]:
csv_embeddings_np = np.array(csv_embeddings)
pdf_embeddings_np = np.array(pdf_embeddings)

csv_index = faiss.IndexFlatL2(csv_embeddings_np.shape[1])
csv_index.add(csv_embeddings_np)

# Create a FAISS index for the PDF embeddings
pdf_index = faiss.IndexFlatL2(pdf_embeddings_np.shape[1])
pdf_index.add(pdf_embeddings_np)

# Save FAISS indexes
faiss.write_index(csv_index, "csv_transaction_log.index")
faiss.write_index(pdf_index, "pdf_user_manual.index")

print("FAISS indexes created and saved.")


FAISS indexes created and saved.


In [15]:
csv_embeddings_np = np.array(csv_embeddings)
pdf_embeddings_np = np.array(pdf_embeddings)

# Create FAISS index for CSV and PDF embeddings (L2 distance index)
csv_index = faiss.IndexFlatL2(csv_embeddings_np.shape[1])  # Use the shape of the embeddings
csv_index.add(csv_embeddings_np)  # Add the CSV embeddings to the index

pdf_index = faiss.IndexFlatL2(pdf_embeddings_np.shape[1])  # Create a separate index for PDF
pdf_index.add(pdf_embeddings_np)  # Add PDF embeddings to the index

# Save FAISS indexes to files (optional but useful)
faiss.write_index(csv_index, "csv_index.faiss")
faiss.write_index(pdf_index, "pdf_index.faiss")

print("FAISS indexes created and saved.")

FAISS indexes created and saved.


### Query the vector store (FAISS)

In [14]:
# Function to query FAISS index and retrieve the most relevant chunks
def query_faiss(query, index, top_k=5):
    # Embed the query using the same model
    query_embedding = model.encode([query])
    
    # Search for the top K closest vectors in the FAISS index
    distances, indices = index.search(np.array(query_embedding), top_k)
    
    return indices, distances

# Example query to search in the CSV index (transaction logs)
query = "What is the transaction status of John Doe?"
indices, distances = query_faiss(query, csv_index)

# Retrieve the top K most relevant rows from the DataFrame
relevant_data = df.iloc[indices[0]]
print("Relevant Data from CSV for query:", query)
print(relevant_data)


AttributeError: 'BertModel' object has no attribute 'encode'

In [None]:
# Perform similarity search

def query_faiss(query, index, top_k=5):

    """
    query: The input query, in the form of a string
    index: The FAISS index where the embeddings of data 
    top_k: This specifies how many top similar results you want to retrieve. 
    """
    # Convert query to embedding
    query_embedding = get_embedding(query).reshape(1, -1)  # Ensure it's a 2D array for FAISS
    
    # Perform the search in FAISS
    distances, indices = index.search(query_embedding, top_k)  # Get top_k closest results
    
    return indices, distances

# Example query: Retrieve relevant CSV data based on a query
query = "What is the transaction status of James Wilson?"
indices, distances = query_faiss(query, csv_index)

# Get the relevant rows (documents) from your dataframe based on the indices
relevant_data = df.iloc[indices[0]]
print("Relevant Data from CSV for query:", query)
print(relevant_data)


Relevant Data from CSV for query: What is the transaction status of James Wilson?
     transaction_id                      username                     email  \
13               14   Mr. Christopher Lambert DDS  ericksoncole@example.net   
169             170                 Mr. Alec Pena       wguerra@example.com   
422             423  Dr. Christopher Castaneda IV    leemichael@example.net   
27               28          Mr. Christian Graves     stephen77@example.org   
187             188                  Thomas Smith  kylehamilton@example.com   

    transaction_date    payment_method  amount_spent transaction_status  \
13        2025-01-31  American Express             8             Failed   
169       2025-01-12     VISA 16 digit            23            Pending   
422       2025-02-04     VISA 16 digit           106          Completed   
27        2025-01-11     VISA 16 digit           243            Pending   
187       2025-01-04      JCB 16 digit        521736            Pend

🔍Observation:
- Not exact match: It’s important to note that the search didn’t return the row with "James Wilson" in it. Instead, it returned rows that are "semantically" closest to the query based on their embeddings, which means that embeddings generated from different transactions might have similar patterns or relationships (in terms of transaction amount, date, or other features).
- Transaction status: Despite searching for "James Wilson," FAISS returned rows with different transaction statuses (Failed, Pending, Completed), not just those relevant to "James Wilson." This indicates that the embeddings of these rows are close to the query's embedding but do not exactly match in terms of content.
- Query relavance: This could be due to how the BERT model has embedded the data or how the query is represented. If the embeddings of other rows are more similar to the query's embedding than James Wilson's row, FAISS will return those results.

## Database Setup for Vector Store

## Setting up Retrieval System

In [None]:
from transformers import pipeline
# Load the Llama 8B model (replace with actual model in Hugging Face or your local setup)
qa_pipeline = pipeline("question-answering", model="huggingface/llama-8b")

# Prepare the context (retrieved data from CSV)
context = "transaction_id: 1, username: John Doe, email: john@example.com, amount_spent: 100, transaction_status: Completed"

# Ask the question
question = "What is the transaction status of John Doe?"

# Get the answer from the model
answer = qa_pipeline(question=question, context=context)

print("Answer from Llama 8B:", answer['answer'])


## Integrating with LlaMa 8B

In [17]:
from transformers import pipeline

# Load a pre-trained model (use Llama 8B, Gemma 9B, or any other model of your choice)
qa_pipeline = pipeline("question-answering", model="huggingface/llama-8b")

# Retrieve the context from the relevant CSV row or PDF chunk
context = "transaction_id: 1, username: John Doe, email: john@example.com, amount_spent: 100, transaction_status: Completed"

# Ask the question
question = "What is the transaction status of John Doe?"

# Get the answer from the model
answer = qa_pipeline(question=question, context=context)

print("Answer from Llama 8B:", answer['answer'])


OSError: huggingface/llama-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

## Improving Retrieval Process

## Benchmarking Questions

In [None]:
questions = [
    "What is the transaction status for user John Doe in the month of February?",
    "How much did user Alice spend on electronics?",
    "Which payment method was used for transaction ID 105?",
    "What products were ordered by user Bob in the past week?",
    "Did user Jane receive a discount for her last purchase?",
    "Show me all transactions for the 'Home Appliances' category.",
    "What is the most common payment method used by customers?",
    "Which users made purchases of over $300?",
    "What are the delivery addresses for the top 5 transactions by amount spent?",
    "How many 'Completed' transactions were made between January 1st and 15th?"
]

# Loop through the questions and print the answers
for question in questions:
    indices, distances = query_faiss(question, csv_index)
    context = df.iloc[indices[0]]['data_str']  # Get relevant data
    answer = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"A: {answer['answer']}")
    print("-" * 50)
