In [None]:
%pip install -U langchain
%pip install -U langchain-core
%pip install -U langchain-community
%pip install -U langchain-text-splitters
%pip install -U pypdf
%pip install -U pymupdf
%pip install -U faiss-cpu
%pip install -U chromadb
%pip install -U sentence-transformers


In [None]:
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

from typing import Any, List
from sqlite3 import connect
from datetime import datetime 

import numpy as np
import pandas as pd
import chromadb
import os
import uuid

In [None]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Cut documents into overlapping, character-counted chunks.

    Args:
        documents: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunk documents produced by the splitter.

    Side effects:
        Prints how many chunks were produced and, when there is at least one,
        a preview (first 200 characters plus metadata) of the first chunk.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        # Coarsest separator first: paragraph, line, word, single character.
        "separators": ["\n\n", "\n", " ", ""],
    }
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunks:
        return chunks

    preview = chunks[0]
    print("\nExample chunk:")
    print(f"Content: {preview.page_content[:200]}...")
    print(f"Metadata: {preview.metadata}")

    return chunks

class EmbeddingManager:
    """Loads a SentenceTransformer model once and uses it to embed texts.

    Attributes:
        model_name: Name passed to ``SentenceTransformer`` when loading.
        model: The loaded model, or ``None`` while nothing is loaded.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Remember ``model_name`` and load the model immediately.

        Raises:
            Exception: Whatever ``SentenceTransformer`` raises while loading.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model; on failure print the error and re-raise it."""
        try:
            self.model = SentenceTransformer(self.model_name)
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            raise

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``model.encode`` returns for ``texts`` (a progress bar is
            requested via ``show_progress_bar=True``).

        Raises:
            ValueError: If no model is loaded (``self.model`` is ``None``).
        """
        # Identity check rather than truthiness: a model object may define
        # __len__/__bool__ (SentenceTransformer is a container of modules), so
        # "not self.model" is not a reliable "nothing loaded" test.
        if self.model is None:
            raise ValueError("Model not loaded")

        return self.model.encode(texts, show_progress_bar=True)

class VectorStore:
    """A persistent ChromaDB collection that stores documents with their embeddings.

    Attributes:
        collection_name: Name of the Chroma collection that is opened or created.
        persist_directory: Directory handed to ``chromadb.PersistentClient``.
        client: The Chroma client (``None`` until initialised).
        collection: The Chroma collection (``None`` until initialised).
    """

    def __init__(self, collection_name: str, persist_directory: str):
        """Store the settings and open (or create) the collection right away."""
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the directory, connect the client and get or create the collection.

        Prints the collection name and its current record count. Any failure
        is printed and re-raised.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={
                    "description": "Document embeddings for RAG"
                }
            )
            print(f"Vector store initialized. Collection: {self.collection_name}")
            print(f"Existing documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def _max_batch_size(self, fallback: int) -> int:
        """Return how many records to send per ``collection.add`` call.

        Uses ``client.get_max_batch_size()`` when the client offers it and it
        yields a positive integer; otherwise ``fallback`` (i.e. one single call).
        """
        getter = getattr(self.client, "get_max_batch_size", None)
        if callable(getter):
            try:
                limit = getter()
            except Exception:
                # Best effort only: an unknown limit means "do not batch".
                limit = None
            if isinstance(limit, int) and limit > 0:
                return min(limit, fallback)
        return fallback

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """Add documents and their embeddings to the collection.

        Args:
            documents: Objects exposing ``page_content`` and ``metadata``.
            embeddings: One vector per document, in the same order. NumPy rows
                as well as plain Python sequences are accepted.

        Raises:
            ValueError: If the number of documents and embeddings differ.
            Exception: Any error raised by ``collection.add`` is printed and
                re-raised. Batches written before the failure stay in the
                collection.

        Notes:
            Every call generates new random ids, so adding the same documents
            twice stores them twice.
        """
        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match number of embeddings")

        # An empty batch carries no work; return instead of sending it on.
        if len(documents) == 0:
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            # Copy so the caller's metadata dict is never mutated.
            metadata = dict(doc.metadata or {})
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            documents_text.append(doc.page_content)

            # np.asarray lets list/tuple embeddings through as well as ndarrays.
            embeddings_list.append(np.asarray(embedding).tolist())

        batch_size = self._max_batch_size(len(ids))

        try:
            # Write in slices so a large import stays within the client's limit.
            for start in range(0, len(ids), batch_size):
                stop = start + batch_size
                self.collection.add(
                    ids=ids[start:stop],
                    embeddings=embeddings_list[start:stop],
                    metadatas=metadatas[start:stop],
                    documents=documents_text[start:stop]
                )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")

        except Exception as e:
            print(f"Error adding documents to vector store: {e}")
            raise

class RAGRetriever:
    """Looks up the stored documents closest to a free-text query.

    The query is embedded with the given embedding manager and matched against
    the collection held by the given vector store.
    """

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """Keep references to the store to search and the embedder to use."""
        self.embedding_manager = embedding_manager
        self.vector_store = vector_store

    def retrieve(self, query: str, top_k: int = 5, score_threshold: float = 0.0):
        """Return up to ``top_k`` stored documents matching ``query``.

        Args:
            query: Text to search for.
            top_k: Number of nearest neighbours requested from the collection.
            score_threshold: Minimum ``1 - distance`` a hit needs to be kept.

        Returns:
            A list of dicts with the keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the unfiltered result). The list is empty when nothing matches
            or when the lookup raises; such errors are printed, not propagated.
        """
        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Embedding happens outside the try block, so its errors do propagate.
        query_vector = self.embedding_manager.generate_embeddings([query])[0]

        try:
            query_payload = [query_vector.tolist()]
            response = self.vector_store.collection.query(
                query_embeddings=query_payload,
                n_results=top_k
            )

            hits = []

            # Results are nested per query; only one query was sent, so use slot 0.
            if not (response['documents'] and response['documents'][0]):
                print("No documents found")
                return hits

            texts = response['documents'][0]
            metas = response['metadatas'][0]
            dists = response['distances'][0]
            doc_ids = response['ids'][0]

            for rank, (doc_id, text, meta, dist) in enumerate(zip(doc_ids, texts, metas, dists), start=1):
                # NOTE(review): "1 - distance" is only a similarity in the usual
                # sense if the collection uses a distance bounded by 1; the
                # collection's metric is not set in this file - confirm.
                score = 1 - dist
                if not (score >= score_threshold):
                    continue
                hits.append({
                    'id': doc_id,
                    'content': text,
                    'metadata': meta,
                    'similarity_score': score,
                    'distance': dist,
                    'rank': rank
                })

            print(f"Retrieved {len(hits)} documents (after filtering)")
            return hits

        except Exception as err:
            print(f"Error during retrieval: {err}")
            return []

In [None]:
# Build the pipeline objects: embedder, persistent store for the text messages,
# and the retriever that ties the two together.
embedding_manager = EmbeddingManager()
# NOTE(review): a later cell opens stores under "../data/vector_store/"
# (lower-case "data"); on a case-sensitive filesystem that is a different
# directory from this one - confirm which spelling is intended.
vectorstore = VectorStore(
    collection_name='txt_messages',
    persist_directory='../Data/vector_store',
)
rag_retriever = RAGRetriever(
    vector_store=vectorstore,
    embedding_manager=embedding_manager,
)

In [None]:
# Load the SMS export from $DATA/sms.csv and turn every message into a Document.

def _field_value(cell):
    """Return the value part of a cell that is assumed to look like "key=value".

    Only the first "=" is treated as the separator, so a value that itself
    contains "=" (typically a message body) is kept intact.
    """
    return cell.split("=", 1)[1]


def _format_epoch_ms(value):
    """Format a timestamp as "dd-mm-YYYY HH:MM:SS" in local time.

    The value is divided by 1000 before being handed to
    ``datetime.fromtimestamp``, i.e. it is treated as milliseconds.
    """
    return datetime.fromtimestamp(int(value) // 1000).strftime("%d-%m-%Y %H:%M:%S")


loc = os.getenv('DATA')
if loc is None:
    raise RuntimeError(
        "Environment variable 'DATA' is not set; it must name the directory that contains sms.csv"
    )

# os.path.join works whether or not DATA ends with a path separator.
sms = pd.read_csv(os.path.join(loc, "sms.csv"), header=None)
sms = sms.loc[:, [2, 4, 5, 9, 12, 18]].copy()
sms.columns = ["Address", "Date Sent", "Date Received", "Type", "Body", "Seen"]

# Strip the "key=" prefix from every column. Iterating a DataFrame yields its
# column labels, so all six columns are processed.
for column in sms.columns:
    sms[column] = sms[column].map(_field_value)

sms["Address"] = sms["Address"].replace(regex="^\\+1", value="")
sms["Date Sent"] = sms["Date Sent"].map(_format_epoch_ms)
# "1" means received; every other value is labelled as sent.
sms["Type"] = sms["Type"].map(lambda flag: "Received" if flag == "1" else "Sent")
sms["Seen"] = sms["Seen"].map(lambda flag: "True" if flag == "1" else "False")

# Sent messages reuse their (already formatted) send time as receive time;
# only the other rows have their raw receive timestamp converted.
is_sent = sms["Type"] == "Sent"
sms.loc[~is_sent, "Date Received"] = sms.loc[~is_sent, "Date Received"].map(_format_epoch_ms)
sms.loc[is_sent, "Date Received"] = sms.loc[is_sent, "Date Sent"]

sms_documents = [
    Document(
        page_content=row["Body"],
        metadata={
            "Sender": row["Address"],
            "Date Time Sent": row["Date Sent"],
            "Date Time Received": row["Date Received"],
            "Type": row["Type"],
            "Seen": row["Seen"],
        },
    )
    for row in sms.to_dict("records")
]


In [None]:
# Chunk the messages, embed every chunk and index the chunks.
chunks = split_documents(sms_documents)
texts = [doc.page_content for doc in chunks]
embeddings = embedding_manager.generate_embeddings(texts)
# Store the chunks, not the unsplit messages: the embeddings were computed from
# the chunk texts, so only the chunks line up with them one-to-one. As soon as
# any message is split, sms_documents has a different length and order.
# NOTE: the store is persistent and add_documents creates new random ids on
# each call, so re-running this cell appends the same chunks again.
vectorstore.add_documents(chunks, embeddings)
rag_retriever.retrieve("threat", score_threshold=0)

In [None]:
# Open (or create) one collection per data source. Each store gets its own
# name so that no handle is lost by rebinding a single variable.
txt_messages_store = VectorStore(collection_name="txt_messages", persist_directory="../data/vector_store/")
contacts_store = VectorStore(collection_name="contacts", persist_directory="../data/vector_store/")
call_logs_store = VectorStore(collection_name="call_logs", persist_directory="../data/vector_store/")

# Kept for compatibility: this cell has always left `vectorstore` bound to the
# "call_logs" store. Note that this replaces whatever `vectorstore` referred to
# before, so cells run afterwards that use `vectorstore` write to "call_logs".
vectorstore = call_logs_store

In [None]:
# Sanity check on the embeddings: report how many there are and which vector
# lengths occur, instead of printing one line per embedding.
embedding_dims = sorted({len(vector) for vector in embeddings})
print(f"{len(embeddings)} embeddings; vector length(s): {embedding_dims}")