In [1]:
# Prints a fixed string; presumably a quick check that the kernel runs — confirm or remove.
print('Ahmed')

Ahmed


In [2]:
import os
from git import Repo
from langchain_classic.document_loaders.generic import GenericLoader
from langchain_classic.document_loaders.parsers import LanguageParser
from langchain_classic.text_splitter import Language



  from .autonotebook import tqdm as notebook_tqdm


## load docs

In [3]:
def load_repo(url: str = None, repo_dir: str = "Input_Repo"):
    """
    Clone a git repository into a local directory, reusing an existing clone.

    The clone's folder name is the last path segment of ``url`` with a single
    trailing ``.git`` suffix removed. Trailing slashes are ignored, and a
    ``.git`` substring elsewhere in the name (e.g. ``user.github.io``) is
    left untouched.

    Args:
        url (str): Git repository URL. Required despite the ``None`` default.
        repo_dir (str): Parent directory that will contain the clone.

    Returns:
        str: Path to the (new or pre-existing) local repository.

    Raises:
        ValueError: If ``url`` is empty or no repository name can be
            derived from it.
    """
    if not url:
        raise ValueError("Repository URL is required")

    # Derive the folder name from the last path segment. Strip trailing
    # slashes first so "https://host/user/repo/" does not yield "".
    repo_name = url.rstrip('/').split('/')[-1]
    # Drop only a trailing ".git"; str.replace would also mangle names that
    # merely contain ".git", such as "user.github.io".
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    if not repo_name:
        raise ValueError(f"Could not determine repository name from URL: {url}")

    # Create the parent directory if it doesn't exist
    os.makedirs(repo_dir, exist_ok=True)
    repo_path = os.path.join(repo_dir, repo_name)

    # Skip the clone when the target path is already present
    if not os.path.exists(repo_path):
        print(f"Cloning repository from {url} to {repo_path}")
        Repo.clone_from(url=url, to_path=repo_path)
    else:
        print(f"Repository already exists at {repo_path}")

    return repo_path  # Return the actual path for use in loader

def document_loader_repo(path: str):
    """
    Read the Python source files under a directory into document objects.

    Args:
        path (str): Root directory to scan (typically a cloned repository)

    Returns:
        list: Documents produced by the loader for the ``.py`` files found

    Raises:
        FileNotFoundError: If ``path`` is not present on disk
    """
    path_missing = not os.path.exists(path)
    if path_missing:
        raise FileNotFoundError(f"Path does not exist: {path}")

    # Parser configured for Python sources
    python_parser = LanguageParser(
        language=Language.PYTHON,
        parser_threshold=500,
    )

    # Match everything with the "**/*" glob, then keep only .py suffixes
    py_loader = GenericLoader.from_filesystem(
        path=path,
        glob="**/*",
        suffixes=['.py'],
        parser=python_parser,
    )

    return py_loader.load()

# Example usage: clone (or reuse) the sample repository and load its Python files.
# `repo_path` and `docs` are left in the notebook namespace; `docs` is read by later cells.
if __name__ == "__main__":
    # Clone a repository into the default "Input_Repo" directory, or reuse
    # the existing clone when that path is already present
    repo_url = "https://github.com/Ahmed2797/Network-Security.git"
    repo_path = load_repo(url=repo_url)
    
    # Load documents from the cloned repo and report how many were found
    docs = document_loader_repo(repo_path)
    print(f"Loaded {len(docs)} documents")

Repository already exists at Input_Repo/Network-Security
Loaded 35 documents


In [4]:
# Preview only the first few documents; displaying the whole list floods the
# notebook with the full contents of every loaded file.
docs[:3]

[Document(metadata={'source': 'Input_Repo/Network-Security/test_mongo.py', 'language': <Language.PYTHON: 'python'>}, page_content='from pymongo.mongo_client import MongoClient\nfrom pymongo.server_api import ServerApi\n\nuri = "mongodb+srv://tanvirahmed754575_db_user:<@password>@cluster0.oofrxbi.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"\n\n# Create a new client and connect to the server\nclient = MongoClient(uri, server_api=ServerApi(\'1\'))\n\n# Send a ping to confirm a successful connection\ntry:\n    client.admin.command(\'ping\')\n    print("Pinged your deployment. You successfully connected to MongoDB!")\nexcept Exception as e:\n    print(e)\n\n\n'),
 Document(metadata={'source': 'Input_Repo/Network-Security/app.py', 'language': <Language.PYTHON: 'python'>}, page_content='from Network_Security.pipeline.train_pipeline import Training_Pipeline\nfrom Network_Security.logging.logger import logging\nfrom Network_Security.exception.exception import NetworkSecurityExcept

## split_text

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

from langchain_classic.text_splitter import Language
from typing import List

def split_documents(documents: List[Document], 
                   chunk_size: int = 500, 
                   chunk_overlap: int = 100) -> List[Document]:
    """
    Break documents into smaller chunks using a splitter configured for Python.

    Args:
        documents: Documents to be chunked
        chunk_size: Upper bound on chunk length, in characters
        chunk_overlap: Number of characters shared by neighbouring chunks

    Returns:
        List[Document]: The resulting chunks
    """
    splitter_options = {
        "language": Language.PYTHON,
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    python_splitter = RecursiveCharacterTextSplitter.from_language(**splitter_options)

    pieces = python_splitter.split_documents(documents)

    print(f"Created {len(pieces)} chunks from {len(documents)} documents")

    # Report the length of up to five leading chunks as a quick sanity check
    for position, piece in enumerate(pieces[:5], start=1):
        print(f"Chunk {position}: {len(piece.page_content)} characters")

    return pieces


# Split the previously loaded `docs` with the default chunk_size/chunk_overlap.
# `chunks` stays in the notebook namespace and is used to build the vector store.
if __name__ == '__main__':
    chunks = split_documents(docs)

Created 181 chunks from 35 documents
Chunk 1: 323 characters
Chunk 2: 198 characters
Chunk 3: 357 characters
Chunk 4: 161 characters
Chunk 5: 471 characters


## Embedding Model

In [6]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

def embedding_model(model_name:str="sentence-transformers/all-MiniLM-l6-v2"):
    """
    Build a HuggingFace embedding object for the given model.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [7]:
# Build the embedding object with the default model name; `embedding` is
# reused when creating and loading the vector store.
embedding = embedding_model()

## FAISS vector store

In [8]:
from langchain_community.vectorstores.faiss import FAISS
from langchain_core.documents import Document
from typing import List, Optional
import os

class VectorStore:
    """Helper for creating, saving, loading and querying a FAISS vector store."""

    def __init__(self, index_name: str = "faiss_index"):
        """
        Initialize Vector Store for FAISS.

        Args:
            index_name: Default directory path used by save/load when no
                custom path is supplied
        """
        self.index_name = index_name

    def create_vector_store(self, document_chunks: List[Document], embedding_model):
        """
        Create a FAISS vector store from document chunks.

        Args:
            document_chunks: List of document chunks to embed
            embedding_model: Embedding model instance

        Returns:
            FAISS: Vector store instance

        Raises:
            ValueError: If ``document_chunks`` is empty or None
        """
        # One falsy check covers both None and an empty list
        if not document_chunks:
            raise ValueError("Document chunks list cannot be empty")

        # Embed the chunks and build the index
        vector_store = FAISS.from_documents(
            documents=document_chunks,
            embedding=embedding_model
        )

        print(f"Created vector store with {len(document_chunks)} documents")
        return vector_store

    def save_vector_store(self, vector_store, custom_path: Optional[str] = None):
        """
        Save vector store to disk.

        Args:
            vector_store: FAISS vector store instance
            custom_path: Custom path to save index (optional); falls back to
                ``self.index_name``

        Returns:
            str: Path where index was saved
        """
        save_path = custom_path or self.index_name

        # Make sure the parent directory exists. A bare name such as
        # "faiss_index" has no parent component, so fall back to ".".
        parent_dir = os.path.dirname(save_path) or "."
        os.makedirs(parent_dir, exist_ok=True)

        # Save the vector store
        vector_store.save_local(save_path)
        print(f"Vector store saved to: {save_path}")
        return save_path

    def load_vector_store(self, embedding_model, custom_path: Optional[str] = None):
        """
        Load vector store from disk.

        Args:
            embedding_model: Same embedding model used during creation
            custom_path: Custom path to load index from (optional); falls
                back to ``self.index_name``

        Returns:
            FAISS: Loaded vector store instance

        Raises:
            FileNotFoundError: If nothing exists at the resolved path
        """
        load_path = custom_path or self.index_name

        if not os.path.exists(load_path):
            raise FileNotFoundError(f"Vector store not found at: {load_path}")

        # SECURITY: this flag allows pickle-based deserialization of the saved
        # index data. Only load index directories this project wrote itself;
        # never point this at files from an untrusted source.
        vector_store = FAISS.load_local(
            folder_path=load_path,
            embeddings=embedding_model,
            allow_dangerous_deserialization=True
        )
        print(f"Vector store loaded from: {load_path}")
        return vector_store

    def search_similar(self, vector_store, query: str, k: int = 4):
        """
        Search for similar documents.

        Args:
            vector_store: FAISS vector store instance
            query: Search query
            k: Number of results to return

        Returns:
            List[Document]: Similar documents
        """
        return vector_store.similarity_search(query=query, k=k)
    
# Build the FAISS index from `chunks` with `embedding` and save it under the
# default index name. `store` is reused by the next cell to load the index.
if __name__=="__main__":
    store = VectorStore()
    # NOTE: `vactor_store` is a misspelling of "vector_store"; the name is kept as-is.
    vactor_store = store.create_vector_store(chunks,embedding)
    save_path = store.save_vector_store(vector_store=vactor_store)
    # The saved index is loaded back in the following cell via store.load_vector_store(...).
    

Created vector store with 181 documents
Vector store saved to: faiss_index


In [9]:
# Load the saved index from the store's default path (no custom_path given).
# NOTE: this variable has the same name as the VectorStore.load_vector_store
# method but holds the loaded vector store object.
load_vector_store = store.load_vector_store(embedding_model=embedding)


Vector store loaded from: faiss_index


## LLM

In [10]:
from langchain_groq import ChatGroq

def llm_load(api_key:str,model:str="llama-3.1-8b-instant",max_tokens:int=500):
    """
    Create a Groq chat model client.

    Args:
        api_key: Groq API key passed to ``ChatGroq``
        model: Name of the model to request; previously this argument was
            ignored in favour of a hard-coded name
        max_tokens: Value forwarded as ``ChatGroq``'s ``max_tokens``
            (defaults to 500, the formerly hard-coded value)

    Returns:
        The configured ``ChatGroq`` instance
    """
    llm = ChatGroq(
        model=model,  # honour the caller's choice instead of a fixed name
        api_key=api_key,
        max_tokens=max_tokens,
    )
    return llm
    

from dotenv import load_dotenv
import os

# Load variables from a .env file (if present) so the key is not hard-coded here.
load_dotenv()
# os.getenv returns None when GROQ_API_KEY is not set.
groq_api_key = os.getenv('GROQ_API_KEY')

# Build the chat model with llm_load's default model name; `llm` is reused by later cells.
llm = llm_load(api_key=groq_api_key)

In [17]:
from langchain_classic.memory import ConversationSummaryMemory
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_core.vectorstores import VectorStore
from langchain_classic.schema import BaseRetriever


class GenerateRetriever:
    """Builds retrievers and a memory-backed conversational chain on top of a vector store."""

    def __init__(self, vector_store: VectorStore, memory_key: str = 'chat_history'):
        """
        Initialize a conversational retriever with memory.

        Args:
            vector_store: Vector store containing embedded documents
            memory_key: Key for storing chat history in memory
        """
        self.vector_store = vector_store
        self.memory_key = memory_key

    def create_memory(self, llm) -> ConversationSummaryMemory:
        """
        Create conversation summary memory.

        Args:
            llm: Language model instance

        Returns:
            ConversationSummaryMemory: Configured memory object
        """
        # output_key="answer" tells the memory which chain output to store;
        # the chain built below also returns source documents.
        memory = ConversationSummaryMemory(
            llm=llm,
            memory_key=self.memory_key,
            return_messages=True,
            output_key="answer"
        )
        return memory

    def create_conversational_chain(self, llm, memory: ConversationSummaryMemory,
                                    k: int = 3, fetch_k: int = 50) -> ConversationalRetrievalChain:
        """
        Create conversational retrieval chain backed by an MMR retriever.

        Args:
            llm: Language model instance
            memory: Conversation memory object
            k: Number of documents the retriever returns (default 3)
            fetch_k: Number of candidate documents fetched for MMR (default 50)

        Returns:
            ConversationalRetrievalChain: Configured retrieval chain
        """
        # Create retriever from vector store
        retriever = self.vector_store.as_retriever(
            search_type="mmr",
            search_kwargs={
                "k": k,              # Number of documents to return
                "fetch_k": fetch_k,  # Number of documents to fetch for MMR
                # NOTE(review): "score_threshold" is documented for
                # search_type="similarity_score_threshold"; it may have no
                # effect with MMR — confirm before relying on it.
                "score_threshold": 0.5
            }
        )

        # Create conversational retrieval chain
        conversational_chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory,
            chain_type="stuff",  # Options: "stuff", "map_reduce", "refine", "map_rerank"
            return_source_documents=True  # Return source documents for citation
        )

        print("Created conversational chain with retriever")
        # Report the values actually used (the old message hard-coded k: 5)
        print(f"Search type: MMR, k: {k}, fetch_k: {fetch_k}")

        return conversational_chain

    def simple_retriever(self, search_type: str = "similarity", k: int = 4,
                         score_threshold: float = 0.7) -> BaseRetriever:
        """
        Create a simple retriever without conversation chain.

        Args:
            search_type: Type of search ("similarity", "mmr", "similarity_score_threshold")
            k: Number of documents to retrieve
            score_threshold: Threshold added to the search kwargs only when
                ``search_type`` is "similarity_score_threshold" (default 0.7)

        Returns:
            BaseRetriever: Configured retriever
        """
        search_kwargs = {"k": k}

        # Add additional parameters based on search type
        if search_type == "similarity_score_threshold":
            search_kwargs["score_threshold"] = score_threshold

        return self.vector_store.as_retriever(
            search_type=search_type,
            search_kwargs=search_kwargs
        )


# Wire the loaded vector store, summary memory and LLM into a conversational chain.
# `chain` stays in the notebook namespace and is used by the query cells below.
if __name__=='__main__':
    # `load_vector_store` here is the loaded vector store object, not the method
    gen_retriver = GenerateRetriever(load_vector_store)
    memory = gen_retriver.create_memory(llm=llm)
    chain = gen_retriver.create_conversational_chain(llm=llm,memory=memory)

Created conversational chain with retriever
Search type: MMR, k: 5, fetch_k: 50


In [18]:
# Display the chain's repr to inspect how it is configured.
chain

ConversationalRetrievalChain(memory=ConversationSummaryMemory(llm=ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x744d2d9e1670>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x744d2d9e1100>, model_name='llama-3.1-8b-instant', model_kwargs={}, groq_api_key=SecretStr('**********'), max_tokens=500), chat_memory=InMemoryChatMessageHistory(messages=[]), output_key='answer', return_messages=True, memory_key='chat_history'), verbose=False, combine_docs_chain=StuffDocumentsChain(verbose=False, llm_chain=LLMChain(verbose=False, prompt=ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="Use the following pieces of context to answer the user's question.\nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{

System: Use the following pieces of context to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.


In [19]:
# Ask the chain a question. `invoke` with an explicit input dict replaces the
# deprecated direct call `chain(...)`. The variable is not named `question`
# so it cannot clash with the `question()` helper defined in this notebook.
user_question = 'whats get_feature_extract_data'
result = chain.invoke({"question": user_question})
result

{'question': 'whats get_feature_extract_data',
 'chat_history': [SystemMessage(content='', additional_kwargs={}, response_metadata={})],
 'answer': "`get_feature_extract_data` is a method of a class (not explicitly shown in the provided context) that appears to be responsible for extracting data from a MongoDB database.\n\nHere's a step-by-step breakdown of what the method does:\n\n1. It logs an informational message indicating that data extraction from MongoDB is starting.\n2. It creates an instance of the `NetworkData` class, which is not shown in the provided context.\n3. It calls the `get_dataframe` method of the `NetworkData` instance, passing the `collection_name` from `self.ingestion_config.data_ingestion_collection_path` as an argument. This method likely retrieves data from the specified MongoDB collection and returns it as a Pandas DataFrame.\n4. The method does not immediately return the DataFrame, but instead sets up a variable named `feature_data_path` that is assigned the

In [20]:
# Show only the answer text from the most recent chain result.
result['answer']

"`get_feature_extract_data` is a method of a class (not explicitly shown in the provided context) that appears to be responsible for extracting data from a MongoDB database.\n\nHere's a step-by-step breakdown of what the method does:\n\n1. It logs an informational message indicating that data extraction from MongoDB is starting.\n2. It creates an instance of the `NetworkData` class, which is not shown in the provided context.\n3. It calls the `get_dataframe` method of the `NetworkData` instance, passing the `collection_name` from `self.ingestion_config.data_ingestion_collection_path` as an argument. This method likely retrieves data from the specified MongoDB collection and returns it as a Pandas DataFrame.\n4. The method does not immediately return the DataFrame, but instead sets up a variable named `feature_data_path` that is assigned the value of `self.ingestion_config.data_ingestion_feature_path`. This suggests that the method might be related to feature extraction or processing, b

In [27]:
# Send a follow-up query through the same chain (its memory carries over the
# earlier exchange). `invoke` replaces the deprecated direct call `chain(...)`.
query = 'NetworkSecurity_Features'
result = chain.invoke({"question": query})
result['answer']

'Storing feature data in an S3 bucket serves several purposes:\n\n1. **Data Versioning**: S3 allows you to store multiple versions of your data, making it easier to track changes and maintain a record of your data\'s evolution over time.\n\n2. **Data Sharing**: S3 buckets can be shared with others, facilitating collaboration and data exchange between teams or organizations.\n\n3. **Data Retrieval and Analysis**: With S3, you can easily access and analyze your feature data using various tools and libraries, such as those provided by AWS or third-party vendors.\n\n4. **Scalability and Cost-Effectiveness**: S3 offers a scalable and cost-effective solution for storing large amounts of data, as you only pay for the storage you use.\n\n5. **Data Backup and Disaster Recovery**: By storing feature data in S3, you can easily create backups and ensure business continuity in case of data loss or system failures.\n\n6. **Machine Learning and Model Training**: Feature data stored in S3 can be used 

In [24]:
def question(question:str) -> str:
    """
    Ask the conversational chain a question and return only the answer.

    Relies on the notebook-level ``chain`` object being defined.

    Args:
        question: The user's question

    Returns:
        The value stored under the ``'answer'`` key of the chain's result
    """
    # Calling the chain object directly is deprecated; invoke() with an
    # explicit input dict is the supported entry point.
    result = chain.invoke({"question": question})
    return result['answer']
    