## Data ingestion

In [15]:
### Document structure

from langchain_core.documents import Document

In [16]:
doc = Document(
    page_content="Hello, world! this is a test document", 
    metadata={
        "source": "example.pdf", 
        "page": 1, "author": "John Doe", 
        'date_created': "2021-01-01"
    })

doc

Document(metadata={'source': 'example.pdf', 'page': 1, 'author': 'John Doe', 'date_created': '2021-01-01'}, page_content='Hello, world! this is a test document')

In [19]:
### create a simple text file in data folder

import os 
os.makedirs("../data/text_files", exist_ok=True)

sample_texts={
    "../data/text_files/python_intro_text.txt": """Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.
        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.
        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software.""",
    "../data/text_files/python_features_text.txt": "Some of the features of Python are: \n\n 1. Easy to learn \n 2. Interactive \n 3. Dynamically typed \n 4. Extensible and embeddable \n 5. Large standard library \n 6. Extensive support libraries \n 7. Platform independent \n 8. Free and open source"
}

# open(file_path, "w", encoding="utf-8") opens a file for writing:
# "w" mode creates a new file or overwrites existing content
# encoding="utf-8" ensures proper handling of Unicode characters
# with statement ensures the file is properly closed after writing
# f.write(content) writes the text content to the file

for file_path, content in sample_texts.items():
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

print('Files created successfully!')
        

Files created successfully!


### docuemnt loader 

In [37]:
### text loader

from langchain.document_loaders import TextLoader
# or
# from langchain_community.document_loaders import TextLoader
# either way, the loader is the same

loader = TextLoader("../data/text_files/python_intro_text.txt")
docs = loader.load()

print(docs)


[Document(metadata={'source': '../data/text_files/python_intro_text.txt'}, page_content='Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.\n        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.\n        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software.')]


### Directory loader

In [30]:
from langchain.document_loaders import DirectoryLoader

dirLoader = DirectoryLoader(
    "../data/text_files",
    glob="*.txt",
    loader_cls=TextLoader
)
docs = dirLoader.load()

print(docs)

[Document(metadata={'source': '../data/text_files/python_intro_text.txt'}, page_content='Python is a popular, high-level programming language created by Guido van Rossum and first released in 1991. It is known for its simple and readable syntax, which makes it easy for beginners to learn. Python supports multiple programming styles, including procedural, object-oriented, and functional programming.\n        It is widely used for web development, data analysis, automation, scientific computing, artificial intelligence, and more. Python runs on many platforms like Windows, macOS, and Linux, making it very versatile.\n        Because Python code is easy to write and read, it is often used for rapid prototyping as well as production software.'), Document(metadata={'source': '../data/text_files/python_features_text.txt'}, page_content='Some of the features of Python are: \n\n 1. Easy to learn \n 2. Interactive \n 3. Dynamically typed \n 4. Extensible and embeddable \n 5. Large standard libr

### read pdf files

In [None]:
from langchain.document_loaders import PyPDFLoader

pdf_directory_loader = DirectoryLoader(
    "../data/pdf_files",
    glob="*.pdf",
    loader_cls=PyPDFLoader
)

pdf_docs = pdf_directory_loader.load()

# the length is 34 because each page of the PDF is being treated as a separate document. This is the default behavior of PyPDFLoader in LangChain.
print(len(pdf_docs))
pdf_docs



34


[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2024-07-02T10:53:56+03:00', 'title': 'Cybersecurity and cybercrime: Current trends and threats', 'author': 'Aleksandra Kuzior', 'keywords': 'cybercrime, cybersecurity, digital transformation, cyberspace, cyber fraud', 'moddate': '2024-07-02T10:53:56+03:00', 'source': '../data/pdf_files/12_1441_JIS_Tiutiunyk et al.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='220  \nKuzior, A., Tiutiunyk, I., Zielińska, A., & Kelemen, R. (2024). Cybersecurity and \ncybercrime: Current trends and threats. Journal of International Studies, 17(2), 220-\n239. doi:10.14254/2071-8330.2024/17-2/12 \nCybersecurity and cybercrime: Current \ntrends and threats \nAleksandra Kuzior \nFaculty of Organization and Management, \nSilesian University of Technology, Poland \naleksandra.kuzior@polsl.pl  \nORCID 0000-0001-9764-5320 \n \nInna Tiutiunyk \nDepartment of Financial Technologies and 

# embedding and vector store database

In [38]:
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
# https://www.trychroma.com/
import chromadb
from chromadb.config import Settings
import uuid

# typing is used to define the types of the variables
from typing import List, Dict, Any
from sklearn.metrics.pairwise import cosine_similarity

# embedding manager

In [41]:
class EmbeddingManager:
    def __init__(self, model_name:str = 'all-MiniLM-L6-v2'):
        """
        Initialize the embedding manager with a specific model name

        Args:
            model_name: Huggingface model name for sentence embeddings
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """
        protected method to load the model from Huggingface
        Load the model from Huggingface
        """
        print(f"loading model {self.model_name}...")
        self.model = SentenceTransformer(self.model_name)
        print(f"loaded model {self.model_name} with {self.model.get_sentence_embedding_dimension()} dimensions!")

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            texts: List of texts to generate embeddings for

        Returns:
            embeddings: List of embeddings for the texts as a numpy array of shape(len(texts), embedding_dim)
        """

        print(f"generating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"generated embeddings for {len(texts)} texts!")
        print(f"embeddings shape: {embeddings.shape}")
        return embeddings






In [46]:
# initialize the embedding manager
embedding_manager = EmbeddingManager()

loading model all-MiniLM-L6-v2...
loaded model all-MiniLM-L6-v2 with 384 dimensions!
