**Demo Model** 
This notebook shows how I retrieved data from PDFs, web links, and YouTube videos, split it into chunks, embedded the chunks, and uploaded them to MongoDB.

In [None]:
import os
import logging
from pymongo import MongoClient
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch


# Only let pypdf log records at ERROR level or above through.
logging.getLogger("pypdf").setLevel(logging.ERROR)

# Define the database and collection names.
# MONGO_URI is read from the environment and is None when the variable is not set.
MONGO_URI = os.environ.get('MONGO_URI')
DB_NAME = "ai_workbench"
COLLECTION_NAME = "documents"
ATLAS_VECTOR_SEARCH_IDX = "default" 

# Sources to ingest. Every entry is commented out to prevent duplicate uploads
# when the notebook is run more than once; uncomment the ones you want to
# upload to your MongoDB server.
# Each entry ends with a comma so that uncommenting several of them never
# merges two adjacent string literals into a single bogus source.
sources = [
    # "doc/Stack & Queue.pdf",  # Example local PDF file
    # "https://en.wikipedia.org/wiki/Oulu",  # Example web page
    # "https://youtu.be/yhiauaA5bc8?si=te3ra0DrY-F-xjvU",  # Example YouTube video
    # "https://youtu.be/jGwO_UgTS7I?si=WBSTHpb4LJikOFJi",
]




I ran into some complications extracting text from videos, mainly because of faulty APIs and extensions, but I did not want to give in and use the OpenAI API.

Finally, I found that pytubefix can extract the captions from a video; when the captions are not in English, I use googletrans to translate them. This approach requires two additional helper functions.

In [25]:
from googletrans import Translator

# Shared googletrans client; translate_to_english reuses it for every call.
translator = Translator()
# Helper function to clean the raw SRT caption format from the Youtube's captions
def clean_srt_captions(srt_text):
    """
    Parse an SRT string and return only the spoken text, joined by spaces.

    Dropped lines: empty lines, timestamp lines (anything containing '-->'),
    and cue index lines. A digit-only line counts as a cue index only when the
    line right after it is a timestamp, so captions whose text is purely
    numeric (e.g. "2024") are kept.

    Args:
        srt_text: Raw SRT content. Empty or None input yields "".

    Returns:
        The caption text as a single space-separated string.
    """
    if not srt_text:
        return ""

    lines = [line.strip() for line in srt_text.splitlines()]
    clean_lines = []
    for position, line in enumerate(lines):
        # Skip empty lines and lines with the SRT timestamp arrow '-->'
        if not line or '-->' in line:
            continue

        # A number is a cue index only if a timestamp line follows it directly
        following = lines[position + 1] if position + 1 < len(lines) else ""
        if line.isdigit() and '-->' in following:
            continue

        clean_lines.append(line)
    return " ".join(clean_lines)

def _split_for_translation(text, max_chars):
    """Split text on whitespace into chunks of at most max_chars characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single token longer than the limit (e.g. text written without
        # spaces) cannot be kept whole, so it is hard-sliced.
        while len(word) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chars])
            word = word[max_chars:]
        if not word:
            continue

        candidate = f"{current} {word}" if current else word
        if len(candidate) > max_chars:
            chunks.append(current)
            current = word
        else:
            current = candidate

    if current:
        chunks.append(current)
    return chunks


def translate_to_english(text, src_lang = 'auto', max_chars = 4500):
    """
    Translate the text to English using googletrans.

    Text that fits in max_chars is sent unchanged in a single request. Longer
    text (a full video transcript, for instance) is split on whitespace into
    chunks of at most max_chars characters; each chunk is translated
    separately and the results are joined with single spaces.

    Args:
        text: The text to translate.
        src_lang: Source language code, or 'auto' to let the service detect it.
        max_chars: Upper bound on the size of one request. The default is a
            conservative choice -- NOTE(review): confirm against the request
            limit of the installed googletrans version.

    Returns:
        The translated text. Empty or None input is returned as-is without
        calling the service. None is returned when translation fails (the
        error is printed).
    """
    try:
        if not text or not text.strip():
            return text
        if max_chars < 1:
            raise ValueError("max_chars must be at least 1")

        if len(text) <= max_chars:
            pieces = [text]
        else:
            pieces = _split_for_translation(text, max_chars)

        translated = [
            translator.translate(piece, dest = 'en', src = src_lang).text
            for piece in pieces
        ]
        return " ".join(translated)

    except Exception as e:
        print(f"Error: {e}")
        return None

In [26]:
from langchain.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.docstore.document import Document
from pytubefix import YouTube


# Load every configured source into `all_doc`. A failure on one source is
# reported and skipped so the remaining sources are still processed.
all_doc = []

for source in sources:
    print(f"Reading {source}")
    try:
        if source.lower().endswith('.pdf'):         # Grabbing pdf (extension match is case-insensitive)
            loader = PyPDFLoader(source)
            all_doc.extend(loader.load())

        elif 'youtube.com' in source or 'youtu.be' in source:       # Grabbing youtube
            yt = YouTube(source)
            caption = None

            # Look up available captions and pick the best direction from there:
            # manual English first, then auto-generated English, then whatever exists.
            if 'en' in yt.captions:
                caption = yt.captions['en']

            elif 'a.en' in yt.captions:
                caption = yt.captions['a.en']

            elif len(yt.captions) > 0:
                caption = next(iter(yt.captions))
                print(" No English caption available, imporvising..")

            if not caption:
                raise ValueError("Could not found any available captions")

            processed_caption = clean_srt_captions(caption.generate_srt_captions())
            final_text = processed_caption
            if caption.code not in ('en', 'a.en'):
                # Auto-generated tracks carry an 'a.' prefix (as in 'a.en');
                # strip it so the translator receives a plain language code.
                source_language = caption.code[2:] if caption.code.startswith('a.') else caption.code
                final_text = translate_to_english(processed_caption, src_lang= source_language)
                if not final_text:
                    # The caption code may not be one the translator accepts;
                    # retry once with automatic language detection.
                    final_text = translate_to_english(processed_caption)
                if not final_text:
                    raise ValueError(f"Could not translate captions from '{caption.code}' to English")

            doc = Document(
                page_content= final_text,
                metadata={"source": source, "title": yt.title, "original_language": caption.code}
            )
            all_doc.append(doc)

        elif source.startswith('https://'): #Grabbing links (notice we don't grab unsecured links)
            loader = WebBaseLoader(source)
            all_doc.extend(loader.load())
        else:
            print(f"Unknown source file: {source}")
    
    except Exception as e:
        print(f"\t\t ERROR LOADING {source}: {e}")


Reading https://youtu.be/jGwO_UgTS7I?si=WBSTHpb4LJikOFJi


**Splitting**

In [27]:
print(".. Splitting...")

# Chunking settings handed to the splitter as keyword arguments.
splitter_kwargs = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

# Break every loaded document into chunks and report how many were produced.
docs = text_splitter.split_documents(all_doc)
chunk_count = len(docs)
print(f" Splitted into {chunk_count} chunk(s)")

.. Splitting...
 Splitted into 83 chunk(s)


**Embedding**

In [28]:
# Announce the step before the embedding model is constructed.
print("..Initializing Embedding Model..")

# Embedding model that is later handed to the vector store when the chunks are uploaded.
# NOTE(review): presumably the Atlas vector search index must be configured
# for this model's output dimension -- confirm against the index definition.
embedding_model = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")


..Initializing Embedding Model..


**Upload to MongoDB**

In [29]:
print(".. Connecting to MongoDB server")

# MONGO_URI is None when the environment variable is missing. MongoClient
# accepts None and falls back to its default host, so the misconfiguration
# would otherwise go unnoticed until the upload lands in the wrong place.
if not MONGO_URI:
    print(" WARNING: MONGO_URI is not set; MongoClient will fall back to its default host")

# Handle to the target collection, used by the upload step.
client = MongoClient(MONGO_URI)
collection = client[DB_NAME][COLLECTION_NAME]

.. Connecting to MongoDB server


In [30]:
print("... Embedding to MongoDB..")

# With every source commented out (the default) there are no chunks; skip the
# vector store call instead of reporting a successful upload of nothing.
if docs:
    # Embeds each chunk with `embedding_model` and stores it in `collection`.
    MongoDBAtlasVectorSearch.from_documents(
        documents = docs,
        embedding = embedding_model,
        collection = collection,
        index_name = ATLAS_VECTOR_SEARCH_IDX,
    )
    print(".... COMPLETE....")
else:
    print(" Nothing to upload: no chunks were produced (are all sources commented out?)")

... Embedding to MongoDB..
.... COMPLETE....
