# Scraping and Loading
This notebook scrapes information from a variety of sources, chunks it and stores it in a chroma database. 

In [68]:
#install packages
#python=3.12 for conda
%pip install -r requirements.txt -q

Note: you may need to restart the kernel to use updated packages.


In [69]:
from tqdm import tqdm
from typing import List, Tuple

## Document list
https://nps01.sharepoint.com/:w:/s/DNoKLLMTrainingProject/ESrGKy8E3_lMh9vG7lYk2V0BY4KXIErvX_8eAFb19lyc6Q?e=v5hUFq

In [70]:
# dictionaries of input documents

youtube_playlists = [
    {"name": "Trefor Bazett", "url": "https://youtube.com/playlist?list=PLHXZ9OQGMqxfUl0tcqPNTJsb7R6BqSLo6&si=NhIN_u37IFvSr0Lc"},
    {"name": "Gilbert Strang", "url": "https://youtube.com/playlist?list=PL49CF3715CB9EF31D&si=R2vA4kB9MqWZndTW"},
    {"name": "3blue1brown", "url": "https://youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab&si=PDgpnXcwZEEdFe4G"},
    {"name": "Wrath of Math", "url": "https://youtube.com/playlist?list=PLztBpqftvzxWT5z53AxSqkSaWDhAeToDG&si=ndRUFrIUIupQkpOj"},
    {"name": "Adam Panagos", "url": "https://youtube.com/playlist?list=PLdciPPorsHuk3Hp7QPPAtTkpW0o1UXQB6&si=A_H3aALHmU-rI4Ki"},
    {"name": "James Hamblin", "url": "https://youtube.com/playlist?list=PLNr8B4XHL5kGDHOrU4IeI6QNuZHur4F86&si=S16_paArJ6zCje6f"},
    {"name": "Kimberly Brehm", "url": "https://youtube.com/playlist?list=PLl-gb0E4MII03hiCrZa7YqxUMEeEPmZqK&si=tnpBY3NExUXLfxnr"},
    {"name": "Khan Academy", "url": "https://youtube.com/playlist?list=PLFD0EB975BA0CC1E0&si=XXQHp8tUdTLqd21-"},
]

youtube_videos = [
    {"name": "Zach Star - Dear Linear Algebra Students", "url": "https://youtu.be/4csuTO7UTMo?si=rt2Eow4VYIaYLEvT"},
    {"name": "Zach Star - The Applications of Matrices", "url": "https://youtu.be/rowWM-MijXU?si=2ofh0tX4G5alPre1"},
    {"name": "Zach Star - Eigenvectors and Eigenvalues", "url": "https://youtu.be/i8FukKfMKCI?si=CFnl_yrRkQvFyeez"},
    {"name": "Zach Star - Linear Algebra Tool", "url": "https://youtu.be/1_fXgJWQhVY?si=UHqsSkbdiXXafgn6"},
    {"name": "Zach Star - Dot Product Applications", "url": "https://youtu.be/TBpDMLCC2uY?si=Sf_fZPfrmnR4UChD"},
]

blogs = [
    {"name": "3blue1brown Blog", "url": "https://www.3blue1brown.com/topics/linear-algebra"},
    {"name": "Gregory Gundersen - Matrices as Functions", "url": "https://www.gregorygundersen.com/blog/2022/08/28/matrices-as-functions-and-data/"},
    {"name": "Gregory Gundersen - Positive Definite Matrices", "url": "https://www.gregorygundersen.com/blog/2022/02/27/positive-definite/"},
    {"name": "Gregory Gundersen - Linear Independence", "url": "https://www.gregorygundersen.com/blog/2021/04/24/linear-independence/"},
    {"name": "Gregory Gundersen - Matrix Inversion", "url": "https://www.gregorygundersen.com/blog/2020/12/09/matrix-inversion/"},
    {"name": "Gregory Gundersen - Outer Products", "url": "https://www.gregorygundersen.com/blog/2020/07/17/matmul/"},
    {"name": "Gregory Gundersen - Summing Quadratics", "url": "https://www.gregorygundersen.com/blog/2020/07/02/sum-quadratics/"},
    {"name": "Gregory Gundersen - Randomized SVD", "url": "https://www.gregorygundersen.com/blog/2019/01/17/randomized-svd/"},
    {"name": "Gregory Gundersen - SVD Proof", "url": "https://www.gregorygundersen.com/blog/2018/12/20/svd-proof/"},
    {"name": "Gregory Gundersen - Geometrical Understanding of Matrices", "url": "https://www.gregorygundersen.com/blog/2018/10/24/matrices/"},
    {"name": "Gregory Gundersen - Dot Product", "url": "https://www.gregorygundersen.com/blog/2018/06/26/dot-product/"},
]

pdfs = [
    {"name": "Gilbert Strang", "file": "Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf"},
    {"name": "Steven Leon", "file": "Steven Leon Linear-Algebra-with-Applications.pdf"},
    {"name": "VMLA", "file": "Introduction to Applied Linear Algebra VMLS.pdf"},
    {"name": "Jim Hefferon", "file": "Jim Hefferon linalgebra.pdf"},
]

pdf_directory = "C:\\Users\\jonathan.kasprisin\\github\\Learning\\KG_ilp\\data\\pdfs"



In [77]:
###Load PDFs
from langchain.document_loaders import PyPDFLoader
import os

#Find all PDF files in a directory and its subdirectories
def find_pdf_files(directory):
    pdf_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))
    return pdf_files

pdf_files = find_pdf_files(pdf_directory)
print(f"Found {len(pdf_files)} PDF files")

all_pdf_docs=[]
for f in pdf_files:
    try:
        print(f"Loading {f}")
        loader = PyPDFLoader(f)
        docs = loader.lazy_load()
        all_pdf_docs.extend(docs)
    except Exception as e:
        print(f"Error loading {f}: {e}")

print(f"Loaded {len(all_pdf_docs)} documents")

Found 4 PDF files
Loading C:\Users\jonathan.kasprisin\github\Learning\KG_ilp\data\pdfs\Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf
Loading C:\Users\jonathan.kasprisin\github\Learning\KG_ilp\data\pdfs\Introduction to Applied Linear Algebra VMLS.pdf
Loading C:\Users\jonathan.kasprisin\github\Learning\KG_ilp\data\pdfs\Jim Hefferon linalgebra.pdf
Loading C:\Users\jonathan.kasprisin\github\Learning\KG_ilp\data\pdfs\Steven Leon Linear-Algebra-with-Applications.pdf
Loaded 2048 documents


In [75]:
from langchain.document_loaders import YoutubeLoader
from langchain_core.documents import Document
import yt_dlp

def get_youtube_metadata(url: str) -> dict:
    ydl_opts = {
        'quiet': True,
        'skip_download': True,  # Do not download the video
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        return {
            "title": info_dict.get("title"),
            "description": info_dict.get("description"),
            "upload_date": info_dict.get("upload_date"),
            "duration": info_dict.get("duration"),
            "uploader": info_dict.get("uploader"),
            "view_count": info_dict.get("view_count"),
            "like_count": info_dict.get("like_count"),
            "dislike_count": info_dict.get("dislike_count"),
            "categories": info_dict.get("categories"),
            "tags": info_dict.get("tags"),
        }
    
def load_youtube_with_metadata(url: str, sequence =0) -> Tuple[List[Document], List[dict]]:

    try:
        loader = YoutubeLoader.from_youtube_url(url)
        yt_docs = loader.load()

        # Update metadata
        metadata = get_youtube_metadata(url)
        metadata["sequence"] = sequence
        for doc in yt_docs:
            doc.metadata.update(metadata)
        assert isinstance(yt_docs, list), "yt_docs should be a list"
        assert all(isinstance(doc, Document) for doc in yt_docs), "All items in yt_docs should be instances of Document"
        return yt_docs, None
    except Exception as e:
        print(f"Error loading transcript for from {url}: {str(e)}")
        metadata = get_youtube_metadata(url)
        error_list = [{"metadata": metadata, "content": "Transcript not available"}]
        return None, error_list
    


#YouTube playlist
def get_videos_from_playlist(playlist_url: str):
    print(f"Getting videos from playlist: {playlist_url}")
    ydl_opts = {
        'quiet': True,
        'extract_flat': True,  # Only extract video URLs, not full metadata
        'skip_download': True,
    }
   
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(playlist_url, download=False)
            video_entries = info.get('entries', [])
            video_info = []
            for idx, entry in enumerate(video_entries):
                video_url = entry.get('url')
                if not video_url:
                    print(f"Warning: Missing 'url' in entry {idx} of playlist.")
                    continue
                video_info.append({"url": video_url, "title": entry.get('title', 'No title'), "sequence": idx + 1})
            return video_info

        except yt_dlp.utils.DownloadError as e:
            print(f"Error extracting playlist info: {str(e)}")
            return []
    
#DEBUG
# test_info= (get_videos_from_playlist(youtube_playlists[0]["url"]))
# print(f"Test info: {test_info[0]}")
    
# Process all videos in a playlist
def process_youtube_playlist(playlist_url: str) -> Tuple[List[Document], List[dict]]:
    videos_info = get_videos_from_playlist(playlist_url)
    all_yt_docs = []
    all_errors = []

    for video in videos_info:
        url, sequence = video["url"], video["sequence"]
        print(f"Processing video: {url}")
        yt_docs, error_from_load = load_youtube_with_metadata(url, sequence)
        if yt_docs:
            all_yt_docs.extend(yt_docs)
        if error_from_load: 
            all_errors.extend(error_from_load)

    return all_yt_docs, all_errors

In [73]:
### Load youtube transcripts and metadata from playlist or single url

from langchain.document_loaders import YoutubeLoader
from langchain_core.documents import Document
import yt_dlp

def get_youtube_metadata(url):
    ydl_opts = {
        'quiet': True,
        'skip_download': True,  # Do not download the video
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info_dict = ydl.extract_info(url, download=False)
        return {
            "title": info_dict.get("title"),
            "description": info_dict.get("description"),
            "upload_date": info_dict.get("upload_date"),
            "duration": info_dict.get("duration"),
            "uploader": info_dict.get("uploader"),
            "view_count": info_dict.get("view_count"),
            "like_count": info_dict.get("like_count"),
            "dislike_count": info_dict.get("dislike_count"),
            "categories": info_dict.get("categories"),
            "tags": info_dict.get("tags"),
        }

def load_youtube_with_metadata(url, sequence =0):
    try:
        loader = YoutubeLoader.from_youtube_url(url)
        yt_docs = loader.load()

        # Update metadata
        metadata = get_youtube_metadata(url)
        metadata["sequence"] = sequence
        for doc in yt_docs:
            doc.metadata.update(metadata)

        assert isinstance(yt_docs, Document), f"Expected Document for {url}, got {type(yt_docs)}"
        return yt_docs
    except Exception as e:
        print(f"Error loading transcript for URL {url}: {str(e)}")
        metadata = get_youtube_metadata(url)
        # Return an empty doc with just the metadata
        return [{"metadata": metadata, "content": "Transcript not available"}]

#YouTube playlist
def get_videos_from_playlist(playlist_url):
    ydl_opts = {
        'quiet': True,
        'extract_flat': False,  # True ->Only extract video URLs, not full metadata
        'skip_download': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(playlist_url, download=False)
        video_entries = info.get('entries', [])
        video_info = [{"url": entry['url'], "sequence": idx + 1} for idx, entry in enumerate(video_entries)]
        return video_info
    
# Process all videos in a playlist
def process_youtube_playlist(playlist_url) -> list:
    videos_info = get_videos_from_playlist(playlist_url)
    all_yt_docs = []

    for video in videos_info:
        try:
            url, sequence = video["url"], video["sequence"]
            print(f"Processing video: {url}")
            yt_docs = load_youtube_with_metadata(url, sequence)
            #assert yt_docs is type Document
            assert yt_docs is not None
            all_yt_docs.append(yt_docs)
        except Exception as e:
            print(f"Error processing video: {url}: {str(e)}")
            
    return all_yt_docs


In [76]:
import traceback

all_yt_docs= []
error_loading_list = []
error_with_playlist = []
for playlist in tqdm(youtube_playlists, desc="Processing YouTube playlists"):
    try:
        print(f"Processing playlist: {playlist['name']}")
        yt_docs, error_list = process_youtube_playlist(playlist['url'])
        if yt_docs:
            all_yt_docs.extend(yt_docs)
        if error_list:
            error_loading_list.extend(error_list)
    except Exception as e:
        print(f"Error processing playlist: {playlist['name']}: {str(e)}")
        traceback.print_exc() #detailed error message
        error_with_playlist.append(playlist['name'])

# Iterate through individual YouTube videos
for video in youtube_videos:
    try:
        print(f"Processing video: {video['name']}")
        video_url = video['url']
        yt_docs, error_loading = load_youtube_with_metadata(video_url)
        if yt_docs:
            all_yt_docs.extend(yt_docs)
        if error_list:
            error_loading_list.extend(error_loading)
    except Exception as e:
        print(f"Error processing video: {video['name']}: {str(e)}")



Processing YouTube playlists:   0%|          | 0/8 [00:00<?, ?it/s]

Processing playlist: Trefor Bazett
Getting videos from playlist: https://youtube.com/playlist?list=PLHXZ9OQGMqxfUl0tcqPNTJsb7R6BqSLo6&si=NhIN_u37IFvSr0Lc




Processing video: https://www.youtube.com/watch?v=ZKUqtErZCiU
Processing video: https://www.youtube.com/watch?v=H8MiZMJr1kQ
Processing video: https://www.youtube.com/watch?v=qKyturOAD60
Processing video: https://www.youtube.com/watch?v=ghSkNXB08ds
Processing video: https://www.youtube.com/watch?v=MbmZk-x2FFs
Processing video: https://www.youtube.com/watch?v=B7XtnvNwEPA
Processing video: https://www.youtube.com/watch?v=B8oszxF9fhI
Processing video: https://www.youtube.com/watch?v=W01H0LcVUdQ
Processing video: https://www.youtube.com/watch?v=3lMH_no_OvA
Processing video: https://www.youtube.com/watch?v=vhm1bPMAgyU
Processing video: https://www.youtube.com/watch?v=SS48Om0PX6Q
Processing video: https://www.youtube.com/watch?v=WJlQzgS_itI
Processing video: https://www.youtube.com/watch?v=kzjVqg0Xxhk
Processing video: https://www.youtube.com/watch?v=4uJNuGfbxLg
Processing video: https://www.youtube.com/watch?v=ebdfJwHM5vo
Processing video: https://www.youtube.com/watch?v=KmDVM7VHB0Q
Processi

Processing YouTube playlists:  12%|█▎        | 1/8 [05:06<35:45, 306.53s/it]

Processing playlist: Gilbert Strang
Getting videos from playlist: https://youtube.com/playlist?list=PL49CF3715CB9EF31D&si=R2vA4kB9MqWZndTW




Processing video: https://www.youtube.com/watch?v=ZK3O402wf1c
Processing video: https://www.youtube.com/watch?v=QVKj3LADCnA
Processing video: https://www.youtube.com/watch?v=FX4C-JpTFgY
Processing video: https://www.youtube.com/watch?v=5hO3MrzPa0A
Processing video: https://www.youtube.com/watch?v=JibVXBElKL0
Processing video: https://www.youtube.com/watch?v=8o5Cmfpeo6g
Processing video: https://www.youtube.com/watch?v=VqP2tREMvt0
Processing video: https://www.youtube.com/watch?v=9Q1q7s1jTzU
Processing video: https://www.youtube.com/watch?v=yjBerM5jWsc
Processing video: https://www.youtube.com/watch?v=nHlE7EgJFds
Processing video: https://www.youtube.com/watch?v=2IdtqGM6KWU
Processing video: https://www.youtube.com/watch?v=6-wh6yvk6uc
Processing video: https://www.youtube.com/watch?v=l88D4r74gtM
Processing video: https://www.youtube.com/watch?v=YzZUIYRCE38
Processing video: https://www.youtube.com/watch?v=Y_Ac6KiQ1t0
Processing video: https://www.youtube.com/watch?v=osh80YCg_GM
Processi

Processing YouTube playlists:  25%|██▌       | 2/8 [07:17<20:19, 203.29s/it]

Processing playlist: 3blue1brown
Getting videos from playlist: https://youtube.com/playlist?list=PLZHQObOWTQDPD3MizzM2xVFitgF8hE_ab&si=PDgpnXcwZEEdFe4G
Processing video: https://www.youtube.com/watch?v=fNk_zzaMoSs
Processing video: https://www.youtube.com/watch?v=k7RM-ot2NWY
Processing video: https://www.youtube.com/watch?v=kYB8IZa5AuE
Processing video: https://www.youtube.com/watch?v=XkY2DOUCWMU
Processing video: https://www.youtube.com/watch?v=rHLEWRxRGiM
Processing video: https://www.youtube.com/watch?v=Ip3X9LOh2dk
Processing video: https://www.youtube.com/watch?v=uQhTuRlWMxw
Processing video: https://www.youtube.com/watch?v=v8VSDg_WQlA
Processing video: https://www.youtube.com/watch?v=LyGKycYT2v0
Processing video: https://www.youtube.com/watch?v=eu6i7WJeinw
Processing video: https://www.youtube.com/watch?v=BaM7OCEm3G0
Processing video: https://www.youtube.com/watch?v=jBsC34PxzoM
Processing video: https://www.youtube.com/watch?v=P2LTAUO1TdA
Processing video: https://www.youtube.com/

Processing YouTube playlists:  38%|███▊      | 3/8 [08:17<11:28, 137.76s/it]

Processing playlist: Wrath of Math
Getting videos from playlist: https://youtube.com/playlist?list=PLztBpqftvzxWT5z53AxSqkSaWDhAeToDG&si=ndRUFrIUIupQkpOj




Processing video: https://www.youtube.com/watch?v=oXMPQ-6YnGA
Processing video: https://www.youtube.com/watch?v=90YQb3Gajao
Processing video: https://www.youtube.com/watch?v=seet9VyHo3Q
Processing video: https://www.youtube.com/watch?v=CK65GbUTk4g
Processing video: https://www.youtube.com/watch?v=AQ7QHwHYEy4
Processing video: https://www.youtube.com/watch?v=FUVpa15NVcA
Processing video: https://www.youtube.com/watch?v=jG9Swa-wCwg
Processing video: https://www.youtube.com/watch?v=OuDA54fWk2o
Processing video: https://www.youtube.com/watch?v=LqkPbcwuD2E
Processing video: https://www.youtube.com/watch?v=JPwdkPZufe8
Processing video: https://www.youtube.com/watch?v=S9UkD30WhQs
Processing video: https://www.youtube.com/watch?v=vxB7bAJyL9c
Processing video: https://www.youtube.com/watch?v=_OTHALdXLmc
Processing video: https://www.youtube.com/watch?v=VAQNpUBwoAI
Processing video: https://www.youtube.com/watch?v=6E8p0cglZ4E
Processing video: https://www.youtube.com/watch?v=0CQ5uHSEcg8
Processi

ERROR: [youtube] 7w6RpH5v0jA: Join this channel from your computer or Android app to get access to members-only content like this video.


Error loading transcript for from https://www.youtube.com/watch?v=7w6RpH5v0jA: ERROR: [youtube] 7w6RpH5v0jA: Join this channel from your computer or Android app to get access to members-only content like this video.


ERROR: [youtube] 7w6RpH5v0jA: Join this channel from your computer or Android app to get access to members-only content like this video.
Traceback (most recent call last):
  File "c:\Users\jonathan.kasprisin\AppData\Local\miniconda3\envs\kg_ilp\Lib\site-packages\yt_dlp\YoutubeDL.py", line 1634, in wrapper
    return func(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonathan.kasprisin\AppData\Local\miniconda3\envs\kg_ilp\Lib\site-packages\yt_dlp\YoutubeDL.py", line 1769, in __extract_info
    ie_result = ie.extract(url)
                ^^^^^^^^^^^^^^^
  File "c:\Users\jonathan.kasprisin\AppData\Local\miniconda3\envs\kg_ilp\Lib\site-packages\yt_dlp\extractor\common.py", line 742, in extract
    ie_result = self._real_extract(url)
                ^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\jonathan.kasprisin\AppData\Local\miniconda3\envs\kg_ilp\Lib\site-packages\yt_dlp\extractor\youtube.py", line 4541, in _real_extract
    self.raise_no_formats(reason, expec

Error processing playlist: Wrath of Math: ERROR: [youtube] 7w6RpH5v0jA: Join this channel from your computer or Android app to get access to members-only content like this video.
Processing playlist: Adam Panagos
Getting videos from playlist: https://youtube.com/playlist?list=PLdciPPorsHuk3Hp7QPPAtTkpW0o1UXQB6&si=A_H3aALHmU-rI4Ki




Processing video: https://www.youtube.com/watch?v=Fg6B01vEN3U
Processing video: https://www.youtube.com/watch?v=YCyItKPDcSo
Processing video: https://www.youtube.com/watch?v=FeGVpk96UqY
Processing video: https://www.youtube.com/watch?v=VkYmaBhikYY
Processing video: https://www.youtube.com/watch?v=A4GoV2d21Ok
Processing video: https://www.youtube.com/watch?v=KFSkWm47Trs
Processing video: https://www.youtube.com/watch?v=jVw-OCy0Rqs
Processing video: https://www.youtube.com/watch?v=HXViJHadfhE
Processing video: https://www.youtube.com/watch?v=Bw0-UqZ_xnY
Processing video: https://www.youtube.com/watch?v=aK9GmG2_o9k
Processing video: https://www.youtube.com/watch?v=h4OiYfAHbqM
Processing video: https://www.youtube.com/watch?v=3TOmlY8dcXA
Processing video: https://www.youtube.com/watch?v=AGSajg1m-FU
Processing video: https://www.youtube.com/watch?v=tkyrhKRVbBw
Processing video: https://www.youtube.com/watch?v=rQZD3RM9ic0
Processing video: https://www.youtube.com/watch?v=y6a5fEfTXB0
Processi

Processing YouTube playlists:  62%|██████▎   | 5/8 [13:46<08:07, 162.65s/it]

Processing playlist: James Hamblin
Getting videos from playlist: https://youtube.com/playlist?list=PLNr8B4XHL5kGDHOrU4IeI6QNuZHur4F86&si=S16_paArJ6zCje6f




Processing video: https://www.youtube.com/watch?v=HAoL5fPmgrw
Processing video: https://www.youtube.com/watch?v=M13AmBUVQNc
Processing video: https://www.youtube.com/watch?v=x1CEVwn7Gnw
Processing video: https://www.youtube.com/watch?v=eXL8m865QeM
Processing video: https://www.youtube.com/watch?v=72ysuwtYA0c
Processing video: https://www.youtube.com/watch?v=kDbBTFvQgig
Processing video: https://www.youtube.com/watch?v=rSOST47KjRk
Processing video: https://www.youtube.com/watch?v=rfdVVxFbklM
Processing video: https://www.youtube.com/watch?v=qxRfVcJUihM
Processing video: https://www.youtube.com/watch?v=OpcznwRgUaY
Processing video: https://www.youtube.com/watch?v=OyqOfbeEhL0
Processing video: https://www.youtube.com/watch?v=4P1YUKPIc4w
Processing video: https://www.youtube.com/watch?v=Y8gB7nvtmrU
Processing video: https://www.youtube.com/watch?v=QdzOpzQ2pOQ
Processing video: https://www.youtube.com/watch?v=YqWpjR8IBFg
Processing video: https://www.youtube.com/watch?v=ZBLfZ6BYGf4
Processi

Processing YouTube playlists:  75%|███████▌  | 6/8 [16:54<05:42, 171.32s/it]

Processing playlist: Kimberly Brehm
Getting videos from playlist: https://youtube.com/playlist?list=PLl-gb0E4MII03hiCrZa7YqxUMEeEPmZqK&si=tnpBY3NExUXLfxnr




Processing video: https://www.youtube.com/watch?v=LHsPJ2bQX1U
Processing video: https://www.youtube.com/watch?v=SEh3yhEFK1w
Processing video: https://www.youtube.com/watch?v=7xtAYrAtuPc
Processing video: https://www.youtube.com/watch?v=MfeOEdjUfXw
Processing video: https://www.youtube.com/watch?v=ztsT5uoWeEE
Processing video: https://www.youtube.com/watch?v=9QXnOwSOoWA
Processing video: https://www.youtube.com/watch?v=kcBufqZgP4s
Processing video: https://www.youtube.com/watch?v=mH3Me8P-CO8
Processing video: https://www.youtube.com/watch?v=ODOztJ6YRUM
Processing video: https://www.youtube.com/watch?v=jsyutdYtOc4
Processing video: https://www.youtube.com/watch?v=Ju6Z90Spme0
Processing video: https://www.youtube.com/watch?v=Vk5Iun6sxhM
Processing video: https://www.youtube.com/watch?v=ofGN1Gtgjok
Processing video: https://www.youtube.com/watch?v=EdLoNRo0s28
Processing video: https://www.youtube.com/watch?v=oo2ej9M49Tw
Processing video: https://www.youtube.com/watch?v=ofAcvZVEfBk
Processi

Processing YouTube playlists:  88%|████████▊ | 7/8 [19:45<02:51, 171.27s/it]

Processing playlist: Khan Academy
Getting videos from playlist: https://youtube.com/playlist?list=PLFD0EB975BA0CC1E0&si=XXQHp8tUdTLqd21-
Processing video: https://www.youtube.com/watch?v=xyAuNHPsq-g
Processing video: https://www.youtube.com/watch?v=aKhhYguY0DQ
Processing video: https://www.youtube.com/watch?v=OAh573i_qn8
Processing video: https://www.youtube.com/watch?v=iUQR0enP7RQ
Processing video: https://www.youtube.com/watch?v=S4n-tQZnU6o
Processing video: https://www.youtube.com/watch?v=obts_JDS6_Q
Processing video: https://www.youtube.com/watch?v=AUqeb9Z3y3k
Processing video: https://www.youtube.com/watch?v=gsNgdVdAT1o
Processing video: https://www.youtube.com/watch?v=UqyN7-tRS00
Processing video: https://www.youtube.com/watch?v=5tB7y_piK6o
Processing video: https://www.youtube.com/watch?v=woqq3Sls1d8
Processing video: https://www.youtube.com/watch?v=5cWB52I-SF0
Processing video: https://www.youtube.com/watch?v=r4bH66vYjss
Processing video: https://www.youtube.com/watch?v=hWhs2cI

Processing YouTube playlists: 100%|██████████| 8/8 [29:01<00:00, 217.71s/it]


Processing video: Zach Star - Dear Linear Algebra Students
Processing video: Zach Star - The Applications of Matrices
Processing video: Zach Star - Eigenvectors and Eigenvalues
Processing video: Zach Star - Linear Algebra Tool
Processing video: Zach Star - Dot Product Applications


In [78]:
print(f"Loaded {len(all_yt_docs)} YouTube documents")
print(f"Example yt document: {all_yt_docs[0]}")
print(f"Errors loading YouTube documents: {len(error_loading_list)}")
if len(error_loading_list) > 0:
    print(f"Did not load the following: {error_loading_list}")


Loaded 442 YouTube documents
Example yt document: page_content='one of the things I love most about linear algebra is that you can do linear algebra in two different ways two ways that are like different sides of the same coin there's this algebraic world a world where you're doing all sorts of adding and subtracting and multiplying other algebraic operations there's also this geometric world a world where you're visualizing lines transforming to other lines and so forth and then what that magic of linear algebra is that these two different worlds they algebraic and the geometric they really merge together into one coherent picture that is both beautiful and interesting and powerful and I'm going to show you just a little bit of that in this video let's consider a function like f of X equal to x squared what's really going on there well I think of functions is sort of a transformation there's some inputs and the inputs are transformed into being some outputs and in the case of somethin

In [79]:
from langchain.document_loaders import WebBaseLoader
# Iterate through blog URLs
# Function to load blog content asynchronously
def load_blog_content(blog):
    print(f"Processing blog: {blog['name']}")
    blog_url = blog['url']
    try:
        loader = WebBaseLoader(blog_url)
        blog_docs = []
        for doc in loader.load():
            blog_docs.append(doc)
        return blog_docs
    except Exception as e:
        print(f"Error loading blog: {blog['name']} - {e}")
        return []

# Function to load all blogs synchronously
def load_all_blogs(blogs):
    all_blog_docs = []
    for blog in blogs:
        blog_docs = load_blog_content(blog)
        all_blog_docs.extend(blog_docs)
    return all_blog_docs

# Run the synchronous function to load all blogs
all_blog_docs = load_all_blogs(blogs)

# Print the results
print(f"Loaded {len(all_blog_docs)} documents from blogs.")

Processing blog: 3blue1brown Blog
Processing blog: Gregory Gundersen - Matrices as Functions
Processing blog: Gregory Gundersen - Positive Definite Matrices
Processing blog: Gregory Gundersen - Linear Independence
Processing blog: Gregory Gundersen - Matrix Inversion
Processing blog: Gregory Gundersen - Outer Products
Processing blog: Gregory Gundersen - Summing Quadratics
Processing blog: Gregory Gundersen - Randomized SVD
Processing blog: Gregory Gundersen - SVD Proof
Processing blog: Gregory Gundersen - Geometrical Understanding of Matrices
Processing blog: Gregory Gundersen - Dot Product
Loaded 11 documents from blogs.


In [80]:
import pickle

# Define the path to the pickle file
pickle_file_path = 'data/storage/full_all_documents.pkl'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(pickle_file_path), exist_ok=True)
#save all documents lists (all_pdf_docs + all_yt_docs + all_blog_docs) to pickle
with open(pickle_file_path, 'wb') as f:
    pickle.dump([all_pdf_docs, all_yt_docs, all_blog_docs], f)

In [81]:
#merge all docs lists into one
all_docs = all_pdf_docs + all_yt_docs + all_blog_docs

In [82]:
for doc in all_yt_docs:
    if not isinstance(doc, Document):
        print(type(doc))
        print(doc)

assert all(isinstance(doc, Document) for doc in all_yt_docs), "All items in all_yt_docs should be instances of Document"

In [83]:
import sys

print(f"Number of documents: {len(all_docs)}")
print(f"Size of all_docs in bytes: {sys.getsizeof(all_docs)}")

# Function to print metadata and first 100 characters of content
def print_document_info(doc, doc_type):
    metadata = doc.metadata
    content_preview = doc.page_content[:200]
    print(f"{doc_type} Metadata: {metadata}")
    print(f"{doc_type} Content Preview: {content_preview}\n")

# Examine a document from each loader
print_document_info(all_pdf_docs[2], "PDF")
print_document_info(all_yt_docs[0], "YouTube")
print_document_info(all_blog_docs[0], "Blog")

#print(all_blog_docs[0].page_content)


Number of documents: 2501
Size of all_docs in bytes: 20064
PDF Metadata: {'source': 'C:\\Users\\jonathan.kasprisin\\github\\Learning\\KG_ilp\\data\\pdfs\\Gilbert_Strang_Linear_Algebra_and_Its_Applicatio_230928_225121.pdf', 'page': 2}
PDF Content Preview: Contents
Preface iv
1 Matrices and Gaussian Elimination 1
1.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1
1.2 The Geometry of Linear Equations . . . . . . . . . . . 

YouTube Metadata: {'source': 'ZKUqtErZCiU', 'title': "What's the big idea of Linear Algebra?    **Course Intro**", 'description': 'This is the start of a one semester university level course on Linear Algebra that emphasizes both conceptual understanding as well as procedural fluency with the techniques of Linear Algebra. In this video, we get to see just the beginning of some of the big ideas of Linear Algebra. \n\nFULL PLAYLIST: https://www.youtube.com/playlist?list=PLHXZ9OQGMqxfUl0tcqPNTJsb7R6BqSLo6\n\nLinear Algebra is a story told i

In [84]:
##some_splits

In [85]:
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain_text_splitters import RecursiveCharacterTextSplitter


#filter complex meta data like lists from yt videos
simple_metadata_docs =  filter_complex_metadata(all_docs)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True #track index in orginal document
)

print("splitting documents...")
all_splits = text_splitter.split_documents(all_docs)

print(f"split all docs: {len(all_docs)} into subdocuments: {len(all_splits)}")

#prevent issues in metadata
def clean_metadata(metadata):
    """
    Replace None values in metadata with a default value.
    """
    
    for key, value in metadata.items():
        if value is None:
            metadata[key] = ""  # Replace None with an empty string or a default value
    return metadata

print("cleaning metadata...")
for split in all_splits:
    split.metadata= clean_metadata(split.metadata)


splitting documents...
split all docs: 2501 into subdocuments: 11564
cleaning metadata...


## Embed and store

### Embed on CPU

In [86]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


model_name = "BAAI/bge-small-en-v1.5" #BAAI/bge-en-icl
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embd = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

  embd = HuggingFaceBgeEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Vectorstore 

In [87]:
from langchain_chroma import Chroma

vector_store = Chroma(
    embedding_function=embd,
    persist_directory="C:/Users/jonathan.kasprisin/github/Learning/KG_ilp/data/storage/chroma_db_bde_small",
    collection_name="full_vstore_bge_small",)

In [88]:
document_ids = []
for doc in tqdm(all_splits, desc="Adding documents to vector store", unit="doc"):
    doc_id = vector_store.add_documents(documents=[doc])
    document_ids.extend(doc_id)

print(f"Added {len(document_ids)} documents to the vector store. Example document ID: {document_ids[0]}")

#save db
vector_store._persist_directory

Adding documents to vector store: 100%|██████████| 11564/11564 [15:12<00:00, 12.68doc/s]

Added 11564 documents to the vector store. Example document ID: 10ed5211-1763-402c-b5a6-3c3370ac2f58





'C:/Users/jonathan.kasprisin/github/Learning/KG_ilp/data/storage/chroma_db_bde_small'