# Main notebook

In [1]:
from src.vectorstore import create_collection
from src.vectorstore import jsonize_document
from tqdm import tqdm
import re 

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader

create_collection()

  from .autonotebook import tqdm as notebook_tqdm


Creating collection...
Collection created successfully.


In [2]:
def reformat_text(s):
    return re.sub(r"(?<!\.)\n", " ", s)

In [3]:
text_loader = TextLoader("/home/damir/Projects/huberman_rag/docs/youtube/Using Caffeine to Optimize Mental & Physical Performance ｜ Huberman Lab Podcast 101.txt")
doc = text_loader.load()

In [4]:
doc

[Document(page_content="Welcome to the Huberman Lab Podcast,\nwhere we discuss science\nand science-based tools for everyday life.\nI'm Andrew Huberman,\nand I'm a professor of neurobiology and ophthalmology\nat Stanford School of Medicine.\nToday, we are discussing caffeine.\nCaffeine is one of the most widely used substances\non the planet.\nEstimates are that more than 90% of adults\nand as many as 50% of kids, that is adolescents and teenagers,\nuse caffeine on a daily basis.\nCaffeine is an amazing molecule.\nMost people are familiar with caffeine's ability\nto increase alertness\nand to reduce our feelings of sleepiness and fatigue.\nAnd indeed, it does that.\nBut what most people are not aware of\nis that caffeine acts as a strong reinforcer.\nWhat I mean by reinforcer\nis that when caffeine is present in a drink or food,\nand yes, indeed, caffeine is present in many foods,\neven unbeknownst to us,\nwhen it's present in drinks and foods,\nwe actively come to like those foods and

In [5]:
doc[0].page_content = reformat_text(doc[0].page_content)

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=100,
    length_function=len,
)

In [7]:
docs = text_splitter.split_documents(doc)

In [8]:
docs

[Document(page_content="Welcome to the Huberman Lab Podcast, where we discuss science and science-based tools for everyday life.\nI'm Andrew Huberman, and I'm a professor of neurobiology and ophthalmology at Stanford School of Medicine.\nToday, we are discussing caffeine.\nCaffeine is one of the most widely used substances on the planet.\nEstimates are that more than 90% of adults and as many as 50% of kids, that is adolescents and teenagers, use caffeine on a daily basis.\nCaffeine is an amazing molecule.\nMost people are familiar with caffeine's ability to increase alertness and to reduce our feelings of sleepiness and fatigue.\nAnd indeed, it does that.\nBut what most people are not aware of is that caffeine acts as a strong reinforcer.\nWhat I mean by reinforcer is that when caffeine is present in a drink or food, and yes, indeed, caffeine is present in many foods, even unbeknownst to us, when it's present in drinks and foods, we actively come to like those foods and drinks more th

In [None]:
jsons = []

for d in tqdm(docs, desc="Processing texts"):
    json = jsonize_document(d)
    jsons.append(json)

In [None]:
jsons

In [None]:
import json
with open('jsons.json', 'w') as file:
    json.dump(jsons, file, indent=4)

In [10]:
import pandas as pd

df = pd.read_json("/home/damir/Projects/huberman_rag/jsons.json")
df['pk'] = df.index
df.to_csv("dataframe.csv", index=False)

In [11]:
df

Unnamed: 0,page_content,splade_embeddings,bge_embeddings,pk
0,"Welcome to the Huberman Lab Podcast, where we ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.011890139430761, 0.002748043276369, 0.02512...",0
1,"Now, that said, there are certain situations i...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.012476588599383002, 0.01234961207956, 0.001...",1
2,Turns out that GLP-1 acts on certain receptors...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.026759952306747003, 0.043216563761234006, -...",2
3,"There are also nowadays drugs, which are calle...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.042025502771139006, 0.024691145867109, -0.0...",3
4,"Now, thermogenesis is the active utilization o...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.015365839004516001, 0.030147943645715002, -...",4
...,...,...,...,...
75,And beautiful studies have been done that desc...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0023800958879290003, 0.031530488282442, 0....",75
76,"For instance, I wonder why we are not pairing ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-0.0013232599012550001, -0.01642270386219, 0....",76
77,"I actually know somebody, I won't reveal who t...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.014871522784233001, -0.015996014699339003, ...",77
78,It's reducing fatigue.\nIt's improving mental ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.032302118837833, -0.014466208405792002, -0....",78
