In [None]:
from src.youtube_loader import load_from_youtube
from src.bge_embeddigns import bge_embeddings
from src.splade_embeddings import embed_splade
from src.document_splitting import text_splitter
from src.vectorstore import jsonize_document
# Fetch the video at `video_url` through the project's YouTube loader.
# Alternative video used previously: https://www.youtube.com/watch?v=tkH2-_jMCSk
video_url = "https://www.youtube.com/watch?v=CQlTmOFM4Qs"
doc = load_from_youtube(video_url)

In [None]:
# Show the text content of the first loaded document.
doc[0].page_content

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded document(s) into chunks of at most 2000 characters with a
# 100-character overlap between neighbouring chunks (default separators).
# NOTE(review): this rebinds `text_splitter`, replacing the object of the same
# name imported from src.document_splitting at the top of the notebook.
splitter_options = {"chunk_size": 2000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
texts = text_splitter.split_documents(doc)



In [None]:
# Display the chunks held in `texts`.
texts

In [None]:
# Print the character count of every chunk, one per line.
for chunk in texts:
    chunk_length = len(chunk.page_content)
    print(chunk_length)

In [None]:
from tqdm import tqdm

# Five-chunk sample; not used by the conversion below, which processes every
# chunk in `texts`. Kept so that any cell relying on `txts` still works.
txts = texts[:5]

# Convert every chunk with jsonize_document, showing a progress bar.
# The per-chunk result is deliberately not bound to the name `json`: doing so
# would replace the standard-library `json` module in the kernel namespace and
# break any later `json.dump(...)` call that runs without a fresh `import json`.
jsons = [
    jsonize_document(chunk)
    for chunk in tqdm(texts, desc="Processing texts")
]

In [None]:
# Display the list of converted records.
jsons

In [None]:
# Wrap the record list under a single top-level "rows" key.
rows = dict(rows=jsons)

In [None]:
import json

# Write the record list to jsons.json (working directory), indented by 4 spaces.
with open('jsons.json', 'w') as jsons_file:
    json.dump(jsons, jsons_file, indent=4)

In [None]:
import json

# Write the {"rows": [...]} wrapper to rows.json (working directory),
# indented by 4 spaces.
with open('rows.json', 'w') as rows_file:
    json.dump(rows, rows_file, indent=4)

In [None]:
import pandas as pd

# Read back the jsons.json file written earlier in this notebook. The path is
# relative to the working directory (the same location the file was written
# to) rather than a machine-specific absolute path, so the notebook runs
# unchanged on other machines and never picks up a stale copy elsewhere.
# A `pk` column is added from the row index; presumably it supplies the
# integer primary key for the Milvus collection — confirm against the insert code.
df = pd.read_json("jsons.json").assign(pk=lambda frame: frame.index)

# Export without the index column, since `pk` already carries it.
df.to_csv("dataframe.csv", index=False)

In [None]:
# List the column names of the DataFrame.
df.columns

In [None]:
from pymilvus import (
    connections, 
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    MilvusClient
    )

# High-level client; used at the end of this cell to create the vector indexes.
client = MilvusClient(
    uri="http://localhost:19530"
)
# Specify Milvus server parameters
host = "127.0.0.1"  # Milvus server host
port = "19530"  # Milvus server port

# Connect to Milvus server (ORM-style connection; Collection(...) below is
# created over it).
connections.connect(host=host, port=port)

# Define field schemas.
# The fields are kept in one list instead of one module-level variable each:
# binding a FieldSchema to the name `bge_embeddings` would replace the
# embedding function of the same name imported at the top of the notebook.
fields = [
    # Primary key supplied by the caller (auto_id disabled).
    FieldSchema(name="pk", dtype=DataType.INT64,
                is_primary=True, auto_id=False),
    # Vector fields.
    FieldSchema(name="bge_embeddings", dtype=DataType.FLOAT_VECTOR, dim=1024),
    # NOTE(review): stored as a dense 30522-dim float vector; presumably one
    # slot per SPLADE vocabulary entry — confirm this is intended.
    FieldSchema(name="splade_embeddings", dtype=DataType.FLOAT_VECTOR, dim=30522),
    # Chunk text and scalar metadata.
    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="page_content", dtype=DataType.VARCHAR, max_length=3000),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="description", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="view_count", dtype=DataType.INT64),
    FieldSchema(name="thumbnail_url", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="publish_date", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="length", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=500),
]

# Define collection schema (the original call had a stray `y` after the field
# list, which was a syntax error).
schema = CollectionSchema(fields=fields)

# Create collection
collection_name = "huberman_rag"
collection = Collection(name=collection_name, schema=schema)

# Both vector fields get the same index settings: HNSW with inner-product
# metric, named "<field>_index".
# NOTE(review): M=2048 is very large for HNSW — confirm it is intentional.
index_params = MilvusClient.prepare_index_params()

for vector_field in ("bge_embeddings", "splade_embeddings"):
    index_params.add_index(
        field_name=vector_field,
        metric_type="IP",
        index_type="HNSW",
        index_name=f"{vector_field}_index",
        efConstruction=500,
        M=2048,
    )

client.create_index(
    collection_name=collection_name,
    index_params=index_params
)
