# Veritasium - Batch Vectorization

## Downloading & Importing Libraries

In [1]:
!pip install transformers langchain langchain_community langchain_openai pinecone-client python-dotenv

Collecting langchain
  Downloading langchain-0.2.7-py3-none-any.whl (983 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.6/983.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_community
  Downloading langchain_community-0.2.7-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain_openai
  Downloading langchain_openai-0.1.16-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinecone-client
  Downloading pinecone_client-4.1.2-py3-none-any.whl (216 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m216.4/216.4 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting langchain-core<0.3.0,>=0.2.12 (from langch

In [2]:
import os
from google.colab import files
from google.colab import userdata
from google.colab import runtime
# from dotenv import load_dotenv

import json
import re
from transformers import pipeline
import torch
from tqdm import tqdm
from typing import List, Dict, Any
import time

from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_openai import OpenAI, ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.docstore.document import Document
from transformers import BartTokenizer, BartForConditionalGeneration
from langchain.prompts import PromptTemplate


In [3]:
OPENAI_API_KEY = userdata.get('Ironhack-GPT')
PC_API_KEY = userdata.get('PineCone')
HF_TOKEN = userdata.get('HF')


os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['HF_TOKEN'] = HF_TOKEN

In [4]:
## Loading Data

# Load the JSON file
with open('/content/2-transcribed_videos_metadata.json', 'r') as f:
    transcribed_videos = json.load(f)

transcribed_videos_original = transcribed_videos.copy() ## keeping for reference

## Re-categorization

In [5]:
# Flatten the video list from all categories
videos = [video for category in transcribed_videos.values() for video in category]

# Inspect the first video
first_video = videos[0]
print(json.dumps(first_video, indent=4))

{
    "videoId": "scliyWrN7mk",
    "title": "how bikes *actually* work",
    "description": "why are bicycles stable? the most common answer is gyroscopic effects, but this is not right.",
    "published_at": "2024-04-17T16:20:50Z",
    "url": "https://www.youtube.com/watch?v=scliyWrN7mk",
    "transcription": "How do bikes without riders stay upright? As long as a bike is moving with sufficient speed, it can keep coasting indefinitely. But it turned out the ground where we went to test this effect was really bumpy. But the bike still manages to absorb all these perturbations.\n and remain stable. So how does it do this? I think most people believe it's the wheels spinning that creates some sort of gyroscopic effect that resists falling over. Just like in this demonstration of gyroscopic precession, the wheels\n stays upright even though gravity is pulling it down. But this is not why bikes are stable. Just watch what happens when we lock the handlebars completely, so you can only go 

In [3]:
# Function to check if a transcription is valid
def is_valid_transcription(transcription):
    return isinstance(transcription, str) and len(transcription) > 50  # valid transcription has more than 50 characters

def remove_repeated_phrases(text):
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove repeated phrases
    pattern = r'\b(\w+\s+){1,5}(\1)+\b'
    while re.search(pattern, text):
        text = re.sub(pattern, r'\1', text)

    return text

# Function to clean a single transcription
def clean_transcription(transcription):
    # Remove tokens between \n if the number of tokens is less than 3
    def clean_segment(segment):
        tokens = segment.split()
        return segment if len(tokens) >= 3 else ''

    # Split by \n, clean segments, and join back together
    segments = transcription.split('\n')
    cleaned_segments = [clean_segment(segment) for segment in segments]
    cleaned_transcription = ' '.join(cleaned_segments).strip()

    # Remove all remaining \n and repetitive words/phrases
    cleaned_transcription = cleaned_transcription.replace('\n', ' ')
    cleaned_transcription = re.sub(r'\s+', ' ', cleaned_transcription).strip()
    cleaned_transcription = remove_repeated_phrases(cleaned_transcription)

    return cleaned_transcription

# Function to process transcribed_videos and return cleaned version
def clean_transcribed_videos(transcribed_videos):
    for category in transcribed_videos.keys():
        for video in transcribed_videos[category]:
            if 'transcription' in video:
                video['transcription'] = clean_transcription(video['transcription'])
    return transcribed_videos

# Load the transcribed videos metadata from a JSON file
with open('3-transcribed_videos_metadata.json', 'r') as f:
    transcribed_videos_metadata = json.load(f)

# Function to find a video by videoId
def find_video_by_id(transcribed_videos, video_id):
    for category, videos in transcribed_videos.items():
        for video in videos:
            if video['videoId'] == video_id:
                return video
    return None

# ###  TESTING
# # Lookup the video by its ID
# video_id = "oVfHeWTKjag"  # Replace with the desired video ID
# sample_video = find_video_by_id(transcribed_videos_metadata, video_id)

# if sample_video:
#     # Clean the transcription of the sample video
#     sample_video_cleaned = clean_transcription(sample_video['transcription'])

#     # Print the cleaned transcription
#     print(f"Original Transcription:\n{sample_video['transcription']}\n")
#     print(f"Cleaned Transcription:\n{sample_video_cleaned}")
# else:
#     print(f"Video with ID {video_id} not found.")
# Clean the entire dataset

transcribed_videos = clean_transcribed_videos(transcribed_videos)

In [8]:
# Predefined categories
predefined_categories = [
    "Physics",
    "Mathematics",
    "Engineering",
    "Biology",
    "Chemistry",
    "Space",
    "Technology",
    "Geoscience",
    "General Science",
    "Miscellaneous Educational Content"
]

# Few-shot examples based on transcription
few_shot_examples = [
    {
        "title": "The Wonders of Quantum Physics",
        "transcription": "Quantum physics is a fundamental theory in physics that describes nature at the smallest scales of energy levels of atoms and subatomic particles.",
        "category": "Physics"
    },
    {
        "title": "DIY Experiment",
        "transcription": "This experiment demonstrates chemical reactions that can be easily done at home using common household items. Watch how baking soda reacts with vinegar.",
        "category": "Chemistry"
    },
    {
        "title": "The Future of AI",
        "transcription": "Artificial Intelligence is rapidly advancing. This video discusses the latest developments in AI technology and its potential future applications.",
        "category": "Technology"
    }
]

# Constructing the prompt with few-shot examples
few_shot_text = "\n".join(
    f"Title: {example['title']}\nTranscription: {example['transcription']}\nCategory: {example['category']}"
    for example in few_shot_examples
)

dynamic_prompt_template = f"""
You are an AI that categorizes YouTube videos based on their transcriptions.
You analyze the transcriptions thoroughly and conclude what theme is the video about.
You are very accurate in making this categorization and avoid categorizing in 'Miscellaneous Educational Content' unless absolutely necessary.
Choose the most appropriate category from the following list:
{', '.join(predefined_categories)}

Here are some examples:
{few_shot_text}

Now, categorize the following video:

Title: {{title}}
Transcription: {{transcription}}

Respond with only the most appropriate category for this video, without any additional text.
"""

# Create a LangChain prompt
dynamic_prompt = PromptTemplate(template=dynamic_prompt_template, input_variables=["title", "transcription"])

# Initialize the OpenAI Chat model with a specific name
dynamic_llm = ChatOpenAI(model="gpt-3.5-turbo", openai_api_key=OPENAI_API_KEY)

# Clean the transcriptions
transcribed_videos_metadata = clean_transcribed_videos(transcribed_videos_metadata)

# Filter out videos without valid transcriptions
filtered_videos = {}
for category in transcribed_videos_metadata.keys():
    valid_videos = [video for video in transcribed_videos_metadata[category] if 'transcription' in video and is_valid_transcription(video['transcription'])]
    if len(valid_videos) > 0:
        filtered_videos[category] = valid_videos

# Flatten the video list from all categories
videos = [video for category in filtered_videos.values() for video in category]

def dynamic_categorize_videos(videos):
    categories = {}

    for video in videos:
        title = video['title']
        transcription = video.get('transcription', '')

        if not transcription:
            category = "Miscellaneous Educational Content"
        else:
            prompt = dynamic_prompt.format(title=title, transcription=transcription)
            response = dynamic_llm.generate([prompt])
            category = response.generations[0][0].text.strip()  # Access the text and strip it

        # Ensuring only valid categories are considered
        if category in predefined_categories:
            if category not in categories:
                categories[category] = []
            categories[category].append(video)
        else:
            # Handle unexpected categories
            if "Miscellaneous Educational Content" not in categories:
                categories["Miscellaneous Educational Content"] = []
            categories["Miscellaneous Educational Content"].append(video)

    return categories

# Categorize the videos
categorized_videos = dynamic_categorize_videos(videos)

# Print categorized videos
for category, vids in categorized_videos.items():
    print(f"\nCategory: {category}")
    for vid in vids:
        print(f" - {vid['title']}")



Category: Physics
 - how bikes *actually* work
 - falling ladders - why does this happen?
 - world&#39;s roundest object
 - microwaving grapes makes plasma
 - this phone trick is impossible
 - i call this the &#39;no, you don&#39;t&#39; law
 - does pressure melt ice?
 - world&#39;s strongest magnet!
 - how a slinky falls in slow motion #shorts
 - atomic theory
 - backspin basketball flies off dam
 - how does a boomerang work?
 - can you solve this shadow illusion?
 - anti-gravity wheel explained
 - ice spikes explained
 - misconceptions about temperature
 - heisenberg&#39;s uncertainty principle explained
 - can you go the speed of light?
 - empty space is not empty
 - how special relativity makes magnets work
 - 4 revolutionary riddles
 - 5 fun physics phenomena
 - what is a force?
 - explained: 5 fun physics phenomena
 - option b - acceleration of a bungy jump
 - how does a sailboat actually work?
 - single photon interference
 - will this go faster than light?
 - misconceptions abo

## Creating Document for Pinecone

### Chunking

In [9]:
# NEW TEST
import json
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm

# Function to create a summary for each chunk using BART
def create_summary(text):
    inputs = tokenizer(text, max_length=1024, return_tensors='pt', truncation=True).to('cuda')
    summary_ids = model.generate(inputs['input_ids'], max_length=250, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Custom function to split text into chunks with overlap
def custom_text_splitter(text, chunk_size=3000, chunk_overlap=150):
    chunks = []
    start = 0
    end = chunk_size

    while start < len(text):
        chunk = text[start:end]
        chunks.append(chunk)
        start = end - chunk_overlap
        end = start + chunk_size

    return chunks

# Function to chunk transcriptions and add context using the custom splitter
def chunk_transcriptions(videos, category, chunk_size=3000, chunk_overlap=150):
    chunks = []
    for video in tqdm(videos, desc="Processing Videos"):
        if 'transcription' in video:
            video_id = video["videoId"]
            transcription = video["transcription"]
            title = video["title"]
            description = video["description"]
            url = video["url"]
            published_at = video["published_at"]

            # Create document object with added context
            document = {
                "page_content": transcription,
                "metadata": {
                    "video_id": video_id,
                    "title": title,
                    "description": description,
                    "url": url,
                    "category": category,
                    "published_at": published_at
                }
            }

            # Use the custom text splitter to split the document into chunks
            chunked_texts = custom_text_splitter(document["page_content"], chunk_size, chunk_overlap)

            for i, chunk_content in enumerate(chunked_texts):
                summary = create_summary(chunk_content)

                # Add metadata to the chunk
                chunk = {
                    "page_content": chunk_content,
                    "metadata": {
                        "video_id": video_id,
                        "title": title,
                        "description": description,
                        "url": url,
                        "category": category,
                        "summary": summary,
                        "chunk_id": f"{video_id}_{i}",
                        "published_at": published_at
                    }
                }

                chunks.append(chunk)

    return chunks

# Function to process transcribed_videos in batches to avoid memory issues
def process_in_batches(filtered_videos, batch_size=20):
    all_chunks = []
    categories = list(filtered_videos.keys())

    for category in tqdm(categories, desc="Processing Categories"):
        videos = filtered_videos[category]

        # Split videos into batches
        for i in tqdm(range(0, len(videos), batch_size), desc="Processing Batches", leave=False):
            batch = videos[i:i+batch_size]

            # Chunk transcriptions and add context
            chunks = chunk_transcriptions(batch, category)

            # Accumulate chunks
            all_chunks.extend(chunks)

    return all_chunks

# Initialize the tokenizer and model for summarization
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to('cuda')

# ### Testing on a small category
# Process only the "Chemistry" category videos in batches to avoid memory issues
# chemistry_videos = categorized_videos.get("Chemistry", [])
# chunks = process_in_batches({"Chemistry": chemistry_videos}, batch_size=20)

chunks = process_in_batches(categorized_videos, batch_size=20)

# Output the number of chunks processed
print(f"Number of chunks processed: {len(chunks)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Processing Categories:   0%|          | 0/10 [00:00<?, ?it/s]
Processing Batches:   0%|          | 0/7 [00:00<?, ?it/s][A

Processing Videos:   0%|          | 0/20 [00:00<?, ?it/s][A[A

Processing Videos:   5%|▌         | 1/20 [00:02<00:53,  2.79s/it][A[A

Processing Videos:  10%|█         | 2/20 [00:03<00:28,  1.56s/it][A[A

Processing Videos:  15%|█▌        | 3/20 [00:04<00:20,  1.19s/it][A[A

Processing Videos:  20%|██        | 4/20 [00:04<00:16,  1.01s/it][A[A

Processing Videos:  25%|██▌       | 5/20 [00:05<00:14,  1.07it/s][A[A

Processing Videos:  30%|███       | 6/20 [00:06<00:12,  1.14it/s][A[A

Processing Videos:  35%|███▌      | 7/20 [00:07<00:13,  1.04s/it][A[A

Processing Videos:  40%|████      | 8/20 [00:08<00:11,  1.03it/s][A[A

Processing Videos:  45%|████▌     | 9/20 [00:09<00:10,  1.07it/s][A[A

Processing Videos:  50%|█████     | 10/20 [00:10<00:09,  1.10it/s][A[A

Processing Videos:  55%|█████▌    | 11/20 [00:11<00:08,  1.05it/s][A[A

Proces

Number of chunks processed: 1141





In [10]:
# Inspect the results
for chunk in chunks[:20]:  # Display some chunks for inspection
    print(f"Chunk ID: {chunk['metadata']['chunk_id']}")
    print(f"Video ID: {chunk['metadata']['video_id']}")
    print(f"Title: {chunk['metadata']['title']}")
    print(f"Description: {chunk['metadata']['description']}")
    print(f"Category: {chunk['metadata']['category']}")
    print(f"Published At: {chunk['metadata']['published_at']}")
    print(f"Summary: {chunk['metadata']['summary']}")
    print(f"Chunk Content: {chunk['page_content'][:500]}...")  # Display the first 500 characters of the chunk
    print("\n" + "="*80 + "\n")


Chunk ID: scliyWrN7mk_0
Video ID: scliyWrN7mk
Title: how bikes *actually* work
Description: why are bicycles stable? the most common answer is gyroscopic effects, but this is not right.
Category: Physics
Published At: 2024-04-17T16:20:50Z
Summary: Bikes are stable without riders because they're cleverly designed to steer themselves. If they start falling to one side, the handlebars turn in that direction to steer the wheels back underneath them. Just watch what happens when we lock the handle bars completely, so you can only go straight ahead.
Chunk Content: How do bikes without riders stay upright? As long as a bike is moving with sufficient speed, it can keep coasting indefinitely. But it turned out the ground where we went to test this effect was really bumpy. But the bike still manages to absorb all these perturbations. and remain stable. So how does it do this? I think most people believe it's the wheels spinning that creates some sort of gyroscopic effect that resists falling ove

In [11]:
import gc
gc.collect()

176

### Final Data Cleaning Followed by Embedding

In [12]:
def clean_text(text):
    if not text:
        return ""
    # Remove HTML entities
    text = re.sub(r'&[#A-Za-z0-9]+;', ' ', text)
    # Remove any other non-alphanumeric characters except spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def chunks_to_json(chunks, embeddings_model, filename="3-chunked_transcriptions.json", batch_size=20):
    all_embeddings = []

    for i in tqdm(range(0, len(chunks), batch_size), desc="Processing Chunks in Batches"):
        batch = chunks[i:i + batch_size]
        embeddings = []

        for chunk in batch:
            # Clean the transcription, title, and description
            chunk['page_content'] = clean_text(chunk['page_content'])
            chunk['metadata']['title'] = clean_text(chunk['metadata']['title'])
            chunk['metadata']['description'] = clean_text(chunk['metadata']['description'])

            # Compute embeddings for each chunk
            embedding = embeddings_model.embed_query(chunk['page_content'])
            embeddings.append({
                "metadata": chunk['metadata'],
                "embedding": embedding,
                "content": chunk['page_content']
            })

        all_embeddings.extend(embeddings)

    # Save all embeddings and metadata to a JSON file at once
    with open(filename, 'w') as f:
        json.dump(all_embeddings, f, indent=4)

# Initialize embeddings
embeddings_model = OpenAIEmbeddings(api_key=OPENAI_API_KEY, model='text-embedding-ada-002')

# Save chunks to a JSON file
chunks_to_json(chunks, embeddings_model)

Processing Chunks in Batches: 100%|██████████| 58/58 [04:40<00:00,  4.83s/it]


In [13]:
files.download('/content/3-chunked_transcriptions.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Check and remove duplicate chunks_ids (updated videos by the YT channel)

In [14]:
import gc
gc.collect()

with open('/content/3-chunked_transcriptions.json', 'r') as f:
    chunked_transcriptions = json.load(f)

In [15]:
len(chunked_transcriptions)

1141

In [16]:
# Extract chunk IDs and check for duplicates
chunk_ids = [chunk['metadata']['chunk_id'] for chunk in chunked_transcriptions]
unique_chunk_ids = set(chunk_ids)
duplicate_chunk_ids = [chunk_id for chunk_id in unique_chunk_ids if chunk_ids.count(chunk_id) > 1]

# Number of unique chunk IDs and any duplicates
num_unique_chunk_ids = len(unique_chunk_ids)
num_total_chunk_ids = len(chunk_ids)
num_duplicates = len(duplicate_chunk_ids)

print(f"Total chunk IDs: {num_total_chunk_ids}")
print(f"Unique chunk IDs: {num_unique_chunk_ids}")
print(f"Number of duplicate chunk IDs: {num_duplicates}")
print(f"Duplicate chunk IDs: {duplicate_chunk_ids}")

Total chunk IDs: 1141
Unique chunk IDs: 1066
Number of duplicate chunk IDs: 75
Duplicate chunk IDs: ['AaZ_RSt0KP8_0', 'Q51-gLL_MRM_3', 'Q51-gLL_MRM_0', 'uxPdPpi5W4o_2', 'R13BD8qKeTg_0', 'pir_muTzYM8_0', 'S2xHZPH5Sng_1', 'uxPdPpi5W4o_3', 'C_covjcIcZ4_0', 'R13BD8qKeTg_2', 'R13BD8qKeTg_3', '0dqX3NjwaQs_0', 'S2xHZPH5Sng_5', 'dvk2PQNcg8w_5', 'wQmnztyXwVA_0', '0dqX3NjwaQs_3', '3LopI4YeC4I_0', '3LopI4YeC4I_1', 'AaZ_RSt0KP8_3', 'S2xHZPH5Sng_6', 'S1tFT4smd6E_2', 'S1tFT4smd6E_1', 'Ux33-5k8cjg_0', 'dvk2PQNcg8w_0', 'Q51-gLL_MRM_2', 'AaZ_RSt0KP8_2', 'u3FB2SuKFfI_1', 'R13BD8qKeTg_1', 'dvk2PQNcg8w_4', '3LopI4YeC4I_4', 'sehKAccM8p0_0', 'AaZ_RSt0KP8_1', 'AaZ_RSt0KP8_6', '16Ci_2bN_zc_2', 'AaZ_RSt0KP8_7', '16Ci_2bN_zc_1', 'UMnQWn1rkAA_0', 'AaZ_RSt0KP8_4', 'pir_muTzYM8_1', '3LopI4YeC4I_3', 'Ux33-5k8cjg_1', '16Ci_2bN_zc_6', 'S1tFT4smd6E_3', 'S2xHZPH5Sng_2', 'dvk2PQNcg8w_3', '0dqX3NjwaQs_1', 'wQmnztyXwVA_1', 'AaZ_RSt0KP8_5', 'S2xHZPH5Sng_3', 'S1tFT4smd6E_0', 'CRSDxGtcsxE_0', 'pir_muTzYM8_4', 'uxPdPpi5W4o_0'

In [17]:
# Cleaning Duplicates
# Create a dictionary to store the latest version of each chunk based on chunk_id
latest_chunks = {}

for chunk in chunked_transcriptions:
    chunk_id = chunk['metadata']['chunk_id']
    published_at = chunk['metadata']['published_at']

    # If the chunk_id is already in the dictionary, compare the published_at dates
    if chunk_id in latest_chunks:
        existing_published_at = latest_chunks[chunk_id]['metadata']['published_at']
        if published_at > existing_published_at:
            latest_chunks[chunk_id] = chunk
    else:
        latest_chunks[chunk_id] = chunk

# Convert the dictionary back to a list and overwrite chunked_transcriptions
chunked_transcriptions = list(latest_chunks.values())

# Number of unique chunk IDs and any duplicates
num_unique_chunk_ids = len(set([chunk['metadata']['chunk_id'] for chunk in chunked_transcriptions]))
num_total_chunk_ids = len(chunked_transcriptions)

print(f"Total chunk IDs: {num_total_chunk_ids}")
print(f"Unique chunk IDs: {num_unique_chunk_ids}")


Total chunk IDs: 1066
Unique chunk IDs: 1066


## Pincone Upsert

In [18]:
# Initialize Pinecone
pc = Pinecone(api_key=PC_API_KEY)

cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "veritasium-vs-final"

In [19]:
# Check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # If does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # Dimension for OpenAI embeddings
        metric="cosine",
        spec=spec
    )
    # Wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to index
index = pc.Index(index_name)
# View index stats
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [20]:
# Function to simplify metadata
def simplify_metadata(metadata: Dict[str, Any], transcription: str) -> Dict[str, Any]:
    simplified_metadata = {}
    for key, value in metadata.items():
        if isinstance(value, (str, int, float, bool)):
            simplified_metadata[key] = value
        elif isinstance(value, list) and all(isinstance(item, str) for item in value):
            simplified_metadata[key] = value
        else:
            simplified_metadata[key] = str(value)
    simplified_metadata['transcription'] = transcription  # Add the transcription to the metadata
    return simplified_metadata

# Function to upload transcriptions to Pinecone
def upload_transcriptions(transcriptions: List[Dict[str, Any]], batch_size: int = 64):
    vector_ids = []
    seen_ids = set()  # To check for duplicate IDs

    for i in tqdm(range(0, len(transcriptions), batch_size)):
        i_end = min(i + batch_size, len(transcriptions))
        batch = transcriptions[i:i_end]
        to_upsert = []

        for chunk in batch:
            chunk_id = chunk['metadata']['chunk_id']

            # Check for duplicate IDs in input data
            if chunk_id in seen_ids:
                print(f"Duplicate chunk ID found in input data: {chunk_id}")
                continue

            seen_ids.add(chunk_id)

            # Simplify metadata
            simplified_metadata = simplify_metadata(chunk['metadata'], chunk['content'])
            vector_ids.append(chunk_id)
            to_upsert.append({
                'id': chunk_id,
                'values': chunk['embedding'],
                'metadata': simplified_metadata
            })

        # Upsert/insert these records to Pinecone
        response = index.upsert(vectors=to_upsert)
        # Print metadata to ensure upsert was successful
        print(f"Upserted batch {i} to {i_end}")

    return vector_ids

# Example usage
vector_ids = upload_transcriptions(chunked_transcriptions)

# Wait for the upserts to complete
import time
time.sleep(5)

# Fetch the first vector ID and check metadata
vector_id = vector_ids[0]  # Using the first ID from the upserted vectors
response = index.fetch(ids=[vector_id])

# Print the metadata of the fetched vector
if vector_id in response['vectors']:
    fetched_vector = response['vectors'][vector_id]
    print("Fetched vector metadata:", json.dumps({
        "id": fetched_vector['id'],
        "metadata": fetched_vector['metadata']
    }, indent=2))
else:
    print("Vector not found.")

  6%|▌         | 1/17 [00:01<00:19,  1.23s/it]

Upserted batch 0 to 64


 12%|█▏        | 2/17 [00:01<00:14,  1.05it/s]

Upserted batch 64 to 128


 18%|█▊        | 3/17 [00:02<00:12,  1.16it/s]

Upserted batch 128 to 192


 24%|██▎       | 4/17 [00:03<00:11,  1.11it/s]

Upserted batch 192 to 256


 29%|██▉       | 5/17 [00:04<00:09,  1.22it/s]

Upserted batch 256 to 320


 35%|███▌      | 6/17 [00:05<00:08,  1.26it/s]

Upserted batch 320 to 384


 41%|████      | 7/17 [00:05<00:07,  1.28it/s]

Upserted batch 384 to 448


 47%|████▋     | 8/17 [00:06<00:06,  1.36it/s]

Upserted batch 448 to 512


 53%|█████▎    | 9/17 [00:07<00:05,  1.42it/s]

Upserted batch 512 to 576


 59%|█████▉    | 10/17 [00:07<00:04,  1.44it/s]

Upserted batch 576 to 640


 65%|██████▍   | 11/17 [00:08<00:04,  1.46it/s]

Upserted batch 640 to 704


 71%|███████   | 12/17 [00:09<00:03,  1.48it/s]

Upserted batch 704 to 768


 76%|███████▋  | 13/17 [00:09<00:02,  1.47it/s]

Upserted batch 768 to 832


 82%|████████▏ | 14/17 [00:10<00:02,  1.47it/s]

Upserted batch 832 to 896


 88%|████████▊ | 15/17 [00:11<00:01,  1.50it/s]

Upserted batch 896 to 960


 94%|█████████▍| 16/17 [00:11<00:00,  1.48it/s]

Upserted batch 960 to 1024


100%|██████████| 17/17 [00:12<00:00,  1.38it/s]

Upserted batch 1024 to 1066





Fetched vector metadata: {
  "id": "scliyWrN7mk_0",
  "metadata": {
    "category": "Physics",
    "chunk_id": "scliyWrN7mk_0",
    "description": "why are bicycles stable the most common answer is gyroscopic effects but this is not right",
    "published_at": "2024-04-17T16:20:50Z",
    "summary": "Bikes are stable without riders because they're cleverly designed to steer themselves. If they start falling to one side, the handlebars turn in that direction to steer the wheels back underneath them. Just watch what happens when we lock the handle bars completely, so you can only go straight ahead.",
    "title": "how bikes actually work",
    "transcription": "How do bikes without riders stay upright As long as a bike is moving with sufficient speed it can keep coasting indefinitely But it turned out the ground where we went to test this effect was really bumpy But the bike still manages to absorb all these perturbations and remain stable So how does it do this I think most people believ