## This notebook handles subtitle (transcript) files (.srt).
## It was used on abandoned data, so it is deprecated. See store_transcript_new.ipynb.

In [4]:
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone

# Name of the Pinecone index this notebook works against.
PINECONE_INDEX_NAME = "ta"

# Load variables from a local .env file into the environment before any client is built.
load_dotenv()

# No api_key argument is passed to either client.
# NOTE(review): presumably both fall back to OPENAI_API_KEY / PINECONE_API_KEY
# from the environment populated above — confirm.
client = OpenAI()
pc = Pinecone()

# Handle to the existing index.
index = pc.Index(PINECONE_INDEX_NAME)

Extract the text from the transcript files (.srt) and clean the content.

In [6]:
import re
import os

def clean_srt_content(file_path):
    """
    Reads an SRT file, removes the numeric cue identifiers and timestamp
    lines, and returns only the transcript text.

    A leading UTF-8 byte-order mark is discarded, so it never ends up in the
    returned text. Cue headers are recognised when the counter starts a line;
    trailing spaces/tabs on the counter or timestamp line and a '.' instead
    of ',' before the milliseconds are tolerated.

    :param file_path: Path to the SRT file
    :return: Cleaned transcript text, one subtitle line per line, without
             leading/trailing whitespace
    :raises OSError: If the file cannot be opened
    :raises UnicodeDecodeError: If the file is not valid UTF-8
    """
    # 'utf-8-sig' decodes plain UTF-8 as well, but additionally drops a BOM
    # if the file starts with one. Text mode already maps \r\n to \n.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()

    # Remove "<counter>\n<start> --> <end>\n" headers. The pattern is anchored
    # to the start of a line so digits inside subtitle text are not treated
    # as a cue counter.
    cue_header = (
        r'^\d+[ \t]*\n'
        r'\d{2}:\d{2}:\d{2}[,.]\d{3} --> \d{2}:\d{2}:\d{2}[,.]\d{3}[ \t]*\n'
    )
    cleaned_content = re.sub(cue_header, '', content, flags=re.MULTILINE)

    # Collapse the blank lines that separated the cues and trim the ends
    cleaned_content = re.sub(r'\n+', '\n', cleaned_content).strip()

    return cleaned_content

# Folder that holds the subtitle files to ingest.
folder_path = './subtitles'

# Sorted so the processing order does not depend on the filesystem.
files = sorted(os.listdir(folder_path))

# Maps each subtitle file name to its cleaned transcript text.
trans_texts = {}
for file in files:
    path = os.path.join(folder_path, file)
    # Skip sub-directories and hidden entries (e.g. .DS_Store): passing them
    # to clean_srt_content would raise instead of yielding a transcript.
    if file.startswith('.') or not os.path.isfile(path):
        continue
    trans_text = clean_srt_content(path)
    trans_texts[file] = trans_text

Split the text into chunks, embed each chunk, and finally store the resulting vectors in Pinecone.

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: chunk_size and chunk_overlap are measured with
# `len`, i.e. in characters; separators are treated as literal strings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# File name -> list of text chunks produced from that file's transcript.
trans_chunks = {
    name: text_splitter.split_text(text)
    for name, text in trans_texts.items()
}

In [10]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
# Embedding model used for every chunk, and the target Pinecone index.
embeddings = OpenAIEmbeddings()
index_name = "ta"

# Upload the chunks one file at a time.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# inserts the same chunks again under new ids — confirm before re-executing.
for file_name, trans_chunk in trans_chunks.items():
    # One metadata dict per chunk, each recording the file the chunk came from.
    chunk_metadatas = [{'file_name': file_name} for _ in trans_chunk]
    docsearch = PineconeVectorStore.from_texts(
        trans_chunk,
        embeddings,
        index_name=index_name,
        namespace='transcript_data',
        metadatas=chunk_metadatas,
    )