In [53]:
import pandas as pd
from openai import OpenAI
import re
import tqdm
import tiktoken

def clean_transcript(transcript):
    """Return `transcript` with every parenthesised span removed.

    Matching is non-greedy, so each span runs from an opening parenthesis to
    the nearest closing one. `.` does not match newlines here, so a
    parenthetical that crosses a line break is left in place. Whitespace
    around a removed span is not collapsed.
    """
    parenthetical = re.compile(r'\(.*?\)')
    return parenthetical.sub('', transcript)

def truncate_text_tokens(text, encoding_name='cl100k_base', max_tokens=8191):
    """Encode `text` and keep at most the first `max_tokens` tokens.

    The value returned is the (sliced) token sequence produced by the
    encoding named `encoding_name`, not a decoded string.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(text)
    return token_ids[:max_tokens]

def generate_embeddings(text):
    """Return the `text-embedding-3-large` embedding for `text`.

    `text` is passed unchanged as the `input` of the embeddings request; the
    embedding of the first item in the response is returned.

    The API key is no longer stored in the source: the client is constructed
    without an explicit key, so the OpenAI library picks it up from the
    `OPENAI_API_KEY` environment variable. One client is created on first use
    and reused afterwards instead of building a new one for every request.
    """
    client = getattr(generate_embeddings, "_client", None)
    if client is None:
        # No api_key argument: credentials come from the environment.
        client = OpenAI()
        generate_embeddings._client = client

    response = client.embeddings.create(input=text, model='text-embedding-3-large')
    return response.data[0].embedding

# Load the ASR metadata and strip parenthesised annotations from each transcript.
df = pd.read_csv("../metadata/asr.csv")
# Missing transcripts are read as NaN, which would make the regex substitution
# in clean_transcript raise; treat them as empty text instead.
df['transcript'] = df['transcript'].fillna("").apply(clean_transcript)

embeddings = []
for tr in tqdm.tqdm(df['transcript'], total=len(df)):
    # Nothing left to embed (empty, or only whitespace remaining once the
    # parentheticals were removed): record a missing value for this row.
    if not tr.strip():
        embeddings.append(pd.NA)
        continue

    try:
        emb = generate_embeddings(truncate_text_tokens(tr))
    except Exception:
        # Show the transcript that failed, then propagate the original error.
        print(tr)
        raise
    embeddings.append(emb)

# One entry per row, in row order; only the URL and its embedding are saved.
df['embeddings'] = embeddings
df[['url', 'embeddings']].to_csv("../metadata/embeddings_transcript_clean.csv")