In [1]:
import sys
import os
import pathlib
from dotenv import load_dotenv, find_dotenv

import chromadb
from chromadb.utils import embedding_functions

from InstructorEmbedding import INSTRUCTOR
import tiktoken

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

os.environ["OPENAI_API_KEY"] = ""
sys.path.insert(0, os.path.abspath('../src/data/'))
load_dotenv(override=True)

  from tqdm.autonotebook import trange


True

### Steps to get timestamped podcast paragraph

In [6]:
# get collections from server
model_name = "hkunlp/instructor-large"
model = INSTRUCTOR(model_name)
ef = chromadb.utils.embedding_functions.InstructorEmbeddingFunction(model_name=model_name)
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)
print("Current collections in chromadb server: ", client.list_collections())
transcript_collection = client.get_collection(name="transcripts-2", embedding_function=ef)
timestamp_collection = client.get_collection(name="transcript-timestamps", embedding_function=ef)


load INSTRUCTOR_Transformer
max_seq_length  512
load INSTRUCTOR_Transformer
max_seq_length  512
Current collections in chromadb server:  [Collection(name=transcripts), Collection(name=transcript-timestamps), Collection(name=transcripts-2)]


In [32]:
# Query first the transcripts
query_texts=[["Represent the statement for retrieving podcast documents: ", "Looking for sports and football podcasts"]]
query_embedding = model.encode(query_texts).tolist()
results = transcript_collection.query(
    query_embeddings=query_embedding,
    n_results=3,
)
result_df = pd.json_normalize(results["metadatas"][0])
result_df["episode_id"] = results["ids"][0]
result_df

Unnamed: 0,episode_description,episode_name,show_description,show_name,episode_id
0,Welcome to the very first episode of the Up No...,A Taste of The Up North Trip Ep.1,This podcast is all about the latest in the sp...,The Up North Trip,4q9et9b1xeAJcN3FnUq1vN
1,"JR is back with Paul and Matt, as they discuss...",Reporting as Eligible - A Green Bay Packer Pod...,"Paul, Matt, and JR take you on a trip through ...",Reporting As Eligible,101SutZ3LDi46SBuyO606G
2,Paul and Matt discuss why Pete Carroll is a do...,Reporting as Eligible: A Green Bay Packer Podc...,"Paul, Matt, and JR take you on a trip through ...",Reporting As Eligible,6FwNdlNG5oFexMhlR3srkh


In [37]:
# Select episode and then use it in the query where clause
selected_episode = result_df.loc[0, "episode_id"]

query_texts=[["Represent the Podcast query for retrieving relevant paragraphs: ","NFL team Patriots"]]
query_embedding = model.encode(query_texts).tolist()
results2 = timestamp_collection.query(
    query_embeddings=query_embedding,
    n_results=5,
    where={"episode": selected_episode},
)
result2_df = pd.json_normalize(results2["metadatas"][0])
result2_df["paragraph"] = results2["documents"][0]
result2_df

Unnamed: 0,endTime,episode,speakerTag,startTime,paragraph
0,981.5,4q9et9b1xeAJcN3FnUq1vN,2,973.4,We're talking about the Patriots but in the N...
1,135.8,4q9et9b1xeAJcN3FnUq1vN,2,106.1,Predominantly more the NFL this is going to b...
2,53.5,4q9et9b1xeAJcN3FnUq1vN,2,30.1,Going to be about sports. I'm a big fan of the...
3,813.0,4q9et9b1xeAJcN3FnUq1vN,2,783.2,As Kawhi Leonard would say so hopefully this ...
4,972.3,4q9et9b1xeAJcN3FnUq1vN,2,963.2,Let's get this thing going Jerry Jones signs ...


        Some handy methods to get collections data without querying

In [25]:
transcript_collection.get(ids=["4q9et9b1xeAJcN3FnUq1vN"])

{'ids': ['4q9et9b1xeAJcN3FnUq1vN'],
 'embeddings': None,
 'metadatas': [{'episode_description': "Welcome to the very first episode of the Up North Trip. This podcast is where we talk about sports, we'll be focusing on the news for the NFL and NBA and I'll be telling you a little about myself and what this podcast is going to be all about. ",
   'episode_name': 'A Taste of The Up North Trip Ep.1',
   'show_description': 'This podcast is all about the latest in the sports world, we focus on the news coming out of the NFL/NBA and more. Sean will be giving his opinions on the latest news and most likely getting predictions wrong. Thanks for tuning in!',
   'show_name': 'The Up North Trip'}],
 'documents': ["What's up guys? And welcome to the up nor trip. I'm your host Sean Lawson and I almost forgot the title to my podcast just there. So bear with me today is the very first episode of the up north trip. I'm hella excited and I appreciate you guys coming in and tuning in and supporting me t

In [24]:
timestamp_collection.peek()

{'ids': ['4q9et9b1xeAJcN3FnUq1vN0',
  '4q9et9b1xeAJcN3FnUq1vN1',
  '4q9et9b1xeAJcN3FnUq1vN2',
  '4q9et9b1xeAJcN3FnUq1vN3',
  '4q9et9b1xeAJcN3FnUq1vN4',
  '4q9et9b1xeAJcN3FnUq1vN5',
  '4q9et9b1xeAJcN3FnUq1vN6',
  '4q9et9b1xeAJcN3FnUq1vN7',
  '4q9et9b1xeAJcN3FnUq1vN8',
  '4q9et9b1xeAJcN3FnUq1vN9'],
 'embeddings': [[-0.05899237468838692,
   0.0006699366495013237,
   -0.0014701532199978828,
   0.01399296522140503,
   0.04559110105037689,
   0.024825621396303177,
   -0.013064807280898094,
   0.003350654849782586,
   -0.04170745611190796,
   0.05571015551686287,
   0.07091622799634933,
   0.015912286937236786,
   0.05171164497733116,
   0.05122723802924156,
   -0.031801607459783554,
   0.01479958463460207,
   -0.04664777219295502,
   0.0070796082727611065,
   -0.06333551555871964,
   -0.01995961181819439,
   0.012556813657283783,
   0.031814754009246826,
   -0.009107762016355991,
   0.04174608737230301,
   0.017639948055148125,
   0.04113740473985672,
   -0.02964342199265957,
   0.0613519586

In [2]:
test = "odjLKOK"
test.lower()

'odjlkok'