In [13]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

import os
import pandas as pd
import pathlib
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Make sure OPENAI_API_KEY exists without clobbering a key that is already
# exported in the environment. The empty string is only a placeholder; never
# paste a real key into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")

In [50]:
# Instruction prefix paired with every transcript before encoding
instruction = "Represent the podcast transcript document for retrieval: "

# read transcript data
filepath_metadata = 'notebooks/csv/transcripts-0[0-9].csv'
# Everything in the current working directory path before the first
# occurrence of "notebooks" is taken as the project root.
root = str(pathlib.Path().absolute()).split("notebooks")[0]
# os.path.join inserts the separator when `root` does not already end with one
# (i.e. when the working directory contains no "notebooks" part); when it does
# end with a separator the result is identical to plain concatenation.
path = os.path.join(root, filepath_metadata)
transcripts_df = pd.read_csv(path, index_col=0)

# Create list of instruction - transcript pairs (100 first episodes)
transcripts_df = transcripts_df.head(100)
texts_with_instructions = [
    [instruction, transcript] for transcript in transcripts_df["transcript"]
]

# calculate embeddings (100 first episodes took about 6 min)
model = INSTRUCTOR('hkunlp/instructor-large')
customized_embeddings = model.encode(texts_with_instructions)

load INSTRUCTOR_Transformer
max_seq_length  512


In [55]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'test-db'

# format the data so that it can be put into the chroma collection
embeddings = customized_embeddings.tolist()
documents = transcripts_df["transcript"].tolist()
ids = transcripts_df.index.astype(str).to_list()
metadata = transcripts_df[['episode_description']].to_dict(orient='records')

# loading into chroma
client = chromadb.PersistentClient(path=persist_directory)
# The client persists to disk, so on a re-run the collection already exists.
# get_or_create_collection reuses it instead of raising like create_collection.
# NOTE(review): `model` is the INSTRUCTOR encoder, not a Chroma embedding
# function; this notebook always passes precomputed embeddings — confirm Chroma
# is never expected to call it on raw texts.
collection = client.get_or_create_collection("transcripts", embedding_function=model)
# upsert inserts new ids and overwrites existing ones, so re-running this cell
# does not fail or duplicate entries for ids that are already stored.
collection.upsert(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadata,
    ids=ids,
)

In [56]:
# Try querying the database
# The query is given as an [instruction, text] pair, like the transcripts were.
query_texts = [[
    'Represent the statement for retrieving podcast documents: ',
    "I'm looking for podcasts about sports and especially basketball",
]]
# Encode the query with the same model and convert the result to plain lists
query_embedding = model.encode(query_texts).tolist()
# Ask the collection for the two best matches; the result is the cell output
collection.query(query_embeddings=query_embedding, n_results=2)

{'ids': [['11', '7']],
 'distances': [[0.3139927387237549, 0.33367568254470825]],
 'metadatas': [[{'episode_description': 'NBA!!! Hollaaaa GRE words inter spliced with nba opinions and updates. We got Drake. We got Warriors. Oh...and some bad lotion ☠️   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '},
   {'episode_description': 'Talking NBA and who is going to be MVP with some GRE vocab sprinkled on top. Learning is So kewl.   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '}]],
 'embeddings': None,
 'documents': [["Before we continue. We just want to give a quick shout out to the people who make this podcast possible. Anchor anchor is everything you need to make a podcast in one place. They distribute your podcast for you. They have tools that allow you to record and edit everything right from your phone or computer. It's free the even allow you to put ads in your your 

In [44]:
# Show the episode descriptions as a list of one-key dicts (one per row)
transcripts_df.loc[:, ['episode_description']].to_dict(orient='records')

[{'episode_description': 'Trump. Genius or idiot? Brilliant or buffoon? Listen here!  ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '},
 {'episode_description': 'Who win and be Trumps challenger? NBA vs China...ding ding!   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '},
 {'episode_description': 'Astros are freaking busted! Messed up...messed up  ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '},
 {'episode_description': 'We’re talking Lebron and MJ. Messi and Barcelona. Trump the crazy. Bernie the Rich!   ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anchor.fm/app '},
 {'episode_description': 'NASA is going to the moon...AGAIN!? And they want to stay there, forever...? Whaaaaa  ---   This episode is sponsored by  · Anchor: The easiest way to make a podcast.  https://anc