In [77]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader


from InstructorEmbedding import INSTRUCTOR
from langchain.embeddings import HuggingFaceInstructEmbeddings

import os
import pandas as pd
import pathlib
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

# Keep an OpenAI key that is already exported in the environment. The empty
# placeholder is only used when no key is set, so running this cell never
# overwrites a real key and no secret has to be written into the notebook.
os.environ.setdefault("OPENAI_API_KEY", "")

In [97]:
def read_and_preprocess_transcripts(path, chunk_size=20000, min_length=1000):
    """
    Read a transcripts csv file and split every transcript into chunks.

    Preprocessing steps:
    - Drop rows whose transcript is missing
    - Split transcripts into chunks of at most ``chunk_size`` characters
    - Explode the chunks to new rows (the other columns are repeated)
    - Remove chunks with ``min_length`` characters or fewer

    Parameters
    ----------
    path : str
        Location of the csv file. An absolute path is used as is. A relative
        path is appended to the part of the current working directory that
        precedes the first occurrence of "notebooks"; when the working
        directory does not contain "notebooks" it is resolved against the
        working directory itself.
    chunk_size : int, default 20000
        Maximum number of characters per chunk.
    min_length : int, default 1000
        Chunks must be strictly longer than this to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per kept chunk, with a fresh 0..n-1 index and an extra
        integer ``transcript_length`` column holding the chunk length.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    csv_path = pathlib.Path(path)
    if not csv_path.is_absolute():
        cwd = str(pathlib.Path().absolute())
        if "notebooks" in cwd:
            # Text before "notebooks" ends with a path separator, so plain
            # concatenation yields a valid path.
            csv_path = cwd.split("notebooks")[0] + path
        else:
            # No "notebooks" component: join properly instead of gluing the
            # two strings together without a separator.
            csv_path = pathlib.Path(cwd) / path

    def split_into_chunks(text):
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    exploded = (
        pd.read_csv(csv_path, index_col=0)
        # A missing transcript is read as NaN, which has no len(); skip it.
        .dropna(subset=['transcript'])
        .assign(transcript=lambda df: df['transcript'].astype(str).apply(split_into_chunks))
        .explode('transcript')
        # explode repeats index labels; make them unique before adding columns.
        .reset_index(drop=True)
    )

    # .str.len() yields NaN (instead of raising) for the NaN rows that explode
    # creates from empty chunk lists; NaN fails the comparison and is dropped.
    chunk_lengths = exploded['transcript'].str.len()
    exploded['transcript_length'] = chunk_lengths
    transcripts_df = exploded[chunk_lengths > min_length].reset_index(drop=True)
    transcripts_df['transcript_length'] = transcripts_df['transcript_length'].astype('int64')
    return transcripts_df

In [113]:
# Instruction prefix that is paired with every transcript chunk
instruction = "Represent the Podcast transcript for retrieval: "

# Load the preprocessed transcript chunks and keep only the first 100 rows
filepath = 'notebooks/csv/transcripts-0[0-9].csv'
transcripts_df = read_and_preprocess_transcripts(filepath).head(100)

# Build one [instruction, transcript] pair per row
texts_with_instructions = [
    [instruction, transcript] for transcript in transcripts_df["transcript"]
]

# Encode every pair (the recorded run below shows 7 batches taking 05:20)
model = INSTRUCTOR('hkunlp/instructor-large')
customized_embeddings = model.encode(
    texts_with_instructions,
    show_progress_bar=True,
    batch_size=16,
)

load INSTRUCTOR_Transformer
max_seq_length  512


Batches: 100%|██████████| 7/7 [05:20<00:00, 45.84s/it]


In [114]:
# Store the precomputed embeddings in a Chroma collection.
# The persistent client below is pointed at this directory.
persist_directory = 'test-db'

# Format the data so that it can be put into the Chroma collection:
# four parallel lists with one entry per transcript row
metadata_columns = ['episode_description', 'episode_name']
embeddings = customized_embeddings.tolist()
documents = list(transcripts_df["transcript"])
ids = [str(row_label) for row_label in transcripts_df.index]
metadata = transcripts_df[metadata_columns].to_dict(orient='records')

# Open the database
client = chromadb.PersistentClient(path=persist_directory)

# Start from a clean collection: remove a previous "transcripts" collection,
# ignoring the ValueError raised by the delete call
try:
    client.delete_collection("transcripts")
except ValueError:
    pass

# NOTE(review): `model` is the INSTRUCTOR instance, which presumably does not
# follow Chroma's embedding-function call signature. It is not needed in this
# notebook because embeddings are always passed explicitly - confirm before
# adding or querying by raw text.
collection = client.create_collection("transcripts", embedding_function=model)
collection.add(
    ids=ids,
    documents=documents,
    metadatas=metadata,
    embeddings=embeddings,
)

In [122]:
# Embed a search topic together with its retrieval instruction and look up
# the five closest transcript chunks in the collection
query_texts = [[
    'Represent the topic for retrieving relevant Podcast transcripts: ',
    "War in Irak",
]]
query_embedding = model.encode(query_texts).tolist()
collection.query(query_embeddings=query_embedding, n_results=5)

{'ids': [['74', '73', '85', '41', '56']],
 'distances': [[0.5550721883773804,
   0.5561415553092957,
   0.5664547681808472,
   0.5911593437194824,
   0.5919430255889893]],
 'metadatas': [[{'episode_description': 'Today is our FIRST ever NEWB, we’re so excited to get this started and would love you guys to write in about every topic….. Nick and Nate talk about the games they have been playing. What Tunes they like to listen to during some gaming sessions, and Give their Impressions on the LTM Winter Express-->___ ===Music== -If I had a Chicken- -Drunken Countess- -Rage- -Patience - -Turtle Time- {{Apex Legends Theme song Trap mix \xa0By Pido}}==> ---Facebook & YouTube - [Threat X3 Productions]-->___ ---Twitter - @TX3Productions-- - Email \xa0threatx3productions@gmail.com-->___ ---Twitter \xa0\xa0Apex Legends HUBcast- -@ApexLegendscast-->___ ---Discord- Threat X3 Productions- https://discord.gg/c6hBckx -->___  ---   This episode is sponsored by  · Anchor: The easiest way to make a podcas

In [None]:
# Preview the episode descriptions as a list of single-key record dicts
description_frame = transcripts_df[['episode_description']]
description_frame.to_dict(orient='records')

In [98]:
# Display the preprocessed transcript table for the configured csv path
preprocessed_preview = read_and_preprocess_transcripts(filepath)
preprocessed_preview

Unnamed: 0,transcript,show,episode,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,transcript_length
0,Welcome back to GRE vocab. Before we continue...,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:2VVBXyGRGd2cydXPxHJme6,Fatuous • Apocryphal • Acridity • Trenchancy •...,Trump. Genius or idiot? Brilliant or buffoon? ...,2.809233,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,1951
1,Welcome back to another episode of a GRE vocab...,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3jIQUvPSaCcMOpJfx5AB1w,Phlegmatic • laconic • probity • proclivity • ...,Who win and be Trumps challenger? NBA vs China...,3.593667,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,2480
2,Welcome back to the GRE vocab podcast. We use ...,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3iRJkm19fnZkdwW7nTX5Yb,Punctilious • Vociferous • Waggish • Panegyric...,Astros are freaking busted! Messed up...messed...,3.347933,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,2144
3,Welcome back to episode 5 of GRE vocab today t...,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3vdykhEg3lYhfXEVIIB8GH,Halcyon • hedonism • iconoclast • impunity • i...,We’re talking Lebron and MJ. Messi and Barcelo...,4.140900,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,3362
4,Welcome back to GRE vocab today are five words...,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:4Xsex7XPthez6srVIdv4vU,Prodigal • Rancorous • Salubrious • Querulous ...,NASA is going to the moon...AGAIN!? And they w...,4.011250,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,2995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4020,ere's no harm in being personal personal reall...,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,spotify:show:09UZ3QZQGagWfnPMgZYccR,The Food Podcast,The Food Podcast is a show where personal stor...,The Food Podcast,['en'],https://anchor.fm/s/4887ad0/podcast/rss,spotify:episode:1H2m1fhwmVpeEf0N4YZVgB,THE FOOD PODCAST 029 -Making friends and feedi...,"On today’s episode, Hetty McKinnon and I share...",29.926400,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,5583
4021,"Hello, my name is Quinton Pastrana. Welcome to...",show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:4ch15nYQdO4kNW2v4Yy4Wz,Episode 1: Mi Ultimo Adiós,Mi Ultimo Adiós: Rizal's farewell revisited an...,12.916483,show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,8111
4022,Over the span of 12 months to larger-than-life...,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:1YGjk9nbFi4onBQhad0hYv,Episode 4: A tribute to Twink Macaraig and Mar...,A tribute to veteran journalist Twink Macaraig...,8.090550,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,4765
4023,My name is Quinton Pastrana. Welcome to anothe...,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:3fzo9XOqqNlfp10lIcFJGm,Episode 2: Year End Poems,New Year's resolutions: a pair of spiritual an...,7.135883,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,4926
