In [92]:
from InstructorEmbedding import INSTRUCTOR

import os
import pandas as pd
import pathlib
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

# Keep any OPENAI_API_KEY already exported in the environment instead of overwriting
# it with an empty string; only fall back to "" when the variable is unset.
# Never hardcode a real key here.
os.environ.setdefault("OPENAI_API_KEY", "")

In [1]:
import os
import sys

# Put the project's data helpers directory first on the import path so that
# `s3_utils` can be imported from this notebook.
data_module_dir = os.path.abspath('../src/data/')
sys.path.insert(0, data_module_dir)

from chromadb.config import Settings
from dotenv import find_dotenv, load_dotenv
from s3_utils import read_from_s3, write_to_s3

# Find a .env file and load its variables into the process environment.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [11]:
def preprocess_transcripts(transcripts_df, chunk_size=20000, min_length=1000):
    """
    Split transcripts into fixed-size character chunks and drop short chunks.

    Steps:
    - Split each transcript into chunks of at most ``chunk_size`` characters
    - Explode the chunks so that every chunk gets its own row
    - Remove chunks that are not longer than ``min_length`` characters

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    transcripts_df : pandas.DataFrame
        Frame with a ``transcript`` column holding the transcript text.
    chunk_size : int, default 20000
        Maximum number of characters per chunk. Must be positive.
    min_length : int, default 1000
        Chunks whose length is not strictly greater than this are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per kept chunk, with the other columns repeated, an added
        ``transcript_length`` column and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    def _split(text):
        # Missing / non-string transcripts produce no chunks instead of a TypeError.
        if not isinstance(text, str):
            return []
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    # assign() builds a new frame, so the caller's 'transcript' column is not overwritten.
    chunks_df = (
        transcripts_df
        .assign(transcript=transcripts_df['transcript'].apply(_split))
        .explode('transcript')
        # Empty chunk lists explode to NaN; drop them before measuring lengths.
        .dropna(subset=['transcript'])
    )
    chunks_df = chunks_df.assign(transcript_length=chunks_df['transcript'].apply(len))
    return chunks_df[chunks_df['transcript_length'] > min_length].reset_index(drop=True)

In [6]:
# Instruction prefix that is paired with every transcript
instruction = "Represent the Podcast transcript for retrieval: "

# Read the transcript data and keep only the first 100 rows
filename = 'podcasts-transcripts-0to2-35.csv'
# transcripts_df = preprocess_transcripts(transcripts_df)
transcripts_df = read_from_s3(filename, filetype=None).head(100)

# Build the list of [instruction, transcript] pairs, one per row
texts_with_instructions = [
    [instruction, transcript_text] for transcript_text in transcripts_df["transcript"]
]

# calculate embeddings (100 first episodes took about 6 min)
# model = INSTRUCTOR('hkunlp/instructor-large')
# customized_embeddings = model.encode(texts_with_instructions, show_progress_bar=True, batch_size=16)
texts_with_instructions[0]

['Represent the Podcast transcript for retrieval: ',
 "Hi everyone, and welcome to murder friends the podcast were three friends from three different countries talk about murder. My name is Alana and I'm Canadian. Hi, I'm Anna and I'm American. My name is Hannah and I'm British. So sit back and relax you're among friends. And let's talk murder. Now. Today's episode is really special because it is the conclusion of spooky month. Which in my opinion is probably the best month. You're right. I love Halloween is the best time of year. For the whole month, really? Yes spooky season. It's pretty great. I mean in general you could start wearing sweaters boots, you know, go completely black and goth and coffee. Yeah spice lattes if that's your sort of basic thing, you know, which is because it is and if you're a more murdery, no than its quad gourd season. Yeah, exactly. The only thing that I've noticed so I've been living in England for just a couple of years. And England doesn't do Halloween

In [104]:
# from transformers import GPT2TokenizerFast
import transformers
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

#tokenizer = transformers.AutoTokenizer.from_pretrained('hkunlp/instructor-large')
#tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
filename = 'podcasts-transcripts-0to2-13.csv'

transcripts_df = read_from_s3(filename, filetype=None)
# Keep the text of the first instruction/transcript pair built earlier as a sample
transcript = texts_with_instructions[0][1]
#text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=512, chunk_overlap=0)
text_splitter = TokenTextSplitter(chunk_size=2500, chunk_overlap=0)
transcripts_df["transcript_length"] = transcripts_df["transcript"].apply(len)
# Take an explicit copy of the filtered rows: assigning into a boolean-filtered frame
# is a set-on-copy operation and can trigger pandas' SettingWithCopyWarning.
transcripts_df = transcripts_df[transcripts_df["transcript_length"] > 1000].copy()
# Replace each transcript with the list of chunks produced by the splitter
transcripts_df["transcript"] = transcripts_df["transcript"].apply(text_splitter.split_text)
#transcripts_df['num_of_splits'] = transcripts_df['splitted_text'].apply(lambda x: len(x))

#transcripts_df['num_of_splits'].describe()
transcripts_df.reset_index(drop=True).head(3)

Unnamed: 0,transcript,show,episode,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,source_file,transcript_length
0,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,5EvN3tk0Y8gEWWRUzFRsfh,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:5EvN3tk0Y8gEWWRUzFRsfh,S3E22. Reel Down with Nauti Fish from TKAA,Your Host Sam Jones and the OG Brian Schiller ...,56.017367,show_02Yjg2GfjecTzu0NbTV1HD,5EvN3tk0Y8gEWWRUzFRsfh,podcasts-transcripts-0to2.tar.gz,45764
1,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,6SKHSUqdHKe3wQuxovgCmm,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:6SKHSUqdHKe3wQuxovgCmm,"S1E44. New Prop, Giant Smallmouth, Wisconsin ...",Brian and Jay sit down and chat about this pas...,50.465483,show_02Yjg2GfjecTzu0NbTV1HD,6SKHSUqdHKe3wQuxovgCmm,podcasts-transcripts-0to2.tar.gz,36529
2,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,6HiVnhPXYlpR5Gu56XH8Qg,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:6HiVnhPXYlpR5Gu56XH8Qg,S1E34. Tennessee Trip Recap,Brian and Jay sit down to chat about this past...,54.146233,show_02Yjg2GfjecTzu0NbTV1HD,6HiVnhPXYlpR5Gu56XH8Qg,podcasts-transcripts-0to2.tar.gz,42174


In [68]:
from InstructorEmbedding import INSTRUCTOR
# Load the INSTRUCTOR model identified by 'hkunlp/instructor-large'.
model = INSTRUCTOR('hkunlp/instructor-large')
# Set the model's max_seq_length attribute to 512.
# NOTE(review): presumably inputs longer than this are truncated by the encoder — confirm.
model.max_seq_length = 512

load INSTRUCTOR_Transformer
max_seq_length  512


In [71]:
instruction = "Represent the Podcast transcript for retrieval: "

# Split the sample transcript into token-based chunks.
# NOTE(review): chunk_size=4096 is larger than the model.max_seq_length of 512 set
# when the model was loaded — presumably the encoder truncates each chunk; confirm.
text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=0)
texts = text_splitter.split_text(transcript)

# Only the first chunk is paired with the instruction and embedded.
texts_with_instructions = [[instruction, texts[i]] for i in range(1)]
customized_embeddings = model.encode(
    texts_with_instructions,
    show_progress_bar=True,
    batch_size=32,
)
customized_embeddings

Batches: 100%|██████████| 1/1 [00:03<00:00,  3.07s/it]


array([[-5.16323410e-02,  6.67927694e-03, -8.85380805e-03,
         2.44533196e-02,  4.25812751e-02,  1.54207665e-02,
        -1.50699094e-02,  3.12019289e-02, -5.03524877e-02,
         6.62412420e-02,  4.46674079e-02, -3.49719706e-03,
         3.33244726e-02,  2.95191091e-02, -2.37314161e-02,
        -3.90568678e-03, -2.18310207e-02,  2.83075590e-02,
        -3.82242575e-02,  5.62727312e-03,  4.54290770e-02,
         3.08722332e-02,  2.85880733e-02,  3.92169580e-02,
         4.76138666e-02,  3.13129425e-02, -2.64866650e-02,
         3.40732262e-02,  4.13482450e-02, -7.31522217e-02,
         4.99199815e-02, -3.32918204e-02, -3.03022228e-02,
        -5.33378944e-02, -2.74871886e-02, -3.54510266e-03,
         2.04877574e-02,  3.19864415e-02,  2.18761712e-02,
         6.83443621e-02, -1.20599801e-02, -2.10893080e-02,
        -3.27954954e-03, -4.55695502e-02,  1.09888113e-03,
         7.08918273e-03, -6.10362217e-02, -2.55248547e-02,
         4.40250300e-02, -6.68482529e-03, -1.75272841e-0

In [73]:
import numpy as np
# Average the embeddings over axis 0 and return the result as a single row.
# -1 lets NumPy infer the embedding width instead of hard-coding 768.
np.average(customized_embeddings, axis=0).reshape((1, -1))

array([[-5.16323410e-02,  6.67927694e-03, -8.85380805e-03,
         2.44533196e-02,  4.25812751e-02,  1.54207665e-02,
        -1.50699094e-02,  3.12019289e-02, -5.03524877e-02,
         6.62412420e-02,  4.46674079e-02, -3.49719706e-03,
         3.33244726e-02,  2.95191091e-02, -2.37314161e-02,
        -3.90568678e-03, -2.18310207e-02,  2.83075590e-02,
        -3.82242575e-02,  5.62727312e-03,  4.54290770e-02,
         3.08722332e-02,  2.85880733e-02,  3.92169580e-02,
         4.76138666e-02,  3.13129425e-02, -2.64866650e-02,
         3.40732262e-02,  4.13482450e-02, -7.31522217e-02,
         4.99199815e-02, -3.32918204e-02, -3.03022228e-02,
        -5.33378944e-02, -2.74871886e-02, -3.54510266e-03,
         2.04877574e-02,  3.19864415e-02,  2.18761712e-02,
         6.83443621e-02, -1.20599801e-02, -2.10893080e-02,
        -3.27954954e-03, -4.55695502e-02,  1.09888113e-03,
         7.08918273e-03, -6.10362217e-02, -2.55248547e-02,
         4.40250300e-02, -6.68482529e-03, -1.75272841e-0

In [16]:
# Embed and store the texts
# Directory for an on-disk store; only used by the PersistentClient alternative below
persist_directory = 'test-db'

# Format the data so that it can be put into the Chroma collection
embeddings = customized_embeddings.tolist()
documents = transcripts_df["transcript"].tolist()
ids = transcripts_df.index.astype(str).to_list()
metadata = transcripts_df[['episode_description', 'episode_name']].to_dict(orient='records')

# Connect to the Chroma server
# (local alternative: client = chromadb.PersistentClient(path=persist_directory))
model = INSTRUCTOR('hkunlp/instructor-large')
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)

# Start from a fresh collection. No get_collection() call is made first: its result
# would be overwritten below, and it would fail when "transcripts" does not exist yet.
try:
    client.delete_collection("transcripts")
except Exception as e:
    # Best effort: the collection may simply not exist yet. Report instead of hiding it.
    print(f"Could not delete collection 'transcripts' (continuing): {e!r}")
collection = client.create_collection("transcripts", embedding_function=model)
collection.add(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadata,
    ids=ids,
)

In [116]:
import boto3
import pandas as pd
# Build the S3 client from the session so that its region and profile are actually used
my_session = boto3.Session(region_name='eu-central-1', profile_name='my-admin-profile')
client = my_session.client('s3')
# 'Contents' is missing from the response when the listing is empty
files = client.list_objects(Bucket='sumar-ai').get('Contents', [])
# Keep the keys at positions 1..3 of the listing
filenames = [f['Key'] for f in files][1:4]
processed_files_df = pd.DataFrame({"processed_files" : filenames})
write_to_s3(processed_files_df, 'processed_files', 'metadata')

In [3]:
# Read back the list of processed files and append a placeholder row
processed_files_df = read_from_s3('processed_files', 'metadata')
processed_files_list = [*processed_files_df.processed_files]
next_label = processed_files_df.index.max() + 1
processed_files_df.loc[next_label] = "test"
processed_files_df

TokenRetrievalError: Error when retrieving token from sso: Token has expired and refresh failed

In [122]:
# Get the "transcripts-2" collection from the Chroma server.
# The INSTRUCTOR model is loaded again here, so this cell does not depend on an earlier load.
model = INSTRUCTOR('hkunlp/instructor-large')
# Server address comes from the CHROMA_SERVER_IP environment variable (None if unset).
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)
# NOTE(review): `model` is passed as the collection's embedding_function, while the query
# cell passes precomputed query_embeddings — confirm the model is a valid embedding function.
collection = client.get_collection(name="transcripts-2", embedding_function=model)


load INSTRUCTOR_Transformer
max_seq_length  512


In [126]:
# Try querying the database
query_texts = [['Represent the topic for retrieving relevant Podcast transcripts: ', "Economy"]]
query_embedding = model.encode(query_texts).tolist()
# Keep the response in a variable: only the last expression of a cell is displayed,
# so an unassigned query result would be thrown away.
query_results = collection.query(
    query_embeddings=query_embedding,
    n_results=3
)
#client.list_collections()
# Number of entries in the collection (this is what the cell displays)
collection.count()


3955

In [None]:
# One {'episode_description': ...} dict per row of transcripts_df
episode_descriptions = transcripts_df[['episode_description']]
episode_descriptions.to_dict(orient='records')

In [98]:
# NOTE(review): neither read_and_preprocess_transcripts nor filepath is defined in the
# visible part of this notebook — presumably they come from another cell or from
# leftover kernel state; confirm before relying on this cell after a kernel restart.
read_and_preprocess_transcripts(filepath)

Unnamed: 0,transcript,show,episode,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,transcript_length
0,Welcome back to GRE vocab. Before we continue...,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:2VVBXyGRGd2cydXPxHJme6,Fatuous • Apocryphal • Acridity • Trenchancy •...,Trump. Genius or idiot? Brilliant or buffoon? ...,2.809233,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,1951
1,Welcome back to another episode of a GRE vocab...,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3jIQUvPSaCcMOpJfx5AB1w,Phlegmatic • laconic • probity • proclivity • ...,Who win and be Trumps challenger? NBA vs China...,3.593667,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,2480
2,Welcome back to the GRE vocab podcast. We use ...,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3iRJkm19fnZkdwW7nTX5Yb,Punctilious • Vociferous • Waggish • Panegyric...,Astros are freaking busted! Messed up...messed...,3.347933,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,2144
3,Welcome back to episode 5 of GRE vocab today t...,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3vdykhEg3lYhfXEVIIB8GH,Halcyon • hedonism • iconoclast • impunity • i...,We’re talking Lebron and MJ. Messi and Barcelo...,4.140900,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,3362
4,Welcome back to GRE vocab today are five words...,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:4Xsex7XPthez6srVIdv4vU,Prodigal • Rancorous • Salubrious • Querulous ...,NASA is going to the moon...AGAIN!? And they w...,4.011250,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,2995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4020,ere's no harm in being personal personal reall...,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,spotify:show:09UZ3QZQGagWfnPMgZYccR,The Food Podcast,The Food Podcast is a show where personal stor...,The Food Podcast,['en'],https://anchor.fm/s/4887ad0/podcast/rss,spotify:episode:1H2m1fhwmVpeEf0N4YZVgB,THE FOOD PODCAST 029 -Making friends and feedi...,"On today’s episode, Hetty McKinnon and I share...",29.926400,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,5583
4021,"Hello, my name is Quinton Pastrana. Welcome to...",show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:4ch15nYQdO4kNW2v4Yy4Wz,Episode 1: Mi Ultimo Adiós,Mi Ultimo Adiós: Rizal's farewell revisited an...,12.916483,show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,8111
4022,Over the span of 12 months to larger-than-life...,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:1YGjk9nbFi4onBQhad0hYv,Episode 4: A tribute to Twink Macaraig and Mar...,A tribute to veteran journalist Twink Macaraig...,8.090550,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,4765
4023,My name is Quinton Pastrana. Welcome to anothe...,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:3fzo9XOqqNlfp10lIcFJGm,Episode 2: Year End Poems,New Year's resolutions: a pair of spiritual an...,7.135883,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,4926


In [124]:
# Print the 'transcript' value stored under index label 0
first_transcript = transcripts_df.loc[0, 'transcript']
print(first_transcript)

Welcome back to GRE vocab.  Before we continue. We just want to give a quick shout out to the people who make this podcast possible. Anchor anchor is everything you need to make a podcast in one place. They distribute your podcast for you. They have tools that allow you to record and edit everything right from your phone or computer. It's free the even allow you to put ads in your your podcast. It's pretty awesome. Go check it out download the free anchor app or go to Anchored off dot f m-- to get started. Okay, let's get back to the show.  Today's words are fatuous, silly and pointless apocryphal.  Of doubtful authenticity, although widely circulated as being true usually in regards to a story or statement a credit e having an acrid smell accurate is having an irritant Lee Strong and unpleasant taste or smell angry bitter.  Trench in see in size of - keenness and forcefulness of thought or expression or intellect euphony the quality of being pleasing to the ear, especially through a h