In [2]:
from InstructorEmbedding import INSTRUCTOR

import os
import pandas as pd
import pathlib
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

# NOTE(review): this unconditionally sets OPENAI_API_KEY to an empty string for
# the current process, replacing any value that was already in the environment.
# None of the visible cells read this variable — confirm it is still needed.
os.environ["OPENAI_API_KEY"] = ""

In [1]:
# Path and environment setup for the rest of the notebook.
import sys
import os
# Put '../src/data/' (resolved against the current working directory) first on
# the import path; presumably this is where the `s3_utils` module imported on
# the next line lives.
sys.path.insert(0, os.path.abspath('../src/data/'))
from s3_utils import read_from_s3, write_to_s3
from chromadb.config import Settings
from dotenv import load_dotenv, find_dotenv

# Load the variables from the .env file located by find_dotenv() into the
# process environment; later cells read CHROMA_SERVER_IP via os.getenv.
load_dotenv(find_dotenv())

True

In [11]:
def preprocess_transcripts(transcripts_df, chunk_size=20000, min_length=1000):
    """
    Split transcripts into fixed-size chunks and drop the short ones.

    Preprocessing steps:
    - Split every transcript into consecutive chunks of at most ``chunk_size`` characters
    - Explode the chunks to new rows (all other columns are repeated per chunk)
    - Add a ``transcript_length`` column with the character count of each chunk
    - Remove chunks that do not have more than ``min_length`` characters

    The input frame is not modified. Rows whose transcript is missing, not a
    string, or empty yield no chunks and are therefore dropped.

    Args:
        transcripts_df: DataFrame with a ``transcript`` column of strings.
        chunk_size: Maximum number of characters per chunk (default 20000).
        min_length: Chunks must be strictly longer than this to be kept (default 1000).

    Returns:
        A new DataFrame with one row per retained chunk, the extra
        ``transcript_length`` column and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    def _split_into_chunks(text):
        # Missing values arrive as NaN/None; they contribute no chunks.
        if not isinstance(text, str):
            return []
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    chunked = (
        transcripts_df
        # assign() builds a new frame, so the caller's DataFrame stays intact.
        .assign(transcript=transcripts_df['transcript'].apply(_split_into_chunks))
        .explode('transcript')
        # An empty chunk list explodes to a single NaN row; drop it so that
        # the length computation below only ever sees strings.
        .dropna(subset=['transcript'])
    )
    chunked = chunked.assign(transcript_length=chunked['transcript'].map(len))
    return chunked[chunked['transcript_length'] > min_length].reset_index(drop=True)

In [12]:
# Instruction prefix that is paired with every transcript chunk before embedding
instruction = "Represent the Podcast transcript for retrieval: "

# Load one transcript file from S3 and split it into chunks
filename = 'podcasts-transcripts-0to2-35.csv'
transcripts_df = preprocess_transcripts(read_from_s3(filename, filetype=None))

# Limit the run to the first 100 rows; after preprocessing each row is one
# transcript chunk, so this is 100 chunks rather than 100 whole episodes
transcripts_df = transcripts_df.head(100)

# Build the list of [instruction, transcript] pairs passed to model.encode
texts_with_instructions = [
    [instruction, transcript] for transcript in transcripts_df["transcript"]
]

# Calculate the embeddings (the recorded run of 7 batches took about 6 min)
model = INSTRUCTOR('hkunlp/instructor-large')
customized_embeddings = model.encode(
    texts_with_instructions,
    batch_size=16,
    show_progress_bar=True,
)

load INSTRUCTOR_Transformer
max_seq_length  512


Batches: 100%|██████████| 7/7 [06:14<00:00, 53.46s/it]


In [16]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
# (only used by the commented-out PersistentClient alternative below)
persist_directory = 'test-db'

# Format the data so that it can be put into the chroma collection:
# one embedding, document, id and metadata dict per row of transcripts_df
embeddings = customized_embeddings.tolist()
documents = transcripts_df["transcript"].tolist()
ids = transcripts_df.index.astype(str).to_list()
metadata = transcripts_df[['episode_description', 'episode_name']].to_dict(orient='records')

# Connect to the chroma server
#client = chromadb.PersistentClient(path=persist_directory)
# `model` is reused from the embedding cell (this cell already depends on its
# `customized_embeddings`), instead of loading the INSTRUCTOR weights a second time.
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)

# Recreate the collection from scratch and add the documents.
# No get_collection() call beforehand: it fails when the collection does not
# exist yet, which is exactly the first-run case this cell has to handle.
try:
    client.delete_collection("transcripts")
except Exception as exc:
    # Best effort: usually there is simply nothing to delete yet. Report the
    # reason instead of hiding it so real connection problems remain visible.
    print(f"Could not delete collection 'transcripts' (continuing): {exc!r}")

# NOTE(review): the INSTRUCTOR model object is passed as embedding_function as
# in the original cell; since explicit embeddings are supplied to add() below,
# confirm that chroma accepts / needs it here.
collection = client.create_collection("transcripts", embedding_function=model)
collection.add(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadata,
    ids=ids,
)

In [3]:
import boto3
import pandas as pd
#my_session = boto3.Session(region_name='eu-central-1', profile_name='my-admin-profile')
# List the objects in the 'sumar-ai' bucket and record the first five keys as
# the "processed files" table.
# NOTE(review): this rebinds `client`, the same name other cells use for the
# chroma HttpClient.
client = boto3.client('s3')
# The response has no 'Contents' entry when the listing is empty, so fall back
# to an empty list instead of raising KeyError.
files = client.list_objects(Bucket='sumar-ai').get('Contents', [])
filenames = [f['Key'] for f in files]
processed_files = pd.DataFrame(filenames[:5], columns=['processed_files'])

write_to_s3(processed_files, 'processed_files', 'metadata')

In [3]:
# Load the 'processed_files' table back from S3 and flatten its single
# 'processed_files' column into a plain Python list
processed_files_df = read_from_s3('processed_files', 'metadata')
processed_files_list = processed_files_df['processed_files'].tolist()

['transcripts/podcasts-transcripts-0to2-1.csv',
 'transcripts/podcasts-transcripts-0to2-10.csv',
 'transcripts/podcasts-transcripts-0to2-11.csv',
 'transcripts/podcasts-transcripts-0to2-12.csv',
 'transcripts/podcasts-transcripts-0to2-13.csv']

In [3]:
# Connect to the chroma server and fetch the existing "transcripts" collection
model = INSTRUCTOR('hkunlp/instructor-large')
server_ip = os.environ.get('CHROMA_SERVER_IP')
client = chromadb.HttpClient(port=8000, host=server_ip)
collection = client.get_collection(
    embedding_function=model,
    name="transcripts",
)


load INSTRUCTOR_Transformer
max_seq_length  512


In [5]:
# Query the collection: embed a sample topic together with the query-side
# instruction, then ask for the five closest stored documents
query_instruction = 'Represent the topic for retrieving relevant Podcast transcripts: '
query_texts = [[query_instruction, "Sports"]]
query_embedding = model.encode(query_texts).tolist()
collection.query(n_results=5, query_embeddings=query_embedding)

{'ids': [['220', '221', '1297', '206', '2187']],
 'distances': [[0.3547995090484619,
   0.36153286695480347,
   0.38233038783073425,
   0.3846590220928192,
   0.3865782916545868]],
 'embeddings': None,
 'metadatas': [[{'episode_description': 'After two NBA-centric eps, Coop & Cal invite their first guest onto the show to help them break the hockey barrier. Joe Milo, a former college hockey player, clocks in to talk Stanley Cup finals, the physical repercussions of pro sports, and how Wayne Gretzky stacks up to other sports greats. There is also a surprisingly complete conversation about whether or not college athletes should be paid.\xa0 ',
    'episode_name': '003: Six Pairs of Garage Doors'},
   {'episode_description': 'After two NBA-centric eps, Coop & Cal invite their first guest onto the show to help them break the hockey barrier. Joe Milo, a former college hockey player, clocks in to talk Stanley Cup finals, the physical repercussions of pro sports, and how Wayne Gretzky stacks u

In [None]:
# One {'episode_description': ...} dict per row of transcripts_df
transcripts_df.loc[:, ['episode_description']].to_dict(orient='records')

In [98]:
# NOTE(review): neither `read_and_preprocess_transcripts` nor `filepath` is
# defined in the visible cells (only `preprocess_transcripts` is) — this call
# appears to rely on leftover kernel state; confirm before running on a fresh kernel.
read_and_preprocess_transcripts(filepath)

Unnamed: 0,transcript,show,episode,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,transcript_length
0,Welcome back to GRE vocab. Before we continue...,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:2VVBXyGRGd2cydXPxHJme6,Fatuous • Apocryphal • Acridity • Trenchancy •...,Trump. Genius or idiot? Brilliant or buffoon? ...,2.809233,show_00iaP2GhJUWHAovx41BN5N,2VVBXyGRGd2cydXPxHJme6,1951
1,Welcome back to another episode of a GRE vocab...,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3jIQUvPSaCcMOpJfx5AB1w,Phlegmatic • laconic • probity • proclivity • ...,Who win and be Trumps challenger? NBA vs China...,3.593667,show_00iaP2GhJUWHAovx41BN5N,3jIQUvPSaCcMOpJfx5AB1w,2480
2,Welcome back to the GRE vocab podcast. We use ...,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3iRJkm19fnZkdwW7nTX5Yb,Punctilious • Vociferous • Waggish • Panegyric...,Astros are freaking busted! Messed up...messed...,3.347933,show_00iaP2GhJUWHAovx41BN5N,3iRJkm19fnZkdwW7nTX5Yb,2144
3,Welcome back to episode 5 of GRE vocab today t...,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:3vdykhEg3lYhfXEVIIB8GH,Halcyon • hedonism • iconoclast • impunity • i...,We’re talking Lebron and MJ. Messi and Barcelo...,4.140900,show_00iaP2GhJUWHAovx41BN5N,3vdykhEg3lYhfXEVIIB8GH,3362
4,Welcome back to GRE vocab today are five words...,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,spotify:show:00iaP2GhJUWHAovx41BN5N,GRE Vocab,We use publicly recommended vocab words that m...,GRE Vocab Yo,['en-US'],https://anchor.fm/s/9b4ae84/podcast/rss,spotify:episode:4Xsex7XPthez6srVIdv4vU,Prodigal • Rancorous • Salubrious • Querulous ...,NASA is going to the moon...AGAIN!? And they w...,4.011250,show_00iaP2GhJUWHAovx41BN5N,4Xsex7XPthez6srVIdv4vU,2995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4020,ere's no harm in being personal personal reall...,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,spotify:show:09UZ3QZQGagWfnPMgZYccR,The Food Podcast,The Food Podcast is a show where personal stor...,The Food Podcast,['en'],https://anchor.fm/s/4887ad0/podcast/rss,spotify:episode:1H2m1fhwmVpeEf0N4YZVgB,THE FOOD PODCAST 029 -Making friends and feedi...,"On today’s episode, Hetty McKinnon and I share...",29.926400,show_09UZ3QZQGagWfnPMgZYccR,1H2m1fhwmVpeEf0N4YZVgB,5583
4021,"Hello, my name is Quinton Pastrana. Welcome to...",show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:4ch15nYQdO4kNW2v4Yy4Wz,Episode 1: Mi Ultimo Adiós,Mi Ultimo Adiós: Rizal's farewell revisited an...,12.916483,show_09NfbvW32HL0dKd4gxxBCk,4ch15nYQdO4kNW2v4Yy4Wz,8111
4022,Over the span of 12 months to larger-than-life...,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:1YGjk9nbFi4onBQhad0hYv,Episode 4: A tribute to Twink Macaraig and Mar...,A tribute to veteran journalist Twink Macaraig...,8.090550,show_09NfbvW32HL0dKd4gxxBCk,1YGjk9nbFi4onBQhad0hYv,4765
4023,My name is Quinton Pastrana. Welcome to anothe...,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,spotify:show:09NfbvW32HL0dKd4gxxBCk,Life Sentences,Words to live by and die for. Life Sentences p...,PumaPodcast,['en-PH'],https://anchor.fm/s/11afbcc8/podcast/rss,spotify:episode:3fzo9XOqqNlfp10lIcFJGm,Episode 2: Year End Poems,New Year's resolutions: a pair of spiritual an...,7.135883,show_09NfbvW32HL0dKd4gxxBCk,3fzo9XOqqNlfp10lIcFJGm,4926


In [124]:
# Print the full transcript text stored in the row labelled 0
first_transcript = transcripts_df.loc[0, 'transcript']
print(first_transcript)

Welcome back to GRE vocab.  Before we continue. We just want to give a quick shout out to the people who make this podcast possible. Anchor anchor is everything you need to make a podcast in one place. They distribute your podcast for you. They have tools that allow you to record and edit everything right from your phone or computer. It's free the even allow you to put ads in your your podcast. It's pretty awesome. Go check it out download the free anchor app or go to Anchored off dot f m-- to get started. Okay, let's get back to the show.  Today's words are fatuous, silly and pointless apocryphal.  Of doubtful authenticity, although widely circulated as being true usually in regards to a story or statement a credit e having an acrid smell accurate is having an irritant Lee Strong and unpleasant taste or smell angry bitter.  Trench in see in size of - keenness and forcefulness of thought or expression or intellect euphony the quality of being pleasing to the ear, especially through a h