In [10]:
from InstructorEmbedding import INSTRUCTOR
import tiktoken

import os
import pandas as pd
import pathlib
import chromadb
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot as plt

# Keep an API key that is already present in the environment; only fall back
# to the empty placeholder when nothing has been configured.
os.environ.setdefault("OPENAI_API_KEY", "")

In [6]:
# Put ../src/data at the front of the module search path so that the local
# s3_utils module can be imported below.
import sys
import os
# NOTE: the relative path is resolved against the current working directory.
sys.path.insert(0, os.path.abspath('../src/data/'))
from s3_utils import read_from_s3, write_to_s3
from chromadb.config import Settings
from dotenv import load_dotenv, find_dotenv

# Load variables from a .env file; override=True lets them replace values already set in the environment.
load_dotenv(override=True)

True

In [11]:
def preprocess_transcripts(transcripts_df, chunk_size=20000, min_length=1000):
    """
    Split transcripts into fixed-size character chunks and drop the short ones.

    Preprocessing steps:
    - Split every transcript into chunks of at most ``chunk_size`` characters
    - Explode the chunks so that each chunk gets its own row
    - Store the chunk length in a ``transcript_length`` column
    - Keep only chunks with more than ``min_length`` characters

    The input frame is left untouched; a new frame is returned.

    Parameters:
        transcripts_df: DataFrame with a ``transcript`` column of strings.
        chunk_size: maximum number of characters per chunk (default 20000).
        min_length: chunks must be strictly longer than this to be kept (default 1000).

    Returns:
        A new DataFrame with one row per kept chunk and a fresh 0..n-1 index.

    Raises:
        ValueError: if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    def _split(text):
        # Missing / non-text transcripts produce no chunks instead of failing on len().
        if not isinstance(text, str):
            return []
        return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

    # assign() builds a new frame, so the caller's 'transcript' column is not overwritten.
    chunked_df = transcripts_df.assign(
        transcript=transcripts_df['transcript'].apply(_split)
    ).explode('transcript')

    # explode() turns an empty chunk list (empty or missing transcript) into NaN;
    # drop those rows before measuring lengths.
    chunked_df = chunked_df.dropna(subset=['transcript'])
    chunked_df = chunked_df.assign(transcript_length=chunked_df['transcript'].map(len))

    long_enough = chunked_df['transcript_length'] > min_length
    return chunked_df[long_enough].reset_index(drop=True)

In [6]:
# prepare texts with instructions
instruction = "Represent the Podcast transcript for retrieval: "

# read transcript data
filename = 'podcasts-transcripts-0to2-35.csv'

transcripts_df = read_from_s3(filename, filetype=None)
# transcripts_df = preprocess_transcripts(transcripts_df)
# Work on the first 100 rows only to keep the experiment small
transcripts_df = transcripts_df.head(100)

# Create list of [instruction, transcript] pairs.
# Iterating the column directly avoids building a Series per row with iterrows().
texts_with_instructions = [
    [instruction, transcript] for transcript in transcripts_df["transcript"]
]

# calculate embeddings (100 first episodes took about 6 min)
# model = INSTRUCTOR('hkunlp/instructor-large')
# customized_embeddings = model.encode(texts_with_instructions, show_progress_bar=True, batch_size=16)
texts_with_instructions[0]

['Represent the Podcast transcript for retrieval: ',
 "Hi everyone, and welcome to murder friends the podcast were three friends from three different countries talk about murder. My name is Alana and I'm Canadian. Hi, I'm Anna and I'm American. My name is Hannah and I'm British. So sit back and relax you're among friends. And let's talk murder. Now. Today's episode is really special because it is the conclusion of spooky month. Which in my opinion is probably the best month. You're right. I love Halloween is the best time of year. For the whole month, really? Yes spooky season. It's pretty great. I mean in general you could start wearing sweaters boots, you know, go completely black and goth and coffee. Yeah spice lattes if that's your sort of basic thing, you know, which is because it is and if you're a more murdery, no than its quad gourd season. Yeah, exactly. The only thing that I've noticed so I've been living in England for just a couple of years. And England doesn't do Halloween

In [104]:
# from transformers import GPT2TokenizerFast
import transformers
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import TokenTextSplitter

#tokenizer = transformers.AutoTokenizer.from_pretrained('hkunlp/instructor-large')
#tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
filename = 'podcasts-transcripts-0to2-13.csv'

transcripts_df = read_from_s3(filename, filetype=None)
# Keep the first transcript of the previously built pairs for the single-text experiments
transcript = texts_with_instructions[0][1]
#text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, chunk_size=512, chunk_overlap=0)
text_splitter = TokenTextSplitter(chunk_size=2500, chunk_overlap=0)
transcripts_df["transcript_length"] = transcripts_df["transcript"].apply(len)
# .copy() makes the filtered frame independent, so the assignment below does not
# write into a slice of the unfiltered frame (SettingWithCopyWarning).
transcripts_df = transcripts_df[transcripts_df["transcript_length"] > 1000].copy()
# Each transcript string is replaced by the list of chunks returned by the splitter
transcripts_df["transcript"] = transcripts_df["transcript"].apply(text_splitter.split_text)
#transcripts_df['num_of_splits'] = transcripts_df['splitted_text'].apply(lambda x: len(x))

#transcripts_df['num_of_splits'].describe()
# Display only: the reset index is not stored back into transcripts_df
transcripts_df.reset_index(drop=True).head(3)

Unnamed: 0,transcript,show,episode,show_uri,show_name,show_description,publisher,language,rss_link,episode_uri,episode_name,episode_description,duration,show_filename_prefix,episode_filename_prefix,source_file,transcript_length
0,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,5EvN3tk0Y8gEWWRUzFRsfh,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:5EvN3tk0Y8gEWWRUzFRsfh,S3E22. Reel Down with Nauti Fish from TKAA,Your Host Sam Jones and the OG Brian Schiller ...,56.017367,show_02Yjg2GfjecTzu0NbTV1HD,5EvN3tk0Y8gEWWRUzFRsfh,podcasts-transcripts-0to2.tar.gz,45764
1,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,6SKHSUqdHKe3wQuxovgCmm,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:6SKHSUqdHKe3wQuxovgCmm,"S1E44. New Prop, Giant Smallmouth, Wisconsin ...",Brian and Jay sit down and chat about this pas...,50.465483,show_02Yjg2GfjecTzu0NbTV1HD,6SKHSUqdHKe3wQuxovgCmm,podcasts-transcripts-0to2.tar.gz,36529
2,[Just wanted to take a brief moment to give yo...,show_02Yjg2GfjecTzu0NbTV1HD,6HiVnhPXYlpR5Gu56XH8Qg,spotify:show:02Yjg2GfjecTzu0NbTV1HD,Paddle N' Fin,"Paddle N' Fin is your goto for Fishing, Kayak,...",Paddle N Fin Network,['en-US'],https://anchor.fm/s/44ec0b0/podcast/rss,spotify:episode:6HiVnhPXYlpR5Gu56XH8Qg,S1E34. Tennessee Trip Recap,Brian and Jay sit down to chat about this past...,54.146233,show_02Yjg2GfjecTzu0NbTV1HD,6HiVnhPXYlpR5Gu56XH8Qg,podcasts-transcripts-0to2.tar.gz,42174


In [None]:
# Load the embedding model here so this cell does not rely on a `model`
# variable left over from a cell further down the notebook.
model = INSTRUCTOR('hkunlp/instructor-large')

text_splitter = TokenTextSplitter(chunk_size=4096, chunk_overlap=0)
texts = text_splitter.split_text(transcript)
instruction = "Represent the Podcast transcript for retrieval: "
# Only the first chunk is embedded; slicing (instead of texts[0]) also copes
# with a splitter result that contains no chunks.
texts_with_instructions = [[instruction, text] for text in texts[:1]]
customized_embeddings = model.encode(texts_with_instructions, show_progress_bar=True, batch_size=32)
customized_embeddings

In [None]:
import numpy as np
# Average the embeddings over axis 0 and return the result as a single row.
# reshape(1, -1) infers the row width instead of hard-coding 768.
np.average(customized_embeddings, axis=0).reshape(1, -1)

In [16]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'test-db'

# format the data so that it can be put into the chroma collection
embeddings = customized_embeddings.tolist()
documents = transcripts_df["transcript"].tolist()
ids = transcripts_df.index.astype(str).to_list()
metadata = transcripts_df[['episode_description', 'episode_name']].to_dict(orient='records')

# Check the inputs before touching the server: the existing collection is
# deleted below, so a mismatch must be caught before that happens.
if len(embeddings) != len(ids):
    raise ValueError(
        f"Got {len(embeddings)} embeddings for {len(ids)} documents; "
        "compute one embedding per row before loading into chroma"
    )

#loading into chroma
#client = chromadb.PersistentClient(path=persist_directory)
server_ip = os.getenv('CHROMA_SERVER_IP')
# os.getenv returns None when the variable is unset; stop with a readable message
if not server_ip:
    raise RuntimeError("CHROMA_SERVER_IP is not set; add it to the environment or the .env file")
model = INSTRUCTOR('hkunlp/instructor-large')
client = chromadb.HttpClient(host=server_ip, port=8000)

# Recreate the collection from scratch and add the documents.
# No get_collection() call beforehand: it fails when the collection does not
# exist yet, and its result was discarded by the create_collection() call anyway.
try:
    client.delete_collection("transcripts")
except Exception as exc:
    # Best effort: deletion is expected to fail when there is nothing to delete.
    # Report the reason instead of hiding it.
    print(f"Collection 'transcripts' was not deleted: {exc!r}")
collection = client.create_collection("transcripts", embedding_function=model)
collection.add(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadata,
    ids=ids,
)

In [79]:
import boto3
import pandas as pd
#my_session = boto3.Session(region_name='eu-central-1', profile_name='my-admin-profile')
# Use a dedicated name so the Chroma `client` created in other cells is not overwritten
s3_client = boto3.client('s3')
files = s3_client.list_objects(Bucket='sumar-ai')['Contents']
# NOTE: despite the plural name this holds a single object key (the third one listed)
filenames = [f['Key'] for f in files][2]
#filenames = ['dummy.csv']
#processed_files_df = pd.DataFrame({"processed_files" : filenames})
processed_files_df = read_from_s3('processed_files_timestamps', 'metadata')
# Append and write only when the key is not recorded yet, so that re-running
# the cell does not add duplicate rows.
if not processed_files_df['processed_files'].eq(filenames).any():
    # index.max() is NaN for an empty frame, so start at 0 in that case
    next_row = 0 if processed_files_df.empty else processed_files_df.index.max() + 1
    processed_files_df.loc[next_row] = filenames
    write_to_s3(processed_files_df, 'processed_files_timestamps', 'metadata')
processed_files_df

Unnamed: 0,processed_files
0,dummy.csv
1,transcript_timestamps/podcasts-transcripts-0to2-1.csv


In [80]:
# Read the bookkeeping table back from S3 and keep its keys as a plain list
processed_files_df = read_from_s3('processed_files_timestamps', 'metadata')
processed_files_list = [*processed_files_df["processed_files"]]
processed_files_df

Unnamed: 0,processed_files
0,dummy.csv
1,transcript_timestamps/podcasts-transcripts-0to2-1.csv


In [45]:
# get collection from server
collection_name = "transcript-timestamps" # transcripts-2
# model = INSTRUCTOR('hkunlp/instructor-base')
model = INSTRUCTOR('hkunlp/instructor-large')
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)
collection = client.get_collection(name=collection_name, embedding_function=model)


load INSTRUCTOR_Transformer
max_seq_length  512


In [3]:
# get collection from server
collection_name = "transcript-timestamps" # transcripts-2
# model = INSTRUCTOR('hkunlp/instructor-base')
model = INSTRUCTOR('hkunlp/instructor-large')
server_ip = os.getenv('CHROMA_SERVER_IP')
client = chromadb.HttpClient(host=server_ip, port=8000)
collection = client.get_collection(name=collection_name, embedding_function=model)

# Try querying the database
#query_texts=[["Represent the Podcast query for retrieving relevant paragraphs: ","Patriots"]]
#query_embedding = model.encode(query_texts).tolist()
#collection.query(
#    query_embeddings=query_embedding,
#    n_results=5,
#    where={"episode": "2yJYDUNmumLmTck8ornY1J"},
#)
#client.list_collections()
collection.count()


load INSTRUCTOR_Transformer
max_seq_length  512


2000

In [None]:
# One {'episode_description': ...} dict per row of transcripts_df
description_only = transcripts_df.loc[:, ['episode_description']]
description_only.to_dict(orient='records')

In [32]:
# Scratch check of the modulo operator (used for the even/odd row selection)
3 % 2

1

In [71]:
pd.set_option('display.max_colwidth', 0)
encoding = tiktoken.get_encoding("gpt2")
df = read_from_s3("podcasts-transcripts-0to2-1.csv", "transcript_timestamps")
#df["token_count"] = df.transcript.apply(lambda x: len(encoding.encode(x)))

# Key that identifies one speaker inside one episode
df["episode_speaker"] = df["episode"] + df["speakerTag"].astype(str)
# When the next row belongs to the same episode/speaker, append its text to this row
df["transcript"] = np.where(df["episode_speaker"] == df["episode_speaker"].shift(-1), df["transcript"] + " " + df["transcript"].shift(-1), df["transcript"])
# Number the consecutive runs of the same episode/speaker
df["group"] = (df["episode_speaker"] != df["episode_speaker"].shift()).cumsum()
# Index each row by its position inside its run (0, 1, 2, ...)
df = df.set_index(df.groupby("group").cumcount())
# Keep the text of even positions only; odd rows were already merged into the row before them
df["transcript"] = np.where(df.index%2==0, df["transcript"], np.nan)
# A row that absorbed its successor also takes over the successor's end time.
# The last row has no successor: shift(-1) yields NaN there, which used to blank
# its endTime and made dropna() below discard the final paragraph.
next_transcript_missing = pd.isna(df["transcript"].shift(-1)).to_numpy()
has_next_row = np.arange(len(df)) < len(df) - 1
df["endTime"] = np.where(next_transcript_missing & has_next_row, df["endTime"].shift(-1), df["endTime"])
df["paragraph_length"] = df.endTime - df.startTime
# Drops the blanked odd rows (and any other row containing a missing value)
df = df.dropna().reset_index(drop=True)
df["token_count"] = df.transcript.apply(lambda x: len(encoding.encode(x)))
# Very short paragraphs (20 tokens or fewer) are discarded
df = df[(df.token_count > 20)]
df.describe()

Unnamed: 0,startTime,endTime,speakerTag,group,paragraph_length,token_count
count,37176.0,37176.0,37176.0,37176.0,37176.0,37176.0
mean,1282.403882,1325.548736,2.003685,9952.700828,43.144854,147.48994
std,1101.002662,1099.478256,0.977439,5664.908259,16.541288,61.361887
min,0.0,7.6,1.0,1.0,3.6,21.0
25%,421.4,471.275,1.0,5435.75,29.7,100.0
50%,974.7,1018.95,2.0,9500.5,45.3,144.0
75%,1841.625,1881.125,3.0,14659.0,59.6,199.0
max,5294.4,5349.3,4.0,19775.0,326.8,335.0


In [124]:
# Show the transcript stored under index label 0 as plain text
first_transcript = transcripts_df.loc[0, 'transcript']
print(first_transcript)

Welcome back to GRE vocab.  Before we continue. We just want to give a quick shout out to the people who make this podcast possible. Anchor anchor is everything you need to make a podcast in one place. They distribute your podcast for you. They have tools that allow you to record and edit everything right from your phone or computer. It's free the even allow you to put ads in your your podcast. It's pretty awesome. Go check it out download the free anchor app or go to Anchored off dot f m-- to get started. Okay, let's get back to the show.  Today's words are fatuous, silly and pointless apocryphal.  Of doubtful authenticity, although widely circulated as being true usually in regards to a story or statement a credit e having an acrid smell accurate is having an irritant Lee Strong and unpleasant taste or smell angry bitter.  Trench in see in size of - keenness and forcefulness of thought or expression or intellect euphony the quality of being pleasing to the ear, especially through a h