In [17]:
%load_ext autoreload
%autoreload 2

import json
import os
from dotenv import load_dotenv
# load_dotenv('./.env', override=True)

from tqdm import tqdm
from collections import defaultdict
from typing import List, Dict, Tuple
import pandas as pd
import numpy as np

from preprocessing import FileIO, Vectorizor, Splitters, Utilities
from sentence_transformers import SentenceTransformer
from llama_index.text_splitter import SentenceSplitter
from concurrent.futures import ProcessPoolExecutor, as_completed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 1 -->  Import Podcast Transcripts

In [29]:
data_path = './data/impact_theory_updated_Nov1.json'

#should see 385 unique podcast entries 
with open(data_path) as f:
    data =  json.load(f)
len(data)

385

In [8]:
#let's get some rough statistics on the content lengths of each podcast
lens = [len(d['content'].split()) for d in data]
df = pd.DataFrame(lens, columns=['Lengths'])
df[df['Lengths'] == 16860]

Unnamed: 0,Lengths
94,16860


In [30]:
data[94]

{'author': 'Tom Bilyeu',
 'title': 'The 3 DAILY HABITS That Destroy Your Health & DECREASE Lifespan! | Bob Hariri',
 'video_id': '3Q17HRonPok',
 'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'channel_id': 'UCnYMOamNKLGVlJgRUbamveA',
 'description': 'No description provided',
 'keywords': [],
 'length': 6518,
 'publish_date': '11-15-2022',
 'thumbnail_url': 'https://i.ytimg.com/vi/3Q17HRonPok/hq720.jpg',
 'views': 59181,
 'age_restricted': False,
 'episode_url': 'https://www.youtube.com/watch?v=3Q17HRonPok&list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
 'content': "What we put in our bodies impacts our health, our lifespan, our mental state. We live in a world where there are so many choices. The vast majority of choices are not great choices. People will often take the path of least resistance, and they'll get the fastest, most available food for their day. In many cases, what they're doing is they're poisoning themselves. Dr. Bob Harari, welcome to the show. Tom, good to see you, a

Bad pipe message: %s [b'\xd6O\x8c\x08', b'\x81\xe8\x1co|\xf6\xd1\xd5BX\xb5\n \xb7\xe6\xc6=\xfdl\xac\xd0\\\x89\xf4?\x1ee[I\x99K\x8c\xf9\xf7\xb1\x11\x86\xdb\x18A\x0c\x9d\x9aE5\x00\x08\x13\x02']
Bad pipe message: %s [b'\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t1', b'.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04', b'\x03\x06', b'\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b'\x08\x06\x04\x01\x05\x01\x06', b'']
Bad pipe message: %s [b'\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 G\xff\xd1\xe2\x00\n\xb4\x03o\xb2\xd6w\x14\xff\xdc\x9ew\xe5$\xdd\x01\x07']
Bad pipe message: %s [b',\xfa\xda\xf0\x98LE<\n\\\xb9\x00v\x8b\xf7\x04s\xcf \xb0\r\xa9\xe6;\xbf\xc8a\xc2\xbe\xb5\x07\x95\xdd\xcc\xe8\xd0iQ6\x06\xe4\x04\xc6\x9a\xc8\xf9O3R\xdc\xbd\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x0

In [7]:
#peek at what a data entry looks like (use shortest transcript)
# data[np.argmin(df)]

## Step 2 -->  Split Text into Sentences - LlamaIndex

In [6]:
#TODO: Dig into why text_splitter is using NLTK tokenizer under the hood. 

In [10]:
#discussion on chunk size
chunk_size = 196
text_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=20)

In [8]:
split_dict = splitter.split_corpus(data, text_splitter)

Docs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:21<00:00, 17.57it/s]


In [12]:
lens = [len(split_dict[key]) for key in split_dict]
sum(lens)

37007

## Step 3 -->  Encode Chunks as Vectors (Transfer to Google Colab)

### 3a.) SentenceTransformers

In [9]:
# sbert = SentenceTransformer('all-MiniLM-L6-v2'). ##  35 seconds to encode all ImpactTheory 
# model = SentenceTransformer(model_path, device='cuda:0') ## 136 seconds to encode all ImpactTheory
model = SentenceTransformer('thenlper/gte-base')

In [14]:
model_path = 'thenlper/gte-base'
base_model = 'sentence-transformers/all-MiniLM-L6-v2'
vectorizer = Vectorizor(model_name_or_path=base_model)

In [17]:
# CPU demonstration
%%time
vectors = []
from tqdm import tqdm
for sent in tqdm(split_dict[0]):
    vectors.append(model.encode(sent, device='cpu'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 280/280 [00:52<00:00,  5.30it/s]


In [19]:
len(vectors), len(split_dict[0])

(280, 280)

In [15]:
%%time
## GPU demonstration
merged_dict = vectorizer.encode_from_dict(split_dict, device='cuda:0')

Docs: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:36<00:00, 10.55it/s]

CPU times: user 49.3 s, sys: 1.71 s, total: 51 s
Wall time: 36.5 s





In [107]:
# np.save('./gte_vectors.npy', gte_vectors, allow_pickle=False)

In [18]:
docs = vectorizer.join_metadata(corpus=data, merged_dict=merged_dict, create_doc_id=True)
len(docs)

37007

In [20]:
test = [d for d in docs if d['video_id'] == 'mrND5lSPEQU']
len(test)

280

In [24]:
chunk_size

196

In [25]:
io = FileIO()
io.save_as_parquet(file_path=f'/home/elastic/notebooks/vector_search_applications/data/impact_theory_minilm_{chunk_size}.parquet', data=docs, overwrite=True)

[32m2023-10-21 21:46:34.585[0m | [1mINFO    [0m | [36mpreprocessing[0m:[36msave_as_parquet[0m:[36m34[0m - [1mDataFrame saved as parquet file here: /home/elastic/notebooks/vector_search_applications/data/impact_theory_minilm_196.parquet[0m
Bad pipe message: %s [b'<!\xb6\xc7wd\xca\x8fJ\x92#J\xb4Ra\x14\x82\x0c \xd1\xcd\xc0\xea\xd6\xc8|\x8e\t\x0c~2\x1f\xfb\x94\x87\x7f\x1c\xc8[\xa5UK@\xe87\xbb%\x08#\xe9\xaf\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.']
Bad pipe message: %s [b'\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08']
Bad pipe message: %s [b'\x08\x08\t\x08\n\x08']
Bad pipe message: %s [b'\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'\xf1\t\xccuf\x97\xca\xe8=\xcd\x91\xfd|\x84\x80f\xeb4 \xday>^\xe9\xc6p\xca\xf6\x84\x87FG\x13\xd4\xd5t\xd5\x95\xe41I\x81B\xbe\xeb\x9e\xb

## Step - 3.1 --> OPTIONAL: OpenAI Ada Embeddings

In [104]:
openai.api_key = os.environ['OPENAI_API_KEY']
model = "text-embedding-ada-002"
from openai.embeddings_utils import get_embedding, cosine_similarity
#get cost first
tokenizer = Tokenizer(model_type="cl100k_base", price=0.001)

# cost = tokenizer.get_cost(text_chunks)

In [103]:
openai.api_key

'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'

#### Working around OpenAI rate limits

In [93]:
#split text_chunks into roughly 1 million tokens total per group
for num in range(0,43000,6000):
    chunks = text_chunks[num:num+6000]
    cost = tokenizer.get_cost(chunks)
    

Total Tokens: 890642	Cost: 0.891
Total Tokens: 889149	Cost: 0.889
Total Tokens: 892516	Cost: 0.893
Total Tokens: 887084	Cost: 0.887
Total Tokens: 892144	Cost: 0.892
Total Tokens: 887583	Cost: 0.888
Total Tokens: 876077	Cost: 0.876
Total Tokens: 126477	Cost: 0.126


In [55]:
# openai.Embedding.create(text_chunks[:2], engine=model)

In [106]:
openai.api_key

'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'

In [56]:
# %%time
# embeddings = []
# for num in range(0,43000,6000):
#     chunks = text_chunks[num:num+6000]
#     results = openai.Embedding.create(input=chunks, engine=model)
#     embeddings.append(results)
#     time.sleep(60)

In [9]:
episode = '''
In the Impact Theory episode featuring Ray Dalio, a renowned billionaire investor and hedge fund manager, a wide range of topics is explored, with a central focus on the global economic landscape and the challenges it currently faces. The conversation between Tom Bilyeu and Ray Dalio delves into various aspects of economics, geopolitics, and the dynamics that shape our world today.

One of the primary concerns highlighted in the discussion is the vulnerability of the U.S. dollar. Ray Dalio expresses his unease regarding the ongoing international efforts by the BRICS nations (Brazil, Russia, India, China, and South Africa) to reduce their reliance on the dollar as the world's primary reserve currency. He emphasizes that this shift is not an outright attack on the dollar but rather a reflection of the changing economic landscape.

Dalio draws parallels between historical events where other reserve currencies, such as the British pound and the Dutch guilder, lost their dominance due to a combination of factors, including holding excessive amounts of the currency and concerns over potential sanctions. He explains that countries are now looking for alternatives to transact in, driven by a desire to avoid the risks associated with holding large amounts of dollar-denominated debt and the potential for sanctions.

The discussion also touches on the importance of financial responsibility for countries, suggesting that being financially strong is crucial in this evolving global environment. The idea of externalizing inflation through currency devaluation is explored, along with the complex interplay between interest rates, money printing, and fiscal responsibility.

Throughout the conversation, Ray Dalio emphasizes that understanding the historical context and the cyclical nature of economic events is crucial. He advocates for principles like strong family structures, quality education, and equal opportunities as foundational elements for a thriving middle class and a prosperous society. However, he acknowledges that achieving these principles in today's complex world is challenging, particularly when it comes to addressing issues like education inequality and poverty.

In summary, the Impact Theory episode featuring Ray Dalio provides valuable insights into the current state of the global economy and the challenges it faces. Dalio's expertise and historical perspective shed light on the potential vulnerabilities of the U.S. dollar and the importance of financial responsibility for nations. The conversation serves as a reminder of the complex interplay of economic forces, geopolitics, and social factors that shape our world, and it underscores the significance of addressing these challenges to ensure a prosperous future.
'''

In [10]:
len(episode.split())

406