In [34]:
%load_ext autoreload 
%autoreload 2

from youtube_interface import PyTubeClient
from reranker import ReRanker
from preprocessing import Utilities
from opensearch_interface import OpenSearchClient
from sentence_transformers import SentenceTransformer
from index_templates import youtube_body
from typing import List, Union, Dict
from reranker import ReRanker
import json
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
pytube_client = PyTubeClient('fake')

In [2]:
osclient = OpenSearchClient()

In [3]:
osclient.info()

{'name': '7d5740afb0b1',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'u62AiiEFR4yWUStAhyFveg',
 'version': {'distribution': 'opensearch',
  'number': '2.9.0',
  'build_type': 'tar',
  'build_hash': '1164221ee2b8ba3560f0ff492309867beea28433',
  'build_date': '2023-07-18T21:23:29.367080729Z',
  'build_snapshot': False,
  'lucene_version': '9.7.0',
  'minimum_wire_compatibility_version': '7.10.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'The OpenSearch Project: https://opensearch.org/'}

In [4]:
osclient.show_indexes()

health status index                        uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   semantic-538-testrun         DjBPg6CdQwKbOGhJrI4YIQ   3   1        284            0      2.9mb          2.9mb
yellow open   security-auditlog-2023.09.10 q2UCytHxQXOdDxdpwOeqpA   1   1       1704            0      1.5mb          1.5mb
yellow open   test-kw-index                6EF4Q2xDT9Gz1wua5a2IpQ   3   1        158            0      5.6mb          5.6mb
yellow open   security-auditlog-2023.09.11 u3bVAn9lRReojRlF0t3sYw   1   1       1704            0      3.3mb          3.3mb
yellow open   kw-full                      uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow open   security-auditlog-2023.09.12 wJoWS1RfSBynSDjh3tV7Mw   1   1       1706            0      3.2mb          3.2mb
yellow open   security-auditlog-2023.09.13 RZG3AOK3QkKFGWBgOpzoiQ   1   1        855            0      1.7mb          1.7mb
yellow o

In [5]:
data_path = '/home/elastic/notebooks/vector_search_applications/data/impact_theory_with_vectors.json'

In [6]:
def json_data_loader(file_path: str):
    """Read a JSON file from disk and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value (a list or a dict, depending on the file).

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Pin the encoding: JSON text is UTF-8, and the platform default
    # (e.g. cp1252 on Windows) would mis-decode non-ASCII transcripts.
    with open(file_path, encoding='utf-8') as f:
        return json.load(f)

In [7]:
data = json_data_loader(data_path)
# The index template stores embeddings under `content_embedding`, so move the
# raw `vector` field there. The guard keeps this cell re-runnable: a later cell
# saves the renamed records back to this same file, after which the records no
# longer carry a `vector` key and an unconditional d['vector'] raises KeyError.
for d in data:
    if 'vector' in d:
        d['content_embedding'] = d.pop('vector')
# Last expression of the cell, so the record count is actually displayed.
len(data)

In [30]:
sorted(list(data[0].keys()))

['age_restricted',
 'author',
 'channel_id',
 'content',
 'content_embedding',
 'description',
 'episode_num',
 'group_id',
 'keywords',
 'length',
 'playlist_id',
 'publish_date',
 'thumbnail_url',
 'title',
 'unique_id',
 'video_id',
 'views']

In [32]:
def create_video_url(video_id: str, playlist_id: str):
    """Build the YouTube watch URL for a video, optionally scoped to a playlist.

    Args:
        video_id: YouTube video identifier.
        playlist_id: YouTube playlist identifier. When it is empty or None the
            `list` query parameter is omitted, instead of producing
            `&list=None` or a dangling `&list=`.

    Returns:
        The watch URL as a string.
    """
    if not playlist_id:
        return f'https://www.youtube.com/watch?v={video_id}'
    return f'https://www.youtube.com/watch?v={video_id}&list={playlist_id}'

In [33]:
# Attach a playlist-scoped watch URL to every record (mutates `data` in place).
for d in data:
    d['episode_url'] = create_video_url(d['video_id'], d['playlist_id'])

In [37]:
# NOTE: this writes the updated records back to the file they were loaded from
# (`data_path`), overwriting the original input. Reusing the variable keeps the
# load and save locations from drifting apart.
pytube_client.save_meta_toJSON(data, out_path=data_path)

[32m2023-09-13 20:37:45.686[0m | [1mINFO    [0m | [36myoutube_interface[0m:[36msave_meta_toJSON[0m:[36m195[0m - [1mMetadata saved as: /home/elastic/notebooks/vector_search_applications/data/impact_theory_with_vectors.json.[0m


### Keyword (KW) Indexing

In [38]:
index_name = "kw-impact-theory"
youtube_body

{'settings': {'number_of_shards': 3,
  'refresh_interval': '30s',
  'index': {'knn': True}},
 'mappings': {'properties': {'title': {'type': 'text', 'index': 'true'},
   'unique_id': {'type': 'keyword', 'index': 'false'},
   'group_id': {'type': 'short', 'index': 'false'},
   'video_id': {'type': 'keyword', 'index': 'false'},
   'playlist_id': {'type': 'keyword', 'index': 'false'},
   'episode_url': {'type': 'keyword', 'index': 'false'},
   'episode_num': {'type': 'short', 'index': 'false'},
   'description': {'type': 'text', 'index': 'true'},
   'length': {'type': 'long', 'index': 'false'},
   'publish_date': {'type': 'keyword', 'index': 'false'},
   'views': {'type': 'long', 'index': 'false'},
   'thumbnail_url': {'type': 'keyword', 'index': 'false'},
   'content': {'type': 'text', 'index': 'true'},
   'content_embedding': {'type': 'knn_vector', 'dimension': 384}}}}

In [40]:
osclient.document_indexer(index_name=index_name, data=data, body_template=youtube_body)

[32m2023-09-13 20:38:10.661[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m207[0m - [1mThe ** kw-impact-theory ** index was created[0m
[32m2023-09-13 20:38:10.662[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36mdocument_indexer[0m:[36m218[0m - [1mThe # of documents to be indexed = 33164[0m


In [44]:
osclient.indices.delete("security-audit*")

{'acknowledged': True}

In [41]:
osclient.indices.refresh(index=index_name)

{'_shards': {'total': 6, 'successful': 3, 'failed': 0}}

In [48]:
osclient.show_indexes()

health status index                        uuid                   pri rep docs.count docs.deleted store.size pri.store.size
yellow open   kw-538-testrun               Wam6NsdMR7K5lE8N8ZBTRQ   3   1        284            0    351.4kb        351.4kb
green  open   .opensearch-observability    nN299E0QS9OvsRh_UcbJVQ   1   0          0            0       208b           208b
yellow open   .plugins-ml-config           IEeXrm-DRiOMm2qzo7PbqA   1   1          1            0      3.9kb          3.9kb
yellow open   semantic-538-testrun         DjBPg6CdQwKbOGhJrI4YIQ   3   1        284            0      2.9mb          2.9mb
yellow open   kw-impact-theory             2MjMun4bQYOoeUpv5UsJxg   3   1      33164            0     29.4mb         29.4mb
yellow open   test-kw-index                6EF4Q2xDT9Gz1wua5a2IpQ   3   1        158            0      5.6mb          5.6mb
yellow open   kw-full                      uNhdaqbnRVuyJci_L1Om8Q   3   1       6678            0     12.1mb         12.1mb
yellow o

In [18]:
osclient.cat.count(index=index_name, format="json")

[{'epoch': '1694636432', 'timestamp': '20:20:32', 'count': '33164'}]

In [19]:
def keyword_search(query: str,
                   index: str,
                   size: int=10,
                   return_raw: bool=False,
                   source_fields: Union[List[str], None]=None,
                   exclude_phrase: Union[str, None]="Vishal"):
    """Run a full-text `match` query on the `content` field of an index.

    The request is sent through the notebook-level `osclient`.

    Args:
        query: Free-text query matched against `content`.
        index: Name of the index to search.
        size: Maximum number of hits to return.
        return_raw: If True return the whole search response, otherwise only
            the list under response['hits']['hits'].
        source_fields: Fields to return in each hit's `_source`. Defaults to
            the original hard-coded list (which includes `show_link`).
        exclude_phrase: Hits whose `content` contains this exact phrase are
            filtered out. Defaults to "Vishal" (the original hard-coded
            value); pass None or "" to disable the filter.

    Returns:
        The raw response dict when `return_raw` is True, else the list of hits.
    """
    if source_fields is None:
        source_fields = ['content', 'group_id', 'show_link', 'video_id', 'length',
                         'publish_date', 'thumbnail_url', 'title', 'views']

    bool_clause = {"must": {"match": {"content": query}}}
    if exclude_phrase:
        # Only attach the filter when there is actually a phrase to exclude.
        bool_clause["filter"] = {
            "bool": {"must_not": {"match_phrase": {"content": exclude_phrase}}}
        }

    body = {
        "_source": list(source_fields),
        "size": size,
        "query": {"bool": bool_clause},
    }
    response = osclient.search(body=body, index=index)
    if return_raw:
        return response
    return response['hits']['hits']

In [21]:
query = "Does trump have support in iowa"

In [23]:
# osclient.keyword_search(query, index=index_name)

In [29]:
osclient.indices.get_mapping(index=index_name)

{'kw-impact-theory': {'mappings': {'properties': {'age_restricted': {'type': 'boolean'},
    'author': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'channel_id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'content': {'type': 'text'},
    'description': {'type': 'text'},
    'episode_num': {'type': 'short', 'index': False},
    'episode_url': {'type': 'keyword', 'index': False},
    'group_id': {'type': 'short', 'index': False},
    'length': {'type': 'long', 'index': False},
    'playlist_id': {'type': 'keyword', 'index': False},
    'publish_date': {'type': 'keyword', 'index': False},
    'thumbnail_url': {'type': 'keyword', 'index': False},
    'title': {'type': 'text'},
    'unique_id': {'type': 'keyword', 'index': False},
    'video_id': {'type': 'keyword', 'index': False},
    'views': {'type': 'long', 'index': False}}}}}

### Semantic Indexing

In [26]:
#upload data
utils = Utilities()

In [62]:
path = '/home/elastic/notebooks/vector_search_applications/data/podcast_transcripts/fivethirtyeight/'

In [74]:
# data = utils.json_data_loader(path)
# #be sure to change vector field name to "content_embedding"
# # [d.update(content_embedding=d['vector']) for d in data]
# # for d in data:
# #     del d['vector']

In [46]:
sem_index = 'semantic-impact-theory'
model = SentenceTransformer('all-minilm-l6-v2')
# osclient.indices.delete(sem_index)

In [47]:
osclient.indices.delete(sem_index)

{'acknowledged': True}

In [50]:
# osclient.document_indexer(index_name=sem_index, data=data, body_template=youtube_body, semantic_index=True)

Bad pipe message: %s [b'\xee\xf5\x96\xc1\n\xe3XE:L\xf8yQ\x14A\xad\xb0\x0f \x07W\x13\x11\x81\xc9\x8co0O\x99.4\x08\xf0\x10x\x9dH\x02\xcaK\x17D_<\x83[\x02\x8d(\x97\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06']
Bad pipe message: %s [b'\x07\x08']
Bad pipe message: %s [b'\t\x08\n\x08\x0b\x08\x04']
Bad pipe message: %s [b";\xee\xdc\xb6#\xe8p\x10\x0c\xf4Y\x9e\x07\xe2\x08\x8b\xca\xa4\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\

In [159]:
def vector_search(query: str,
                  index: str,
                  model: "Union[str, SentenceTransformer]",
                  size: int=10,
                  k: int=10,
                  return_raw: bool=False
                  ) -> Union[dict, List[dict]]:
    """Run a k-NN query against the `content_embedding` field of an index.

    The query text is embedded with `model` and the request is sent through
    the notebook-level `osclient`.

    Args:
        query: Text to embed and search for.
        index: Name of the index to search.
        model: Either an already loaded embedding model (anything exposing
            `encode(text)` whose result has `.tolist()`), or a model name, in
            which case a SentenceTransformer is loaded from that name on every
            call. Prefer passing a loaded model when searching repeatedly.
        size: Maximum number of hits to return.
        k: Value passed as `k` in the knn clause.
        return_raw: If True return the whole search response, otherwise only
            the list under response['hits']['hits'].

    Returns:
        The raw response dict when `return_raw` is True, else the list of hits.
    """
    # Previously a str model left `query_embedding` unassigned and the call
    # failed with UnboundLocalError; resolve the name to a model first.
    if isinstance(model, str):
        model = SentenceTransformer(model)
    query_embedding = model.encode(query).tolist()

    body = {
        "_source": ['title', 'episode_id', 'group_id', 'episode_num', 'episode_url', 'mp3_url', 'content'],
        "size": size,
        "query": {"knn": {"content_embedding": {"vector": query_embedding, "k": k}}},
    }
    response = osclient.search(body=body, index=index)
    if return_raw:
        return response
    return response['hits']['hits']

In [21]:
query = 'who is more powerful, Musk or Bezos?'

In [23]:
response = osclient.vector_search(query, sem_index, model)

### Reranking + Hybrid Search

In [24]:
reranker = ReRanker()

In [25]:
hybrid = osclient.hybrid_search(query, kw_index=index_name, vec_index=sem_index, model=model)
final = reranker.rerank(hybrid, query, top_k=20)

[32m2023-09-13 20:22:28.903[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m327[0m - [1mDuplicate Hit: zm0QVutAkYg-112 on index semantic-impact-theory[0m
[32m2023-09-13 20:22:28.904[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m327[0m - [1mDuplicate Hit: SaIkelDUDic-15 on index semantic-impact-theory[0m
[32m2023-09-13 20:22:28.905[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m327[0m - [1mDuplicate Hit: sl3XhHs6ggs-8 on index semantic-impact-theory[0m
[32m2023-09-13 20:22:28.906[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m327[0m - [1mDuplicate Hit: OoGghm0_Q8I-30 on index semantic-impact-theory[0m
[32m2023-09-13 20:22:28.907[0m | [1mINFO    [0m | [36mopensearch_interface[0m:[36m_deduplicate_results[0m:[36m327[0m - [1mDuplicate Hit: rKByaM5asU8-124 on index semantic-impact-theory[0m
[32

In [71]:
def cleanup(results: List[dict]) -> List[dict]:
    """Remove repeated hits, keeping the first occurrence of each one.

    Two hits count as the same when their `_source` has the same
    `group_id` / `video_id` combination. The order of the surviving hits
    follows the input order.

    Args:
        results: Search hits, each with a `_source` dict holding `group_id`
            and `video_id`.

    Returns:
        A new list containing the de-duplicated hits.
    """
    first_by_key = {}
    for hit in results:
        source = hit['_source']
        key = f"{source['group_id']}-{source['video_id']}"
        # setdefault only stores the hit if the key has not been seen yet,
        # and dicts remember insertion order.
        first_by_key.setdefault(key, hit)
    return list(first_by_key.values())

In [79]:
final = cleanup(final)

In [27]:
def display_results(results: List[dict], threshhold: float=0.0, include_indexes: bool=False):
    """Keep only the hits whose `cross_score` is strictly above a threshold.

    Args:
        results: Reranked hits, each carrying a `cross_score` entry.
        threshhold: Exclusive lower bound on `cross_score`.
        include_indexes: When True, also return the `_index` of every kept hit.

    Returns:
        The list of kept hits, or a `(kept_hits, index_names)` tuple when
        `include_indexes` is True.
    """
    kept = []
    for hit in results:
        if hit['cross_score'] > threshhold:
            kept.append(hit)

    if not include_indexes:
        return kept
    return kept, [hit['_index'] for hit in kept]

In [28]:
display_results(final, threshhold=-10, include_indexes=True)

([{'_index': 'kw-impact-theory',
   '_id': 'UNgzkIoBPTzzIQZAms8O',
   '_score': 14.798523,
   '_source': {'group_id': 12,
    'length': 5884,
    'title': 'WHY YOUR LIFE IS SO BORING... (Fix This To Find Fulfillment) | Tom Bilyeu',
    'thumbnail_url': 'https://i.ytimg.com/vi/NoSkC1hn23Q/hq720.jpg',
    'publish_date': '01-28-2023',
    'content': "Really is awesome, by the way. Even though it's black and white and it's all the shit for the TikTok generation, they're never going to go for it. But it's a really fantastic film and Orson Welles made it when he was 24 and he made it when arguably the most powerful man in the world. So imagine somebody like Bill Gates coming after you and oh God, even better, one of the guys that owns like a media conglomerate, Jeff Bezos owns Washington Post. So the guy, Rudolph Hearst, owned what at the time was the media publication. Citizen Kane is about that guy. So it would be like somebody making a movie about Jeff Bezos and that he's corrupt and lon

In [5]:
osclient.indices.get_mapping(index='kw-full')

{'kw-full': {'mappings': {'properties': {'content': {'type': 'text'},
    'episode_id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'episode_num': {'type': 'short', 'index': False},
    'episode_url': {'type': 'keyword', 'index': False},
    'group_id': {'type': 'long'},
    'id': {'type': 'keyword', 'index': False},
    'mp3_url': {'type': 'keyword', 'index': False},
    'summary': {'type': 'text'},
    'title': {'type': 'text'}}}}}

In [14]:
osclient.indices.get_mapping(index='kw-538-testrun')

{'kw-538-testrun': {'mappings': {'properties': {'channel_id': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'content': {'type': 'text'},
    'description': {'type': 'text'},
    'episode_url': {'type': 'keyword', 'index': False},
    'group_id': {'type': 'short', 'index': False},
    'length': {'type': 'long', 'index': False},
    'playlist_id': {'type': 'keyword', 'index': False},
    'publish_date': {'type': 'keyword', 'index': False},
    'show_link': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
    'thumbnail_url': {'type': 'keyword', 'index': False},
    'title': {'type': 'text'},
    'video_id': {'type': 'keyword', 'index': False},
    'views': {'type': 'long', 'index': False}}}}}

In [10]:
this = hash("This is some text")

In [8]:
hash("And this is some text as well?")

872434049414064053

In [13]:
this == hash("This is some text")

True