In [27]:
import os
import requests
from dotenv import load_dotenv
from IPython.display import display as Markdown
from langchain_community.document_loaders import TextLoader, PyPDFLoader


# Load settings from a .env file (python-dotenv), then read the Hugging Face
# token from the environment. `api` is None when the variable is not set.
load_dotenv()
api = os.environ.get('HUGGING_FACE_API_KEY')

In [28]:
# Point a PDF loader at the local "What is generative AI?" explainer.
loader = PyPDFLoader('./pdf/what-is-generative-ai.pdf')
# The recorded output shows load() yielding a list of Document objects whose
# metadata carries the 'source' path and a zero-based 'page' index.
pdf_file = loader.load()
# Bare last expression: the loaded documents become the cell's output.
pdf_file

[Document(metadata={'source': './pdf/what-is-generative-ai.pdf', 'page': 0}, page_content='January 2023\nMcKinsey Explainers\nWhat is generative AI?\nGenerative artificial intelligence (AI) describes algorithms (such \nas ChatGPT) that can be used to create new content, including \naudio, code, images, text, simulations, and videos. Recent new \nbreakthroughs in the field have the potential to drastically change \nthe way we approach content creation.'),
 Document(metadata={'source': './pdf/what-is-generative-ai.pdf', 'page': 1}, page_content='Generative AI systems fall under the broad \ncategory of machine learning, and here’s how one \nsuch system—ChatGPT—describes what it can do:\nReady to take your creativity to the next level? \nLook no further than generative AI! This nifty form \nof machine learning allows computers to generate \nall sorts of new and exciting content, from music \nand art to entire virtual worlds. And it’s not just for \nfun—generative AI has plenty of practical

In [29]:
# Markdown(pdf_file[0].page_content)

## Model

### Splitting

In [42]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Configure the splitter (chunk_size=500, chunk_overlap=100) and cut the
# loaded PDF pages into chunks.
text_split = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
# Only the first 10 chunks are kept for the cells that follow.
docs = text_split.split_documents(pdf_file)[:10]

In [32]:
# Number of chunks currently held in `docs`.
# NOTE(review): the recorded output for this cell may be stale -- its execution
# count (32) is lower than the splitting cell's (42). Re-run after splitting.
len(docs)

2

### Vector Embedding

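Embeddings model: `sentence-transformers/all-MiniLM-L6-v2`, called through the Hugging Face Inference API. Given a `source_sentence` and a list of `sentences`, the endpoint returns one similarity score per sentence rather than raw embedding vectors.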

In [43]:
import requests

# Hosted inference endpoint for the all-MiniLM-L6-v2 sentence-transformers
# model, and the bearer-token header sent with every request.
API_URL = "https://api-inference.huggingface.co/models/sentence-transformers/all-MiniLM-L6-v2"
headers = {"Authorization": "Bearer {}".format(api)}


def query(payload, timeout=30):
  """POST ``payload`` as JSON to ``API_URL`` and return the decoded reply.

  Args:
    payload: JSON-serialisable request body.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` waits indefinitely, which would hang the kernel.

  Returns:
    The response body decoded from JSON. The HTTP status is not checked
    here; callers inspect the decoded body for an ``'error'`` key.

  Raises:
    requests.exceptions.Timeout: If the server does not answer in time.
    ValueError: If the response body is not valid JSON.
  """
  response = requests.post(API_URL, headers=headers, json=payload, timeout=timeout)
  return response.json()

def get_embeddings(source_sentence, sentences, verbose=True):
  """Ask the inference API to compare ``source_sentence`` with ``sentences``.

  Despite the name, the recorded outputs in this notebook show the endpoint
  returning a flat list of floats, one per entry in ``sentences`` (similarity
  scores), not embedding vectors.

  Args:
    source_sentence: Text to compare against.
    sentences: List of texts to compare with ``source_sentence``.
    verbose: When True (the default), print the raw API reply.

  Returns:
    The list returned by the API, or None when the API reports an error or
    replies with anything that is not a list.
  """
  output = query({
    "inputs": {
      "source_sentence": source_sentence,
      "sentences": sentences
    }
  })

  if verbose:
    print('API response', output)

  # Only a dict can carry an 'error' key. Guarding on the type keeps a None,
  # numeric, or plain-string reply from raising TypeError here.
  if isinstance(output, dict) and 'error' in output:
    print(f"Error from API: {output['error']}")
    return None

  if isinstance(output, list):
    return output

  print('Unexpected format', output)
  return None

In [44]:
# Compare every chunk with every chunk: one API call per chunk, each call
# sending the full list of chunk texts as the comparison set.
# The text list does not change between iterations, so build it once.
page_contents = [d.page_content for d in docs]

all_embeddings = []
for document in docs:
  source_sentence = document.page_content
  embeddings = get_embeddings(source_sentence, page_contents)
  # get_embeddings returns None on an API error or unexpected reply; those
  # chunks are skipped, so rows may not line up one-to-one with `docs`.
  if embeddings is not None:
    all_embeddings.append(embeddings)

API response [0.9999998807907104, 0.8468075394630432, 0.7417824864387512, 0.6171432733535767, 0.5358301997184753, 0.6794014573097229, 0.7063855528831482, 0.5247949957847595, 0.536730945110321, 0.32393527030944824]
API response [0.8468075394630432, 1.0, 0.7277600169181824, 0.5657753348350525, 0.5645157098770142, 0.6013243198394775, 0.7152350544929504, 0.45174118876457214, 0.5081220269203186, 0.31998926401138306]
API response [0.741782546043396, 0.7277601361274719, 0.9999999403953552, 0.6852745413780212, 0.5358399152755737, 0.6001232266426086, 0.6627333760261536, 0.3332376480102539, 0.3658885061740875, 0.18336352705955505]
API response [0.6171431541442871, 0.5657752752304077, 0.6852746605873108, 1.0000001192092896, 0.6024659872055054, 0.5648976564407349, 0.512198269367218, 0.393480509519577, 0.450045645236969, 0.19179870188236237]
API response [0.5358301401138306, 0.5645157098770142, 0.535839855670929, 0.6024661064147949, 1.0000001192092896, 0.5807186365127563, 0.5414723753929138, 0.4734

In [49]:
# Dump every collected row: one list of scores per chunk whose request succeeded.
print(all_embeddings) 

[[0.9999998807907104, 0.8468075394630432, 0.7417824864387512, 0.6171432733535767, 0.5358301997184753, 0.6794014573097229, 0.7063855528831482, 0.5247949957847595, 0.536730945110321, 0.32393527030944824], [0.8468075394630432, 1.0, 0.7277600169181824, 0.5657753348350525, 0.5645157098770142, 0.6013243198394775, 0.7152350544929504, 0.45174118876457214, 0.5081220269203186, 0.31998926401138306], [0.741782546043396, 0.7277601361274719, 0.9999999403953552, 0.6852745413780212, 0.5358399152755737, 0.6001232266426086, 0.6627333760261536, 0.3332376480102539, 0.3658885061740875, 0.18336352705955505], [0.6171431541442871, 0.5657752752304077, 0.6852746605873108, 1.0000001192092896, 0.6024659872055054, 0.5648976564407349, 0.512198269367218, 0.393480509519577, 0.450045645236969, 0.19179870188236237], [0.5358301401138306, 0.5645157098770142, 0.535839855670929, 0.6024661064147949, 1.0000001192092896, 0.5807186365127563, 0.5414723753929138, 0.4734347462654114, 0.6165812015533447, 0.3298359513282776], [0.67