In [1]:
import PyPDF2
import re
import os

# Uncomment if text cleaning is desired
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
import time
from langchain.text_splitter import TokenTextSplitter
from langchain_openai import OpenAIEmbeddings
import pandas as pd
from openai import OpenAI
import dotenv

# Load variables from a .env file into the process environment. This must run
# before the client below is created.
dotenv.load_dotenv()


# Module-level OpenAI client shared by get_embedding(). No API key is passed
# explicitly, so credentials presumably come from the environment — TODO confirm.
client = OpenAI()


def parse_pdf(file):
    """Extract the text of every page of a PDF and return it as one string.

    Pages are read in document order with PyPDF2 and their text is
    concatenated directly; no separator is inserted between pages.

    Args:
        file: The PDF to parse, handed unchanged to ``PyPDF2.PdfReader``
            (the notebook passes a file object opened in binary mode).

    Returns:
        str: The concatenated text of all pages; an empty string when the
        document has no pages.
    """
    reader = PyPDF2.PdfReader(file)
    return "".join(page.extract_text() for page in reader.pages)


def text_splitting(text_to_split, chunk_size=5_000, overlap=0.1):
    """Split a string into token-based chunks with ``TokenTextSplitter``.

    Args:
        text_to_split (str): The text to be split.
        chunk_size (int, optional): Value passed to the splitter as
            ``chunk_size``. Defaults to 5_000.
        overlap (float, optional): Fraction of ``chunk_size`` used as the
            overlap between chunks. ``chunk_size * overlap`` is rounded to an
            integer and passed as ``chunk_overlap``. Defaults to 0.1.

    Returns:
        The chunks returned by ``TokenTextSplitter.split_text``.
    """
    overlap_tokens = round(chunk_size * overlap)
    splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_tokens)
    return splitter.split_text(text_to_split)


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced with spaces, then a single-item request
    is sent through the module-level ``client``.

    Args:
        text (str): The text to embed.
        model (str, optional): Name of the embedding model. Defaults to
            "text-embedding-3-small".

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding


def pdf_embedding(file, model="text-embedding-3-small"):
    """Embed the text of a PDF chunk by chunk.

    The PDF is parsed with ``parse_pdf``, split with ``text_splitting`` using
    its default settings, and every chunk is embedded with ``get_embedding``
    (one call per chunk). The original note states the OpenAI embeddings have
    dimension 1,536.

    Args:
        file: The PDF file, forwarded to ``parse_pdf``.
        model (str, optional): Embedding model name forwarded to
            ``get_embedding``. Defaults to "text-embedding-3-small".

    Returns:
        pandas.DataFrame: One row per chunk, with the chunk text in the
        ``document`` column and its embedding in the ``embedding`` column.
    """
    chunks = text_splitting(parse_pdf(file))
    chunk_embeddings = [get_embedding(chunk, model=model) for chunk in chunks]
    return pd.DataFrame({"document": chunks, "embedding": chunk_embeddings})


def embeddings_from_type(doc_type: str, file) -> pd.core.frame.DataFrame:

    match doc_type:

        case ".pdf":

            return pdf_embedding(file)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Sample document; the path is relative to the notebook's working directory.
file_path = "../sample_docs/cs50_harvard.pdf"
# Opened in binary mode; the handle is passed to pdf_embedding(), which hands
# it to parse_pdf() for reading.
with open(file_path, "rb") as file:
    # Parse, split and embed the PDF in one call. The result is a DataFrame
    # with one row per chunk ("document" and "embedding" columns).
    pdf_embeddings = pdf_embedding(file)

35988


In [6]:
# Display the frame returned by pdf_embedding(): one row per chunk with its embedding.
pdf_embeddings

Unnamed: 0,document,embedding
0,Teaching CS50 with AI\nLeveraging Generative A...,"[0.0056123631075024605, 0.003144606249406934, ..."
1,replies to\nthese threads). The CS50 Duck on E...,"[0.003431424032896757, 0.01149859931319952, 0...."


In [2]:
from pinecone import Pinecone

# The API key is read from the PINECONE_API_KEY environment variable.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
# Handle to the index named "chatarena".
index = pc.Index("chatarena")

# Snapshot of the index statistics; "total_vector_count" is read from it when
# new vector IDs are numbered.
stats = index.describe_index_stats()

In [10]:
# Number the new vectors after those already counted in `stats`: IDs run from
# index_size + 1 to index_size + n_new_vectors, one per row of the frame.
index_size = stats["total_vector_count"]
n_new_vectors = len(pdf_embeddings)
ids = [
    f"ID{i}"
    for i in range(index_size + 1, index_size + n_new_vectors + 1)
]
# Stored as a new "ID" column on the existing frame.
pdf_embeddings["ID"] = ids

In [15]:
# Metadata attached to every vector built from this document.
# NOTE(review): the values are literal placeholder strings (e.g. the text
# "file.filename", not an attribute lookup) — presumably to be replaced with
# the real upload details; confirm before these records are stored.
metadata = {
    "file name": "file.filename",
    "file type": "file_type",
    "name": "owner_name",
    "subject": "subject",
    "timestamp": "timestamp",
    "blob url": "blob_url",
}

# One record per chunk, pairing each ID with its embedding. The two columns are
# zipped directly instead of using iterrows(), which builds a Series per row.
# Each record gets its own copy of the metadata so that editing one record's
# metadata later cannot silently change all the others.
vectors = [
    {"id": vector_id, "values": embedding, "metadata": dict(metadata)}
    for vector_id, embedding in zip(pdf_embeddings["ID"], pdf_embeddings["embedding"])
]