## Store the text content of PPT pages in Pinecone

Initialize the Pinecone and OpenAI clients with their API keys.

In [None]:
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone

# Load variables from a local .env file before the API clients are constructed.
load_dotenv()
# No api_key argument is passed here; presumably the client reads its key from
# the environment populated above -- confirm, or pass api_key= explicitly.
client = OpenAI() # api_key=

# Same pattern for Pinecone: the key is not passed explicitly (assumed to come
# from the environment -- confirm).
pc = Pinecone() # api_key=
# Handle to the Pinecone index named "ta"; the later upsert calls go through it.
index = pc.Index("ta")


Read the text on each page of each PPT file, then save the text content of all pages to a .json file.

In [None]:
from pptx import Presentation
import json

def extract_text_from_ppt(ppt_path):
    """Collect the text of every slide in a PowerPoint file.

    Args:
        ppt_path: Path of the presentation, handed to ``Presentation``.

    Returns:
        A dict mapping the 1-based slide number to that slide's text: the
        ``text`` of every shape that has one, concatenated in shape order
        with no separator and with vertical-tab characters removed.
    """
    slides = Presentation(ppt_path).slides
    # NOTE(review): shape texts are joined without any separator, so the last
    # word of one shape runs into the first word of the next.
    return {
        slide_no: "".join(
            shape.text.replace("\x0b", "")
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
        for slide_no, slide in enumerate(slides, start=1)
    }

import os  # used by listdir/join below; no earlier cell imports it

# Extract the text of every presentation in the slides folder and save it as
# {file name: {page number: page text}} in ppt_text.json.
folder_path = './slides_PPT'
# Sorted so the JSON content does not depend on directory listing order.
files = sorted(os.listdir(folder_path))
ppts = {}
for file in files:
    # Only hand real .pptx decks to the parser: skip stray entries and the
    # "~$..." lock files PowerPoint leaves next to an open deck.
    if file.startswith('~$') or not file.lower().endswith('.pptx'):
        continue
    path = os.path.join(folder_path, file)
    ppts[file] = extract_text_from_ppt(path)

# Page numbers are int keys here; JSON stores object keys as strings.
with open('ppt_text.json', "w") as json_file:
    json.dump(ppts, json_file)

For the text content of each page, compute an embedding and store the vector in the Pinecone vector database.

In [None]:
import json
# Load the page text saved earlier: {file name: {page number: page text}}.
# After the JSON round trip the page numbers are strings.
with open('./ppt_text.json', "r") as json_file:
    ppts = json.load(json_file)

# Embed each page's text and store one vector per page in the 'ppt_data'
# namespace, keyed by file name and page number.
for file_name, ppt_pages in ppts.items():
    for page_num, page_content in ppt_pages.items():
        # Slides without any text are stored as empty strings, and the
        # embeddings endpoint rejects empty input, so skip them instead of
        # letting one blank slide abort the whole run.
        if not page_content.strip():
            continue
        res = client.embeddings.create(input=[page_content], model="text-embedding-3-small")
        vector = res.data[0].embedding
        index.upsert(
            vectors=[
                {
                    "id": f"PPT-{file_name}-{page_num}",
                    "values": vector,
                    "metadata": {"file_name": f"{file_name}", "page_num": page_num},
                }
            ],
            namespace='ppt_data',
        )

## Generate PPT page images and save them locally.

In [1]:
import fitz  # PyMuPDF
import os

def save_pdf_as_images(pdf_path, output_path, dpi=100):
    """Render every page of a PDF to a PNG file.

    Pages are written to ``output_path`` as ``page_1.png``, ``page_2.png``,
    ... in page order.

    Args:
        pdf_path: Path of the PDF file to render.
        output_path: Directory that receives the PNG files; it is created
            if it does not exist yet.
        dpi: Resolution of the rendered images. Defaults to 100.
    """
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # The scale factor is dpi / 72, as in the per-page computation this
    # replaces; it is the same for every page, so build the matrix once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            pix = page.get_pixmap(matrix=mat)
            pix.save(os.path.join(output_path, f"page_{page_number + 1}.png"))
    finally:
        # Close the document even when rendering or saving a page fails.
        pdf_document.close()

In [2]:
# Render every PDF in the slides folder into ./slides_image/<name>.pptx/,
# i.e. a directory named after the deck's .pptx file name.
pdf_path = './slides_PDF/'
for pdf_file in sorted(os.listdir(pdf_path)):
    stem, ext = os.path.splitext(pdf_file)
    # Skip anything that is not a PDF (hidden files, sub-directories, ...).
    if ext.lower() != '.pdf':
        continue
    # Swap only the extension; str.replace('pdf', 'pptx') would also rewrite
    # a "pdf" that happens to appear inside the file name itself.
    output_path = './slides_image/' + stem + '.pptx'
    os.makedirs(output_path, exist_ok=True)
    save_pdf_as_images(pdf_path + pdf_file, output_path)
    print(f'{pdf_file} finished!')

ch01-intro.pdf finished!
ch02-mapreduce.pdf finished!
ch03-lsh.pdf finished!
ch04-streams1.pdf finished!
ch04-streams2.pdf finished!
ch05-linkanalysis1.pdf finished!
ch05-linkanalysis2.pdf finished!
ch06-assocrules.pdf finished!
ch07-clustering.pdf finished!
ch08-advertising.pdf finished!
ch09-recsys1.pdf finished!
ch09-recsys2.pdf finished!
ch10-graphs1.pdf finished!
ch10-graphs2.pdf finished!
ch11-dimred.pdf finished!
ch12-ml1.pdf finished!
ch12-ml2.pdf finished!
