In [1]:
from dotenv import load_dotenv

import PyPDF2
import sqlite3

import numpy as np
import pickle

from openai import OpenAI
import os

import streamlit as st

from pinecone import Pinecone, ServerlessSpec

In [6]:



def extract_text_from_pdf(pdf_file):
    '''
    This function extracts text from a PDF file.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        One string holding the extracted text of every page, concatenated
        in page order (no separator is inserted between pages).
    '''
    # A context manager guarantees the handle is closed, even when parsing
    # raises; extraction must happen inside the block while the file is open.
    with open(pdf_file, 'rb') as pdf_handle:
        pdf_reader = PyPDF2.PdfReader(pdf_handle)
        # `or ''` is defensive: it keeps the join safe if a page yields no text.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

def chunking(text, chunk_size=1000, overlap=200):
    '''
    This function chunks the text into smaller pieces to be used for creating embeddings.
    By default the chunk size is 1000 characters and the overlap is 200.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk (must be > 0).
        overlap: Number of characters shared by two consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of substrings of `text`. Consecutive chunks start
        `chunk_size - overlap` characters apart; the last chunk may be
        shorter than `chunk_size`. An empty `text` yields an empty list.

    Raises:
        ValueError: If `chunk_size` or `overlap` is out of range.
    '''
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    # Distance between the start offsets of two consecutive chunks.
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

def make_embeddings(client, chunks):
    '''
    Embed each chunk of text through the client's embeddings endpoint.

    Args:
        client: Object exposing `embeddings.create(input=..., model=...)`.
        chunks: Iterable of strings to embed.

    Returns:
        A list with one embedding per chunk, in the same order as `chunks`.
    '''
    embedding_model = "text-embedding-3-small"

    # One request per chunk; newlines are replaced by spaces before sending.
    responses = (
        client.embeddings.create(input=[piece.replace("\n", " ")], model=embedding_model)
        for piece in chunks
    )
    return [response.data[0].embedding for response in responses]

def create_database():
    '''
    Return a handle to the 'aipi590-project2' Pinecone index.

    The API key is read from the PINECONE_API_KEY environment variable.
    '''
    api_key = os.getenv("PINECONE_API_KEY")
    return Pinecone(api_key=api_key).Index('aipi590-project2')

def insert_embeddings(index, text, embeddings):
    '''
    Placeholder: not implemented yet.

    TODO: the body is empty, so calling this function has no effect and
    returns None regardless of the arguments.
    '''
    return None

In [7]:
def main(data_path="../data/"):
    '''
    Chunk and embed every PDF found in a directory.

    Loads environment variables from a .env file, builds an OpenAI client
    from OPENAI_KEY, then for each `*.pdf` directly inside `data_path`
    extracts its text, splits it into chunks and embeds each chunk.

    Args:
        data_path: Directory that is scanned (non-recursively) for PDF files.

    Returns:
        A dict mapping a running integer id (0, 1, 2, ...) to
        {"text": <chunk>, "embedding": <embedding>} across all files.
    '''
    load_dotenv(override=True)

    client = OpenAI(api_key=os.getenv("OPENAI_KEY"))

    meta_chunks = []
    meta_embedding = []
    # os.listdir returns entries in arbitrary order; sorting keeps the ids
    # assigned below stable from one run to the next.
    for file_name in sorted(os.listdir(data_path)):
        if not file_name.endswith(".pdf"):
            continue
        pdf_path = os.path.join(data_path, file_name)

        chunks = chunking(extract_text_from_pdf(pdf_path))
        embeddings = make_embeddings(client, chunks)

        # extend() appends in place instead of rebuilding the lists each time.
        meta_chunks.extend(chunks)
        meta_embedding.extend(embeddings)

    print(len(meta_chunks), len(meta_embedding))

    return {
        idx: {"text": chunk, "embedding": embedding}
        for idx, (chunk, embedding) in enumerate(zip(meta_chunks, meta_embedding))
    }


In [8]:
# Load the .env file before reading any key, so that a PINECONE_API_KEY or
# OPENAI_KEY defined there is visible to os.getenv on a fresh kernel.
load_dotenv(override=True)

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index('aipi590-project2')

openai_key = os.getenv("OPENAI_KEY")

client = OpenAI(api_key=openai_key)

In [22]:
# Natural-language question used to probe the index.
query = "what course should i choose in the first semester of the AI meng program at Duke?"
    
# make_embeddings takes a list of texts, so wrap the query and take the
# single resulting vector.
query_embedding = make_embeddings(client, [query])[0]

# Fetch the 5 best matches for the query vector, including each match's
# stored vector values and metadata in the response.
# NOTE(review): only the metadata is read in the next cell; include_values
# may be unnecessary here — confirm before removing.
test = index.query(
    vector=query_embedding,
    top_k=5,
    include_values=True,
    include_metadata=True
)

In [26]:
# Show the text stored in the metadata of the first returned match.
test['matches'][0]['metadata']['text']

"ess and will be able to converse in some depth in each of the areas studied upon completion. Other topics will include Supply Chain Management, Stage-Gate Development Cycles, Balances Scorecards, Blue Ocean Strategy, and Disruptive Technologies. ELECTIVE OPTIONS (SELECT THREE) Note: In addition to the courses listed below, students in this master's degree program may take other graduate-level elective courses within Duke's Pratt School of Engineering, on a space-available basis with instructor permission. At least one elective must be taken through the AI MEng program. AIPI Departmental Electives  AIPI 530: Optimization in Practice Optimization is the ultimate skill in artificial intelligence and prescriptive analytics allowing practitioners to generate the best actionable solutions for business needs. This class will give students required skills to mathematically formulate relevant business problems as optimization models, use leading software modeling syntax and solvers to generate

In [30]:
from bs4 import BeautifulSoup

# import xmltodict
import requests

def extract_text_from(url, timeout=10):
    '''
    Download a web page and return its text with blank lines removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` can block indefinitely.

    Returns:
        The text produced by BeautifulSoup's `get_text()`, with every line
        stripped of surrounding whitespace, empty lines dropped, and the
        remaining lines joined with a newline.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never returned as if it were the content.
        requests.RequestException: On connection problems or timeout.
    '''
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, features="html.parser")
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in lines if line)


In [32]:
# Smoke-test the scraper on the program's home page (performs a live HTTP request).
extract_text_from("https://ai.meng.duke.edu/")

'Duke AI Master of Engineering\nJump to navigation\nDuke Engineering\nPratt School of Engineering\nInstitute for Enterprise Engineering\nIndustry Relations\nLeadership\nNews\nContact\nWhy Duke?The Duke DifferenceCareer ServicesGraduate OutcomesWhat Tech Leaders Are SayingDegreeCertificateCoursesFacultyApply\nBecome a leader in applying AI & machine learning\nArtificial intelligence is creating immense opportunities across every industry. Duke’s unique, immersive Master of Engineering in Artificial Intelligence for Product Innovation program will prepare you with strong technical AI skills complemented by a product design perspective and hands-on practical experience building AI software applications that solve real-world problems.\nDuke AI Master of Engineering\nStudy AI and Machine Learning at Duke\nMaster\'s Degree\n30-credit degree program\nComplete in as little as 12 months\nOnline or on-campus\nGain expertise in machine learning and AI\nBuild a project portfolio\nDegree Details\nC