In [1]:
from datetime import datetime
import os
import dotenv
from pinecone import Pinecone
from PyPDF2 import PdfReader
from cleaning_utils import *
from uploading_vdb_utils import *


# Sample PDF to ingest; the path is relative to the notebook's working directory.
file_path = "../sample_docs/cs50_harvard.pdf"

# pdf_embedding is not imported by name in this notebook — presumably it comes
# from one of the star imports above (verify). It is handed the open binary handle.
with open(file_path, mode="rb") as pdf_file:
    pdf_embeddings = pdf_embedding(pdf_file)

In [2]:
# Length of every entry in the "document" column.
[len(chunk) for chunk in pdf_embeddings["document"]]

[3999, 4407, 4177, 4210, 4225, 4111, 3809, 4209, 3989, 2422]

In [3]:
# Descriptive fields passed to generating_vetors alongside the embeddings.
metadata = {"file name": "CS_50", "file type": ".pdf"}

# The helper's result is unpacked into the vectors to upload and their IDs.
(vectors, ids) = generating_vetors(
    pdf_embeddings,
    metadata,
    netid="rd278",
)

# Store the IDs next to the chunks they belong to.
pdf_embeddings["ID"] = ids

In [4]:
# Length of every entry in the "ID" column.
[len(chunk_id) for chunk_id in pdf_embeddings["ID"]]

[28, 28, 28, 28, 28, 28, 28, 28, 28, 28]

In [5]:
# Display the frame, which now also carries the "ID" column assigned earlier.
pdf_embeddings

Unnamed: 0,document,values,ID
0,Teaching CS50 with AI\nLeveraging Generative A...,"[0.0016433328855782747, 0.01620206981897354, 0...",ID-rd278-1707749385.818722-0
1,\n(AI) built atop large language models (LLMs)...,"[0.006348843220621347, -0.009144599549472332, ...",ID-rd278-1707749385.818722-1
2,":1 teacher-to-student\nratio, providing each s...","[0.00807948224246502, -0.006738923955708742, 0...",ID-rd278-1707749385.818722-2
3,\nonline discussion forum for asynchronous hel...,"[3.493686381261796e-05, -0.021586531773209572,...",ID-rd278-1707749385.818722-3
4,LLMs can take a series of prompts as input and...,"[-0.008169667795300484, -0.0031846577767282724...",ID-rd278-1707749385.818722-4
5,replies to\nthese threads). The CS50 Duck on E...,"[-0.0011166228214278817, -0.001193598494865000...",ID-rd278-1707749385.818722-5
6,"% “helpful,” 21% “some-\nwhat helpful,” and 6%...","[0.014878105372190475, 0.021495763212442398, 0...",ID-rd278-1707749385.818722-6
7,-4 to be out-of-sync with the course’s latest\...,"[0.02324444241821766, 0.011087690480053425, 0....",ID-rd278-1707749385.818722-7
8,"staff. (In CS50’s MOOC, design\nis not evalua...","[0.018958529457449913, 0.02126556821167469, 0....",ID-rd278-1707749385.818722-8
9,", Nayeon Lee, Rita Frieske, Tiezheng Yu, Dan S...","[-0.03487817198038101, 0.018287474289536476, 0...",ID-rd278-1707749385.818722-9


# Uploading to Pinecone

In [2]:
# Load variables from a local .env file (if one exists) before reading the API
# key. Nothing earlier in the notebook calls load_dotenv(), so on a fresh kernel
# os.getenv() could otherwise return None for a key that lives only in .env.
dotenv.load_dotenv()

# Pinecone client authenticated with the key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Handle to the "chatarena" index, used below for upserts and queries.
index = pc.Index("chatarena")

In [17]:
# Send the prepared vectors to the index; the cell shows the upsert response.
index.upsert(vectors=vectors)

{'upserted_count': 10}

In [7]:
import mysql.connector
import os
import dotenv

# Load the MYSQL_* / MY_SQL_DATABASE settings from a local .env file, if present.
dotenv.load_dotenv()

# Open the MySQL connection used by the cells below.
try:
    cnx = mysql.connector.connect(
        user=os.getenv("MYSQL_ADMIN"),
        password=os.getenv("MYSQL_PASS"),
        host=os.getenv("MYSQL_HOST"),
        port=3306,
        database=os.getenv("MY_SQL_DATABASE"),
    )
except Exception as err:
    print(f"The connection to Azure MySQL failed.\nError: {err}\n")
    # Re-raise after reporting: without a connection `cnx` is never bound, and
    # swallowing the error here would only surface later as a NameError.
    raise

In [8]:
# Cursor on the open connection; later cells reuse it for their statements.
cursor = cnx.cursor()

# List the tables of the connected database and keep the rows for later checks.
cursor.execute("SHOW TABLES")
tables = cursor.fetchall()
print(f"Tables: {tables}")

Tables: [('documents_values',)]


In [9]:
# Destructive: removes the documents_values table (and its rows) if it exists.
cursor.execute("DROP TABLE IF EXISTS documents_values")

In [None]:
# Create the documents_values table unless it is already present.
#
# The table list is re-read here instead of reusing the `tables` snapshot taken
# earlier: that snapshot predates the DROP TABLE cell, so it would still list
# the table and the CREATE would be skipped.
try:
    cursor.execute("SHOW TABLES")
    tables = cursor.fetchall()

    if ("documents_values",) not in tables:
        # ID is the primary key; document holds the chunk text.
        cursor.execute(
            """
            CREATE TABLE documents_values (
                ID CHAR(50) PRIMARY KEY,
                document TEXT
            )
        """
        )
        print("Table created.")
    else:
        print("Table already exists.")

except Exception as err:
    # Report what actually failed (the table check/creation, not the connection).
    print(f"Checking or creating the documents_values table failed.\nError: {err}\n")

In [10]:
from sqlalchemy import create_engine
import pandas.io.sql as psql

# Build a SQLAlchemy engine whose `creator` callable hands back the already-open
# mysql.connector connection `cnx` instead of opening a new one.
engine = create_engine("mysql+mysqlconnector://", creator=lambda: cnx)

# NOTE(review): nothing is written to SQL in this cell; the DataFrame write
# happens in a later cell, after `engine` is rebuilt from the connection settings.

In [31]:
# Display the engine's repr to confirm it was created.
engine

Engine(mysql+mysqlconnector://)

In [14]:
# Preview the two columns that are written to the SQL table.
pdf_embeddings.loc[:, ["ID", "document"]]

Unnamed: 0,ID,document
0,ID-rd278-1707749385.818722-0,Teaching CS50 with AI\nLeveraging Generative A...
1,ID-rd278-1707749385.818722-1,\n(AI) built atop large language models (LLMs)...
2,ID-rd278-1707749385.818722-2,":1 teacher-to-student\nratio, providing each s..."
3,ID-rd278-1707749385.818722-3,\nonline discussion forum for asynchronous hel...
4,ID-rd278-1707749385.818722-4,LLMs can take a series of prompts as input and...
5,ID-rd278-1707749385.818722-5,replies to\nthese threads). The CS50 Duck on E...
6,ID-rd278-1707749385.818722-6,"% “helpful,” 21% “some-\nwhat helpful,” and 6%..."
7,ID-rd278-1707749385.818722-7,-4 to be out-of-sync with the course’s latest\...
8,ID-rd278-1707749385.818722-8,"staff. (In CS50’s MOOC, design\nis not evalua..."
9,ID-rd278-1707749385.818722-9,", Nayeon Lee, Rita Frieske, Tiezheng Yu, Dan S..."


In [15]:
from sqlalchemy import create_engine

from sqlalchemy.engine import URL

# Build the connection URL from its parts instead of formatting a string.
# URL.create escapes each component, so a user name or password containing
# characters such as "@", ":" or "/" no longer corrupts the URL.
engine = create_engine(
    URL.create(
        "mysql+mysqlconnector",
        username=os.getenv("MYSQL_ADMIN"),
        password=os.getenv("MYSQL_PASS"),
        host=os.getenv("MYSQL_HOST"),
        database=os.getenv("MY_SQL_DATABASE"),
    )
)

# Append the ID/document pairs to documents_values; index=False keeps the
# DataFrame index out of the table. The cell displays to_sql's return value.
pdf_embeddings[["ID", "document"]].to_sql(
    name="documents_values", con=engine, if_exists="append", index=False
)

10

# Creating the Retrieval

To answer a question, we first need the embedding of the query text.

In [3]:
# The question to answer; it is reused later as the prompt's question.
text = "Give 5 points of embeddings and Audio from Mulan paper."
# get_embedding is not imported by name in this notebook — presumably it comes
# from one of the star imports at the top (verify). Its result is used as the
# query vector for the index lookup.
query_vector = get_embedding(text)

Now we need to retrieve the closest vectors from the vector database (VDB).

In [4]:
# Ask the index for the 5 closest matches to the query vector, requesting the
# stored values and metadata along with each match.
response_from_vdb = index.query(
    top_k=5,
    vector=query_vector,
    include_metadata=True,
    include_values=True,
)

# Only 3 Chunks so far

In [5]:
# Keep the IDs of at most the first 3 matches. Slicing (instead of indexing
# positions 0..2) avoids an IndexError when fewer than 3 matches come back.
id_retrieval = [match["id"] for match in response_from_vdb["matches"][:3]]

In [6]:
# Wrap each retrieved ID in single quotes and join them with ", ".
id_list_str = ", ".join(map("'{}'".format, id_retrieval))
id_list_str

"'ID-rd278-1707933094.392059-0', 'ID-rd278-1707933091.252368-0', 'ID-rd278-1707933091.252368-4'"

In [9]:
# Fetch the stored text of the retrieved IDs.
# The IDs are sent as bound parameters (one %s placeholder per ID) rather than
# being formatted into the SQL text, so quoting is handled by the driver.
placeholders = ", ".join(["%s"] * len(id_retrieval))
query = f"SELECT document FROM documents_values WHERE ID IN ({placeholders})"

if id_retrieval:
    # Execute the query and collect the rows (one 1-tuple per document).
    cursor.execute(query, tuple(id_retrieval))
    results = cursor.fetchall()
else:
    # "IN ()" is not valid SQL; with no IDs there is nothing to look up.
    results = []

In [10]:
# Put each fetched document on a single line (newlines become spaces), then
# glue all documents together with one space between them.
flattened_documents = [row[0].replace("\n", " ") for row in results]
context = " ".join(flattened_documents)

In [11]:
from langchain_core.prompts import PromptTemplate

# Prompt text with two placeholders, {context} and {question}.
# A trailing backslash joins a line with the next one, so the leading spaces of
# the following line become part of the prompt text.
# NOTE(review): "an question" looks like a typo for "a question"; it is left
# untouched here because the string is used at runtime.
template = """You are a Duke AI assistant, your goal is to \
    provide the clearest response to an question provided.\
    if in the context you cannot find the answer, \
    reply: I'm sorry, I don't have the answer to that.
    
    This is the context: {context}
    This is the question: {question}"""

In [12]:
# Bind the template text to the two variables it expects.
prompt = PromptTemplate(
    template=template,
    input_variables=["question", "context"],
)

In [13]:
from langchain import LLMChain
from langchain_openai import OpenAI

# temperature is a setting of the model wrapper, so it is configured on
# OpenAI(...). Passing temperature=0 to run() only supplied an extra chain input
# that the prompt never references, so it did not configure the model.
llm_gpt3_5_chain = LLMChain(prompt=prompt, llm=OpenAI(temperature=0))

# Fill the prompt with the question and the retrieved context, then run the chain.
response = llm_gpt3_5_chain.run(question=text, context=context)

  warn_deprecated(


In [14]:
# print() renders the newlines in the answer instead of showing the string's repr.
print(response)


1. MuLan is a two-tower, joint audio-text embedding model that is trained using 44 million music recordings (370K hours) and weakly-associated, free-form text annotations.

2. The resulting audio-text representation subsumes existing ontologies while also being able to graduate to true zero-shot functionalities.

3. The MuLan embeddings have been shown to be versatile, with a range of experiments demonstrating their capabilities in transfer learning, zero-shot music tagging, language understanding in the music domain, and cross-modal retrieval applications.

4. MuLan uses a combination of language models and Transformer-based architectures, such as BERT and Audio Spectrogram Transformer, to embed both audio and text inputs into a shared space of dimensiond= 128.

5. The training dataset for MuLan includes 44 million 30-second clips of music audio from internet music videos, with accompanying text data from video titles, descriptions, comments, and linked playlists.


"\n\nThe main ideas of the CS50 paper by Liu et al about the use of Generative AI in education are:\n\n1. The apprehension towards AI in education: The paper acknowledges the concerns about AI disrupting education, particularly in terms of facilitating academic dishonesty. This leads to a common response of forbidding the use of AI altogether.\n\n2. Incorporating AI into CS50: The authors chose to directly incorporate Generative AI into CS50, Harvard University's introductory course in computer science. They implemented guardrails to uphold academic integrity and promote meaningful learning.\n\n3. The potential of AI in education: The paper discusses how AI can potentially improve the learning experience by providing continuous, customized support to students and allowing human educators to focus on more complex pedagogical issues.\n\n4. The development of CS50.ai: The authors developed a suite of AI tools, including a chatbot and a virtual rubber duck, to assist students in CS50. These tools were initially available to summer students, then to online students, and finally to on-campus students.\n\n5. The challenges and solutions of using AI: The paper discusses the challenges faced in integrating AI into education, such as the subjectivity of grading and the potential for academic dishonesty. The authors propose solutions, such as design50"