In [58]:
# Import modules
import os
from pinecone import Pinecone as pc, ServerlessSpec
import dotenv
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain

In [12]:
# Load the pdf textbook file
loader = PyPDFLoader("data_science_textbook.pdf")
data = loader.load()

In [20]:
# Take a look at the loaded file
# Note: If we load a PDF file, the number of document will be the same as the number of pages in the PDF file
print (f'You have {len(data)} document(s) in your data')
print (f'There are {len(data[3].page_content)} characters in your sample document')
print (f'Here is a sample: {data[3].page_content[:]}')

You have 403 document(s) in your data
There are 1797 characters in your sample document
Here is a sample: iv Contents
2.4.4 Model Implementation and Post Produc-
tion Stage . . . . . . . . . . . . . . . . . . 41
2.4.5 Project Cycle Summary . . . . . . . . . . 42
2.5 Common Mistakes in Data Science . . . . . . . . 43
2.5.1 Problem F ormulation Stage . . . . . . . . 43
2.5.2 Project Planning Stage . . . . . . . . . . . 44
2.5.3 Project Modeling Stage . . . . . . . . . . 45
2.5.4 Model Implementation and Post Produc-
tion Stage . . . . . . . . . . . . . . . . . . 46
2.5.5 Summary of Common Mistakes . . . . . . 47
3 Introduction to the Data 49
3.1 Customer Data for a Clothing Company . . . . . 49
3.2 Swine Disease Breakout Data . . . . . . . . . . . 51
3.3 MNIST Dataset . . . . . . . . . . . . . . . . . . 53
3.4 IMDB Dataset . . . . . . . . . . . . . . . . . . . 53
4 Big Data Cloud Platform 57
4.1 Power of Cluster of Computers . . . . . . . . . . 58
4.2 Evolution of Cluster Computing . . .

In [21]:
# We'll split our data into chunks around 500 characters each with a 50 character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(data)

In [26]:
# Let's see how many small chunks we have
print (f'Now you have {len(texts)} documents')
print(texts[0].page_content)

Now you have 1250 documents
Hui Lin and Ming Li
Practitioner’s Guide to
Data Science


In [40]:
# Fetch the Pinecone API key
dotenv.load_dotenv()

True

In [41]:
# Fetch the OpenAI API key from environment variables
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fetch the Pinecone API key from environment variables
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [44]:
# Innitialize the OpenAI Embeddings 
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [48]:
# Initialize Pinecone
pc = pc(api_key=pinecone_api_key)

In [50]:
# Print out all existing indexes stored in pinecone
pc.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'data-science-textbook-f38dchn.svc.apw5-4e34-81fa.pinecone.io',
              'metric': 'cosine',
              'name': 'data-science-textbook',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-west-2'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [53]:
# Check to see if the index exists, if not, create it
index_name = "data-science-textbook"
if "data-science-textbook" not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=1536, # OpenAI Embeddings dimension
        metric="cosine", 
        spec=ServerlessSpec(
            cloud="aws",
            region="us-west-2"
        ) 
    )

In [52]:
# Innitialize the index
index = pc.Index("data-science-textbook")

In [55]:
# Upsert the texts to the index
docsearch = Pinecone.from_texts(
    texts=[t.page_content for t in texts], 
    embedding=embeddings, 
    index_name=index_name)

In [59]:
# Initialize the LLM
llm = ChatOpenAI(temperature=0, openai_api_key=openai_api_key)

# Initialize the QA chain
chain = load_qa_chain(llm, chain_type="stuff")

In [64]:
# Make a query
query = "What is adaptive boosting?"

# Run the query using similarity search
docs = docsearch.similarity_search(query)

# Print the similarity ranked results
print(docs)
print(f'number of documents retrieved: {len(docs)}')

[Document(page_content='boosting. Here we introduce two main types: adaptive boosting\nand stochastic gradient boosting.\n11.7.1 Adaptive Boosting\nY oav F reund and Robert Schapire ( F reund and Schapire ,1997 )\ncame up the AdaBoost.M1 algorithm. Consider a binary classi-\nfication problem where the response variable has two categories\n𝑌 ∈ {−1, 1} . Given predictor matrix, 𝑋, construct a classifier\n𝐺(𝑋) that predicts 1or −1 . The corresponding error rate in the\ntraining set is:\n̄ 𝑒𝑟𝑟 =1\n𝑁Σ𝑁\n𝑖=1 𝐼 (𝑦𝑖≠ 𝐺(𝑥𝑖))'), Document(page_content='et. al ,2000 ), chemical substructure classification ( V armuza K and\nK,2003 ), music classification ( Bergstra et al. ,2006 ), etc. The first\neffective implementation of boosting is Adaptive Boosting (Ad-\naBoost) algorithm came up by Y oav F reund and Robert Schapire\nin 1996 ( YFR ,1999 ). After that, some researchers ( F riedman et al. ,\n2000 ) started to connect the boosting algorithm with some statisti-\ncal concepts, such as loss function

In [70]:
# Run the QA chain
chain.invoke({"input_documents": docs, "question": query})

{'input_documents': [Document(page_content='boosting. Here we introduce two main types: adaptive boosting\nand stochastic gradient boosting.\n11.7.1 Adaptive Boosting\nY oav F reund and Robert Schapire ( F reund and Schapire ,1997 )\ncame up the AdaBoost.M1 algorithm. Consider a binary classi-\nfication problem where the response variable has two categories\n𝑌 ∈ {−1, 1} . Given predictor matrix, 𝑋, construct a classifier\n𝐺(𝑋) that predicts 1or −1 . The corresponding error rate in the\ntraining set is:\n̄ 𝑒𝑟𝑟 =1\n𝑁Σ𝑁\n𝑖=1 𝐼 (𝑦𝑖≠ 𝐺(𝑥𝑖))'),
  Document(page_content='et. al ,2000 ), chemical substructure classification ( V armuza K and\nK,2003 ), music classification ( Bergstra et al. ,2006 ), etc. The first\neffective implementation of boosting is Adaptive Boosting (Ad-\naBoost) algorithm came up by Y oav F reund and Robert Schapire\nin 1996 ( YFR ,1999 ). After that, some researchers ( F riedman et al. ,\n2000 ) started to connect the boosting algorithm with some statisti-\ncal concepts,

In [78]:
# Get just the output text from the chain
answer = chain.invoke({"input_documents": docs, "question": query})["output_text"]
print(answer)

Adaptive boosting, also known as AdaBoost, is a machine learning algorithm used for binary classification problems. It was introduced by Yoav Freund and Robert Schapire in 1997. The algorithm aims to construct a classifier that predicts the binary classes (1 or -1) by iteratively combining multiple weak learners (classifiers that are marginally better than random guess). Each weak learner is trained on a modified version of the training data, where the weights of misclassified instances are increased. The final prediction is made by aggregating the predictions of all weak learners, giving more weight to the ones with higher accuracy. AdaBoost is known for its ability to handle complex datasets and improve the performance of weak learners.
