# Data Integration Pipeline Demo

In [1]:
from pdf_and_text_utils import load_pdf
from keyword_extraction_tfidf import get_search_terms, pdf_docs_to_str
from scraper import load_web_content
from vectorstore_controller import VectorstoreController

  from tqdm.autonotebook import tqdm


#### Create the VectorstoreController

In [2]:
# Instantiate the controller that the cells below use to add documents to,
# query, and split text for the vectorstore.
vectorstore_controller = VectorstoreController()

#### Load the PDF and add it to the vectorstore

In [3]:
# Relative path to the sample PDF used throughout this demo.
pdf_path = "spielwiese/test_pdf/3_ProjectManagement.pdf"
# Load the PDF; the result is reused below for keyword extraction.
pdf_docs = load_pdf(pdf_path)
# Upsert the loaded PDF documents into the vectorstore.
# NOTE(review): the saved output of this cell ends in a PineconeProtocolError
# ("Failed to connect; did you specify the correct index name?"), so the
# upsert did not complete in the recorded run.
vectorstore_controller.add_documents_to_vectorstore(pdf_docs)

Upserted vectors:   0%|          | 0/106 [00:00<?, ?it/s]

PineconeProtocolError: Failed to connect; did you specify the correct index name?

#### Extract a search query from the PDF

In [4]:
# Convert the loaded PDF documents to a single string
# (per the helper's name — verify in keyword_extraction_tfidf).
pdf_str = pdf_docs_to_str(pdf_docs)
# Derive search terms from that text; the result is passed to the web scraper below.
search_query = get_search_terms(text=pdf_str)
# Bare last expression so the notebook displays the extracted query.
search_query

'model team ml projects data science data scientists anti patterns'

#### Scrape the web for content and store in vectorstore

In [5]:
# Scrape web content for the extracted query.
# 'chrome' presumably selects the browser/driver — TODO confirm in scraper.load_web_content.
web_content = load_web_content('chrome', search_query)
# Display the first scraped item as a sample.
# NOTE(review): this indexing will raise if `web_content` comes back empty.
web_content[0]

Document(page_content='Machine Learning Model Serving Overview (Seldon Core, KFServing, BentoML, MLFlow)\n\nYinon Data·Follow\n\nPublished inEverything Full Stack·8 min read·Nov 25, 2020\n\nListenShare\n\n-\n\nListen\n\nShare\n\nTLDR; I’m looking for a way to provide Data Scientists with tools to deploy a growing number of models independently, with minimal Engineering and DevOps efforts for each deployment. After considering several model serving solutions, I found Seldon Core to be the most suitable for this project’s needs\n\nEdit August 27, 2021:\n\nI’ve created a video tutorial for getting started with Seldon Core, watch it here:  ML Model Serving at Scale Tutorial — Seldon Core\n\nML Model Serving at Scale Tutorial — Seldon Core\n\nContext\n\nI’m currently building an ML based system for my client.To give you a simplified context without getting too much into the details — the goal of the ML system is to help the main business system by providing real time predictions based on tr

In [6]:
# Upsert the scraped web documents into the vectorstore.
# NOTE(review): the saved output of this cell shows a PineconeProtocolError
# ("Failed to connect; did you specify the correct index name?").
vectorstore_controller.add_documents_to_vectorstore(web_content)

Upserted vectors:   0%|          | 0/10 [00:00<?, ?it/s]

PineconeProtocolError: Failed to connect; did you specify the correct index name?

In [4]:
# Split the PDF document into chunks.
# NOTE(review): neither `split_pdf` nor `pdf_pages` is imported or defined in
# any visible cell of this notebook (the PDF is loaded into `pdf_docs` above),
# so this cell appears to depend on hidden kernel state — confirm the intended
# import and input before relying on Restart & Run All.
splits = split_pdf(pdf_pages)

Created a chunk of size 475, which is longer than the specified 200
Created a chunk of size 209, which is longer than the specified 200
Created a chunk of size 210, which is longer than the specified 200
Created a chunk of size 266, which is longer than the specified 200
Created a chunk of size 567, which is longer than the specified 200
Created a chunk of size 202, which is longer than the specified 200
Created a chunk of size 764, which is longer than the specified 200
Created a chunk of size 285, which is longer than the specified 200
Created a chunk of size 214, which is longer than the specified 200
Created a chunk of size 438, which is longer than the specified 200
Created a chunk of size 429, which is longer than the specified 200
Created a chunk of size 365, which is longer than the specified 200
Created a chunk of size 362, which is longer than the specified 200
Created a chunk of size 452, which is longer than the specified 200
Created a chunk of size 298, which is longer tha

In [8]:
import os

# Diagnostic for the "did you specify the correct index name?" Pinecone errors
# recorded in this notebook: show which index name the environment provides.
# .get() is used so that a missing variable is reported explicitly instead of
# aborting the cell with a bare KeyError; output is unchanged when it is set.
print(os.environ.get('PINECONE_INDEX_NAME', '<PINECONE_INDEX_NAME is not set>'))

se4ai-chatbot


In [6]:
# Store the PDF chunks (`splits`) in the vectorstore.
# NOTE(review): the saved output of this cell shows a PineconeProtocolError
# ("Failed to connect; did you specify the correct index name?").
vectorstore_controller.add_documents_to_vectorstore(documents=splits)

Upserted vectors:   0%|          | 0/183 [00:00<?, ?it/s]

PineconeProtocolError: Failed to connect; did you specify the correct index name?

In [27]:
# Similarity search against the vectorstore.
# k=2 presumably requests the two best matches and get_raw_text=True presumably
# returns plain text rather than document objects — TODO confirm in
# VectorstoreController.query_vectorstore.
query_results = vectorstore_controller.query_vectorstore(
    query="When was Langchain released?", k=2, get_raw_text=True)


In [28]:
# Show every retrieved result, each followed by a blank line.
for hit in query_results:
    print(str(hit) + " \n")


[1]
LangChain was launched in October 2022 as an
open source project by Harrison Chase, while
working at machine learning startup Robust
Intelligence. 

LangChain
Developer(s)Harrison Chase
Initial releaseOctober 2022
Repository github.com/hwchase17/langchain
(https://github.com/hwchase17/la
ngchain)
Written in Python and JavaScript
Type Software framework for large
language model application
development
License MIT License
Website LangChain.com (https://langchai
n.com/)LangChain
LangChain is a software development framework
designed to simplify the creation of applications using
large language models (LLMs). 



# Split paragraphs of blog post

In [5]:
# Sample input for the paragraph splitter below: an excerpt from the blog post at
# https://uxdesign.cc/how-i-used-midjourney-to-design-a-brand-identity-394cf9ddaeed
blog_test = """4. Post editing

MidJourney isnt perfect. There will be situations where further editing is needed. Whether its fine-tuning colors or composition, I use Photoshop for the finishing touches.

If youd like to continue to edit using Midjourney text prompts, Lars Nielsens prompt guide is a great beginners reference.

In closing
This exercise will by no means guarantee the same results for everyone. Rather, its to showcase an example of AIs power applied to design.

Designers will ultimately need to consider how AI can be used purposefully. Using this technology can have an instrumental impact on streamlining tedious design workflows but also has the ability to compromise the quality of our work if were not careful.

Moving forward, its important that we collectively build upon our AI knowledge as a community to guarantee success in our creative work.

Happy blending!
"""


In [7]:
# Split the sample blog text into paragraphs and print each one beneath a
# dashed separator line.
parts = vectorstore_controller.split_paragraphs(blog_test)
for paragraph in parts:
    print("----------------", paragraph, sep="\n")


----------------
4. Post editing
----------------
MidJourney isnt perfect. There will be situations where further editing is needed. Whether its fine-tuning colors or composition, I use Photoshop for the finishing touches.
----------------
If youd like to continue to edit using Midjourney text prompts, Lars Nielsens prompt guide is a great beginners reference.
----------------
In closing
This exercise will by no means guarantee the same results for everyone. Rather, its to showcase an example of AIs power applied to design.
----------------
Designers will ultimately need to consider how AI can be used purposefully. Using this technology can have an instrumental impact on streamlining tedious design workflows but also has the ability to compromise the quality of our work if were not careful.
----------------
Moving forward, its important that we collectively build upon our AI knowledge as a community to guarantee success in our creative work.
----------------
Happy blending!'

