In [1]:
from data_integration_pipeline import DataIntegrationPipeline

# Build the pipeline object (constructed with no arguments). The cells below
# call its methods to scrape web content, split documents into paragraphs,
# store them in the vector store, and query it.
pipeline = DataIntegrationPipeline()

  from tqdm.autonotebook import tqdm


# Scrape web content with query "MidJourney"

In [2]:
# Load web content for the search query "MidJourney"; the result is indexed
# like a sequence of documents in the following cells.
# NOTE(review): the first argument "chrome" presumably selects the browser used
# for scraping — confirm against DataIntegrationPipeline.load_web_content.
web_documents = pipeline.load_web_content("chrome", "MidJourney")

In [3]:
# Inspect a single scraped document (index 2) to see what the raw content looks like.
sample_document = web_documents[2]
print(sample_document)


page_content='Top highlight\n\nYou have 1 free member-only story left this month. Sign up for Medium and get an extra one.\n\nMember-only story\n\nThe Midjourney Cheat Sheet (V5)\n\nFrom prompts to parameters and weights, all in one place\n\nTristan Wolff¬∑Follow\n\nPublished inBootcamp¬∑3 min read¬∑Apr 11\n\n891\n\nListen\n\nShare\n\nYou know how it is: every now and then we have to dig through the Midjourney documentation to check a parameter‚Äôs value range or to find out whether the latest version (V5) allows us to use the cool ‚Äú ‚Äî video‚Äù feature as V3 did! Opening a new tab. Navigating through the docs. Annoying!\n\nThat‚Äôs why I created this little cheat sheet. Pretty handy if you print it out and put it next to your workstation.\n\nIt should cover the basic stuff and point out some advanced concepts.\n\nIf you think something is missing, please let me know in the comments. I will update this every time new features are added to V5.\n\nHere‚Äôs the text version in case you

# For each paragraph, the source and content are saved

In [4]:
# Split the sample document (index 2) into one document per paragraph.
paragraphs = pipeline.split_document_paragraphs(web_documents[2])

# Show the first paragraph in full, i.e. its content together with its metadata.
first_paragraph = paragraphs[0]
print(first_paragraph)

# For the remaining paragraphs, collect just the text so the output stays readable.
remaining_contents = []
for paragraph in paragraphs[1:]:
    remaining_contents.append(paragraph.page_content)
print(remaining_contents)

page_content='Top highlight' metadata={'source': 'https://medium.com/design-bootcamp/the-midjourney-cheat-sheet-v5-54b5fd92d2da?source=search_post---------6----------------------------'}
['You have 1 free member-only story left this month. Sign up for Medium and get an extra one.', 'Member-only story', 'The Midjourney Cheat Sheet (V5)', 'From prompts to parameters and weights, all in one place', 'Tristan Wolff¬∑Follow', 'Published inBootcamp¬∑3 min read¬∑Apr 11', '891', 'Listen', 'Share', 'You know how it is: every now and then we have to dig through the Midjourney documentation to check a parameter‚Äôs value range or to find out whether the latest version (V5) allows us to use the cool ‚Äú ‚Äî video‚Äù feature as V3 did! Opening a new tab. Navigating through the docs. Annoying!', 'That‚Äôs why I created this little cheat sheet. Pretty handy if you print it out and put it next to your workstation.', 'It should cover the basic stuff and point out some advanced concepts.', 'If you think 

# Store paragraphs in Pinecone DB

In [5]:
# Split every scraped document into paragraphs and flatten them into one list,
# preserving document order and paragraph order within each document.
all_paragraphs = [
    paragraph
    for document in web_documents
    for paragraph in pipeline.split_document_paragraphs(document)
]

# Store the paragraphs in the vector DB; the call's return value is left as the
# cell's last expression so the notebook displays it.
pipeline.add_documents_to_vectorstore(all_paragraphs)

Upserted vectors:   0%|          | 0/964 [00:00<?, ?it/s]

['76102398-e67a-4fb4-9ef8-1e5e034dd85d',
 'b15dda88-a815-4be9-901d-a13c42f9a677',
 'bd9e9138-e423-4aa7-8c7d-76e9fc56b876',
 'cb4dbfae-311f-467c-9051-5ee9c62a34b8',
 'ab3d37dc-1e36-4539-9790-26763ad8f552',
 'b54de808-eb4d-4d38-b1d6-1a9922a0aecb',
 'ade9d532-b086-4654-a1e5-6fd24ba3de97',
 'c43a8b68-ba92-4eb9-ab17-6adb862d0b77',
 '63ae3a50-5707-4d42-8670-15ad256ace5a',
 'f445a065-5ad1-4b9d-8370-8a8fc6a69a66',
 '75dd860c-6b10-4caa-8b72-07c77c85a5dd',
 'b252904e-d274-45b4-aa67-a2f1fe6eee1c',
 'e68101a6-d383-4736-8655-853b5be4fdf7',
 '6c563c6a-3f89-4942-8687-3816a0987aa4',
 '40bcd644-f2d7-4aaf-980f-5417d3a354e7',
 'a80feeb6-5f64-4feb-b012-124194d1cf14',
 'ef14947a-6da8-462f-81e9-db0560a9a06a',
 '84562e82-9272-40eb-aa4e-b661157fc486',
 'db15c934-b3e5-41f4-a8c5-924f4a14e79d',
 'ac8dd81b-3d1b-43e4-bbb8-992e47ed57b2',
 '9b73a1b6-ccb8-49f9-812a-b9594d910565',
 '1475b806-84c1-40d9-83ad-82ec12b59e6a',
 'c30cf589-4d11-40a9-a596-bb5c12e2185e',
 '18adac15-9a66-4c57-af90-3692e32f5d7b',
 'a3cd8766-9ba5-

# Similarity Search

In [6]:
# Run a similarity search against the vector store.
search_query = "MidJourney"
top_k = 5  # passed as k; presumably the number of matches to return — TODO confirm
query_results = pipeline.query_vectorstore(query=search_query, k=top_k)

# Print each match on its own line.
for match in query_results:
    print(match)

[Document(page_content='196', metadata={'source': 'https://medium.com/artificial-corner/youre-using-midjourney-wrong-here-s-how-to-create-better-images-than-99-of-midjourney-users-c876fbe7915e?source=search_post---------2----------------------------'}), Document(page_content='380', metadata={'source': 'https://medium.com/user-experience-design-1/how-i-used-midjourney-to-design-a-brand-identity-394cf9ddaeed?source=search_post---------0----------------------------'}), Document(page_content='-', metadata={'source': 'https://medium.com/mlearning-ai/an-advanced-guide-to-writing-prompts-for-midjourney-text-to-image-aa12a1e33b6?source=search_post---------1----------------------------'}), Document(page_content='380', metadata={'source': 'https://medium.com/ux-planet/ui-design-using-midjourney-713e8d1b6a6b?source=search_post---------3----------------------------'}), Document(page_content='494', metadata={'source': 'https://medium.com/@hortfrancis/using-midjourney-for-logo-design-fb6178b65b6d?so

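# Open question (TODO): the top matches above are short fragments such as '196', '380' and '-' rather than article text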