In [80]:
import os
import openai
import json
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
import pprint

import sys
sys.path.append('/Users/jasonz/forward_data_lab_llmie/forward_data-llm_ie/')
from src import web_extractor



# Specify the path to config.json (adjust the path as needed)
config_file_path = '/Users/jasonz/forward_data_lab_llmie/forward_data-llm_ie/config.json'

with open(config_file_path, "r") as config_file:
    config = json.load(config_file)
    openai.api_key = config["api_key"]

In [81]:
llm = ChatOpenAI(temperature=0, openai_api_key=openai.api_key)

In [116]:
url = "https://www.shopify.com/blog/ecommerce-seo-beginners-guide"
webpage_text = web_extractor.ExtractTextFromWebpage(url)

# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size =1000, chunk_overlap = 0)

# Split the text
split_text = text_splitter.split_text(webpage_text)

# Print the resulting pieces
# for piece in split_text:
#     print(piece)

In [88]:
from langchain.chains import create_extraction_chain

schema = {
    "properties": {
        "blog_article_title": {"type": "string"},
        "blog_article_summary": {"type": "string"},
        "blog_article_steps": {"type": "string"},
    },
    "required": ["blog_article_title", "blog_article_summary", "blog_article_steps"],
}

def extract(content: str, schema: dict):
    return create_extraction_chain(schema=schema, llm=llm).run(content)

In [105]:
# Process the first split 
extracted_content = extract(
        schema=schema, content=split_text[0]
    )
pprint.pprint(extracted_content)

[{'blog_article_steps': 'ShopifySolutionsStartStart your businessBuild your '
                        'brandCreate your websiteOnline store editorCustomize '
                        'your storeStore themesFind business appsShopify app '
                        'storeOwn your site domainDomains & hostingExplore '
                        'free business toolsTools to run your businessSellSell '
                        'your productsSell online or in personCheck out '
                        'customersWorld-class checkoutSell onlineGrow your '
                        'business onlineSell across channelsReach millions of '
                        'shoppers and boost',
  'blog_article_summary': 'Email address Create your store Build your dream '
                          'business for $1/month Start your free trial, then '
                          'enjoy 3 months of Shopify for $1/month when you '
                          'sign up for a monthly Basic or Starter plan. Sign '
               

MultiQueryRetriver:

In [115]:
# Build a sample vectorDB
from langchain.vectorstores import Chroma
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter



# Load blog post
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
splits = text_splitter.split_documents(data)

# VectorDB
embedding = OpenAIEmbeddings(openai_api_key=openai.api_key)
#vectordb = Chroma.from_documents(documents=splits, embedding=embedding)

In [112]:
from typing import List

from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain.pydantic_v1 import BaseModel, Field, validator

model_name = "gpt-3.5-turbo"
temperature = 0.0
model = OpenAI(model_name=model_name, temperature=temperature, openai_api_key=openai.api_key)



In [117]:
# Here's another example, but with a compound typed field.
class ShopifyBlog(BaseModel):
    name: str = Field(description="Article Title")
    article_steps: List[str] = Field(description="Extract the steps shown in the blog on How to create an ecommerce SEO strategy")


actor_query = split_text[0]

parser = PydanticOutputParser(pydantic_object=ShopifyBlog)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

_input = prompt.format_prompt(query=actor_query)

output = model(_input.to_string())

parser.parse(output)

ShopifyBlog(name="Ecommerce SEO: The Ultimate Beginner's Guide (2023)", article_steps=['Email address', 'Create your store', 'Build your dream business for $1/month', 'Start your free trial, then enjoy 3 months of Shopify for $1/month when you sign up for a monthly Basic or Starter plan.', 'Sign up for a free trial', 'Select a monthly Basic or Starter plan', '$1/month pricing will be applied at checkout', 'Add products, launch your store, and start selling!', 'Start a free trial and enjoy 3 months of Shopify for $1/month on select plans.', 'Sign up now', "ShopifySolutionsStartStart your businessBuild your brandCreate your websiteOnline store editorCustomize your storeStore themesFind business appsShopify app storeOwn your site domainDomains & hostingExplore free business toolsTools to run your businessSellSell your productsSell online or in personCheck out customersWorld-class checkoutSell onlineGrow your business onlineSell across channelsReach millions of shoppers and boost salesSell

Map Reduce to split page text than summarize each piece of text, and then after summaize them altogether

In [139]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain

llm = ChatOpenAI(temperature=0, openai_api_key=openai.api_key)

# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please identify the main themes 
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)
loader = WebBaseLoader("https://www.shopify.com/blog/ecommerce-seo-beginners-guide")
docs = loader.load()

In [140]:
from langchain import hub
map_prompt = hub.pull("rlm/map-prompt")
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [141]:
# Reduce
reduce_template = """The following is set of summaries:
{doc_summaries}
Take these and distill it into a final, consolidated summary of the main themes. 
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)

In [142]:
# Note we can also get this from the prompt hub, as noted above
reduce_prompt = hub.pull("rlm/map-prompt")

In [143]:
# Run chain
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# Combines and iteravely reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=4000,
)

In [144]:
# Combining documents by mapping a chain over them, then combining results
map_reduce_chain = MapReduceDocumentsChain(
    # Map chain
    llm_chain=map_chain,
    # Reduce chain
    reduce_documents_chain=reduce_documents_chain,
    # The variable name in the llm_chain to put the documents in
    document_variable_name="docs",
    # Return the results of the map steps in the output
    return_intermediate_steps=False,
)

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, chunk_overlap=0
)
split_docs = text_splitter.split_documents(docs)

Created a chunk of size 1195, which is longer than the specified 1000


In [148]:
map_reduce_output = map_reduce_chain.run(split_docs)
print(map_reduce_output)

Based on the list of documents provided, the main themes can be identified as follows:

1. Ecommerce SEO: The Ultimate Beginner's Guide (2023)
2. Create your store
3. Build your dream business for $1/month
4. Start a free trial and enjoy 3 months of Shopify for $1/month on select plans.

The main themes in these documents are:

1. Starting a business: "Start your business", "Build your brand", "Create your website", "Customize your store", "Find business apps", "Own your site domain", "Explore free business tools", "Tools to run your business"
2. Selling products: "Sell your products", "Sell online or in person", "Check out customers", "World-class checkout", "Sell online", "Grow your business online", "Sell across channels", "Reach millions of shoppers and boost sales", "Sell in person", "Point of Sale (POS)", "Sell globally", "International sales", "Sell wholesale & direct", "Business-to-business (B2B)", "Accept online payments", "Set up forms of payment"
3. Marketing and promoting t

In [150]:
# Here's another example, but with a compound typed field.
class ShopifyBlog(BaseModel):
    name: str = Field(description="Article Title")
    article_steps: List[str] = Field(description="Extract the steps shown in the blog on How to create an ecommerce SEO strategy")


actor_query = map_reduce_output

parser = PydanticOutputParser(pydantic_object=ShopifyBlog)

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

_input = prompt.format_prompt(query=actor_query)

output = model(_input.to_string())

parser.parse(output)

OutputParserException: Failed to parse ShopifyBlog from completion {"name": "Ecommerce SEO: The Ultimate Beginner's Guide (2023)", "article_main_idea": "Ecommerce SEO", "article_steps": ["Build your brand", "Create your website", "Customize your store", "Find business apps", "Own your site domain", "Explore free business tools", "Tools to run your business", "Sell your products", "Sell online or in person", "Check out customers", "World-class checkout", "Sell online", "Grow your business online", "Sell across channels", "Reach millions of shoppers and boost sales", "Sell in person", "Point of Sale (POS)", "Sell globally", "International sales", "Sell wholesale & direct", "Business-to-business (B2B)", "Accept online payments", "Set up forms of payment", "Market your business", "Reach & retain customers", "Market across social", "Social media integrations", "Chat with customers", "Nurture customers", "Know your audience", "Gain customer insights", "Manage your business", "Track sales, orders & analytics", "Measure your performance", "Analytics and Reporting", "Ship orders faster", "Manage your stock & orders", "Inventory & order management", "Outsource fulfillment & returns", "Get paid faster", "Secure business funding", "Automate your business", "Explore all Shopify products & features", "Shopify Editions", "New, innovative Shopify products", "Scale your business", "Essential tools", "Business name generator", "Logo maker", "Stock photography", "Business Plan Template", "Link in bio tool", "QR code generator", "Get 24/7 support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "What is Shopify?", "How our commerce platform works", "Founder Stories", "Build your brand from scratch", "Build a marketing plan", "Ecommerce SEO", "Improve your search ranking", "Social media strategy", "Turn social into sales", "Help and support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "Popular topics", "USA", "Australia", "Belgium", "Brasil", "Canada", "Colombia", "Danmark", "Deutschland", "España", "France", "Hong Kong SAR", "India", "Indonesia", "Ireland", "Italia", "Malaysia", "México", "Nederland", "New Zealand", "Nigeria", "Norway", "Philippines", "Singapore", "South Africa", "Sverige", "United Kingdom", "Việt Nam", "대한민국", "中国", "中國香港特別行政區", "台灣", "日本"]}

{"name": "Create your store", "article_main_idea": "Starting a business", "article_steps": ["Start your business", "Build your brand", "Create your website", "Customize your store", "Find business apps", "Own your site domain", "Explore free business tools", "Tools to run your business", "Sell your products", "Sell online or in person", "Check out customers", "World-class checkout", "Sell online", "Grow your business online", "Sell across channels", "Reach millions of shoppers and boost sales", "Sell in person", "Point of Sale (POS)", "Sell globally", "International sales", "Sell wholesale & direct", "Business-to-business (B2B)", "Accept online payments", "Set up forms of payment", "Market your business", "Reach & retain customers", "Market across social", "Social media integrations", "Chat with customers", "Nurture customers", "Know your audience", "Gain customer insights", "Manage your business", "Track sales, orders & analytics", "Measure your performance", "Analytics and Reporting", "Ship orders faster", "Manage your stock & orders", "Inventory & order management", "Outsource fulfillment & returns", "Get paid faster", "Secure business funding", "Automate your business", "Explore all Shopify products & features", "Shopify Editions", "New, innovative Shopify products", "Scale your business", "Essential tools", "Business name generator", "Logo maker", "Stock photography", "Business Plan Template", "Link in bio tool", "QR code generator", "Get 24/7 support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "What is Shopify?", "How our commerce platform works", "Founder Stories", "Build your brand from scratch", "Build a marketing plan", "Ecommerce SEO", "Improve your search ranking", "Social media strategy", "Turn social into sales", "Help and support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "Popular topics", "USA", "Australia", "Belgium", "Brasil", "Canada", "Colombia", "Danmark", "Deutschland", "España", "France", "Hong Kong SAR", "India", "Indonesia", "Ireland", "Italia", "Malaysia", "México", "Nederland", "New Zealand", "Nigeria", "Norway", "Philippines", "Singapore", "South Africa", "Sverige", "United Kingdom", "Việt Nam", "대한민국", "中国", "中國香港特別行政區", "台灣", "日本"]}

{"name": "Build your dream business for $1/month", "article_main_idea": "Starting a business", "article_steps": ["Start your business", "Build your brand", "Create your website", "Customize your store", "Find business apps", "Own your site domain", "Explore free business tools", "Tools to run your business", "Sell your products", "Sell online or in person", "Check out customers", "World-class checkout", "Sell online", "Grow your business online", "Sell across channels", "Reach millions of shoppers and boost sales", "Sell in person", "Point of Sale (POS)", "Sell globally", "International sales", "Sell wholesale & direct", "Business-to-business (B2B)", "Accept online payments", "Set up forms of payment", "Market your business", "Reach & retain customers", "Market across social", "Social media integrations", "Chat with customers", "Nurture customers", "Know your audience", "Gain customer insights", "Manage your business", "Track sales, orders & analytics", "Measure your performance", "Analytics and Reporting", "Ship orders faster", "Manage your stock & orders", "Inventory & order management", "Outsource fulfillment & returns", "Get paid faster", "Secure business funding", "Automate your business", "Explore all Shopify products & features", "Shopify Editions", "New, innovative Shopify products", "Scale your business", "Essential tools", "Business name generator", "Logo maker", "Stock photography", "Business Plan Template", "Link in bio tool", "QR code generator", "Get 24/7 support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "What is Shopify?", "How our commerce platform works", "Founder Stories", "Build your brand from scratch", "Build a marketing plan", "Ecommerce SEO", "Improve your search ranking", "Social media strategy", "Turn social into sales", "Help and support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "Popular topics", "USA", "Australia", "Belgium", "Brasil", "Canada", "Colombia", "Danmark", "Deutschland", "España", "France", "Hong Kong SAR", "India", "Indonesia", "Ireland", "Italia", "Malaysia", "México", "Nederland", "New Zealand", "Nigeria", "Norway", "Philippines", "Singapore", "South Africa", "Sverige", "United Kingdom", "Việt Nam", "대한민국", "中国", "中國香港特別行政區", "台灣", "日本"]}

{"name": "Start a free trial and enjoy 3 months of Shopify for $1/month on select plans", "article_main_idea": "Starting a business", "article_steps": ["Start your business", "Build your brand", "Create your website", "Customize your store", "Find business apps", "Own your site domain", "Explore free business tools", "Tools to run your business", "Sell your products", "Sell online or in person", "Check out customers", "World-class checkout", "Sell online", "Grow your business online", "Sell across channels", "Reach millions of shoppers and boost sales", "Sell in person", "Point of Sale (POS)", "Sell globally", "International sales", "Sell wholesale & direct", "Business-to-business (B2B)", "Accept online payments", "Set up forms of payment", "Market your business", "Reach & retain customers", "Market across social", "Social media integrations", "Chat with customers", "Nurture customers", "Know your audience", "Gain customer insights", "Manage your business", "Track sales, orders & analytics", "Measure your performance", "Analytics and Reporting", "Ship orders faster", "Manage your stock & orders", "Inventory & order management", "Outsource fulfillment & returns", "Get paid faster", "Secure business funding", "Automate your business", "Explore all Shopify products & features", "Shopify Editions", "New, innovative Shopify products", "Scale your business", "Essential tools", "Business name generator", "Logo maker", "Stock photography", "Business Plan Template", "Link in bio tool", "QR code generator", "Get 24/7 support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "What is Shopify?", "How our commerce platform works", "Founder Stories", "Build your brand from scratch", "Build a marketing plan", "Ecommerce SEO", "Improve your search ranking", "Social media strategy", "Turn social into sales", "Help and support", "How-to guides", "Read in-depth business guides", "Business Courses", "Learn from proven experts", "Shopify blog", "Business strategy tips", "Popular topics", "USA", "Australia", "Belgium", "Brasil", "Canada", "Colombia", "Danmark", "Deutschland", "España", "France", "Hong Kong SAR", "India", "Indonesia", "Ireland", "Italia", "Malaysia", "México", "Nederland", "New Zealand", "Nigeria", "Norway", "Philippines", "Singapore", "South Africa", "Sverige", "United Kingdom", "Việt Nam", "대한민국", "中国", "中國香港特別行政區", "台灣", "日本"]}. Got: Extra data: line 3 column 1 (char 2406)