![](2023-09-09-19-16-04.png)

In [1]:
import openai
from bs4 import BeautifulSoup
import os
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
browserless_api_key = os.getenv("BROWSERLESS_API_KEY")

In [2]:
def get_response(prompt_question):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k-0613",
        messages=[{"role": "system", "content": "You are a helpful research and\
            programming assistant"},
                  {"role": "user", "content": prompt_question}]
    )
    
    return response["choices"][0]["message"]["content"]


# Goal

Given an initial url. I want an agent to use the resources found there to try and answer a given query (rather than look for it using the a search engine).

In [128]:
import re
import requests
import json
import tiktoken

def get_num_tokens(prompt, model="gpt-3.5-turbo-16k-0613"):
    
    enc = tiktoken.encoding_for_model(model)

    return len(enc.encode(prompt))


def scrape_website(url: str, max_token_size=16385) -> str:
    """Scrape the contents of a website and return the text"""
    response = requests.get(url)
    # soup = BeautifulSoup(response.text, "html.parser")
    # webpage_contents = soup.get_text()
    output = response.text
    num_tokens = get_num_tokens(output)
    if num_tokens>max_token_size:
        len_diff = int((num_tokens - max_token_size)*4.2)
        output = output[:len(output) - len_diff]
    
    return output 


def extract_clean_urls(input_string):
    # Step 1: Extract the URLs using regular expression
    url_pattern = r'https?://[^\s]+'
    raw_urls = re.findall(url_pattern, input_string)
    
    # Step 2: Clean the URLs
    cleaned_urls = []
    for url in raw_urls:
        # Remove trailing symbols like '\n1', '\n2', etc.
        cleaned_url = re.sub(r'\\n\d+$', '', url)
        cleaned_urls.append(cleaned_url)
    
    # Step 3: Return the cleaned URLs as a Python list
    return cleaned_urls

In [4]:
url = "https://keras.io/examples/"

website_contents = scrape_website(url)
website_contents

'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n\n  <meta charset="utf-8">\n  <meta name="viewport" content="width=device-width, initial-scale=1">\n  <meta name="description" content="Keras documentation">\n  <meta name="author" content="Keras Team">\n  <link rel="shortcut icon" href="https://keras.io/img/favicon.ico">\n\n  <!-- Social -->\n  <meta property="og:title" content="Keras documentation: Code examples">\n  <meta property="og:image" content="https://keras.io/img/logo-k-keras-wb.png">\n  <meta name="twitter:title" content="Keras documentation: Code examples">\n  <meta name="twitter:image" content="https://keras.io/img/k-keras-social.png">\n  <meta name="twitter:card" content="summary">\n\n  <title>Code examples</title>\n\n  <!-- Bootstrap core CSS -->\n  <link href="/css/bootstrap.min.css" rel="stylesheet">\n\n  <!-- Custom fonts for this template -->\n  <link href="https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600;700;800&display=swap" rel="stylesheet">\n\n  <!-- C

In [5]:
# extract all the links from the page
links = extract_clean_urls(website_contents)
links

['https://keras.io/img/favicon.ico">',
 'https://keras.io/img/logo-k-keras-wb.png">',
 'https://keras.io/img/k-keras-social.png">',
 'https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600;700;800&display=swap"',
 "https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);",
 "https://www.google-analytics.com/analytics.js','ga');",
 'https://buttons.github.io/buttons.js"></script>',
 'https://www.googletagmanager.com/ns.html?id=GTM-5DNGF4N"',
 'https://colab.research.google.com/notebooks/welcome.ipynb">Google',
 'https://github.com/keras-team/keras-io">keras.io',
 'https://github.com/keras-team/keras-io/blob/master/README.md"><code>tutobooks</code>',
 'https://policies.google.com/terms">Terms</a>',
 'https://policies.google.com/privacy">Privacy</a>']

In [6]:
import tiktoken

def get_num_tokens(prompt):
    
    enc = tiktoken.encoding_for_model("gpt-3.5-turbo-16k-0613")

    return len(enc.encode(prompt))


get_num_tokens(website_contents)

14348

In [7]:
topic = "Image classification"
prompt = f"GIven the contents of a webpage shown below: {website_contents}, extract all the relevant urls related to this topic: '{topic}'. The output should ONLY be a Python list with the complete urls containing the domain, for example: ['https://domain.com/example/nested/', ....]. Output:"

urls = get_response(prompt)

In [8]:
import ast

urls = ast.literal_eval(urls)
urls

['https://keras.io/examples/vision/image_classification_from_scratch',
 'https://keras.io/examples/vision/mnist_convnet',
 'https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning',
 'https://keras.io/examples/vision/image_classification_with_vision_transformer',
 'https://keras.io/examples/vision/bit',
 'https://keras.io/examples/vision/attention_mil_classification',
 'https://keras.io/examples/vision/mlp_image_classification',
 'https://keras.io/examples/vision/mobilevit',
 'https://keras.io/examples/vision/xray_classification_with_tpus',
 'https://keras.io/examples/vision/cct',
 'https://keras.io/examples/vision/convmixer',
 'https://keras.io/examples/vision/eanet',
 'https://keras.io/examples/vision/involution',
 'https://keras.io/examples/vision/perceiver_image_classification',
 'https://keras.io/examples/vision/reptile',
 'https://keras.io/examples/vision/semisupervised_simclr',
 'https://keras.io/examples/vision/swin_transformers',
 'https://keras.io/examp

Now, for each of these urls, we would like to search for information related to the problem or topic we are looking for.

In [9]:

for url in urls:
    url_contents = scrape_website(url)
    prompt2 = f"Given this topic im searching for: '{topic}', is there relevant information in this url: {url} whose contents are below: '''{url_contents}'''? The output should ONLY be 'True' or 'False', nothing else. Output:"
    if get_num_tokens(url_contents) > 16385:
        print("The contents of this url are too long to be processed by GPT-3. Please try another url.")
    else:
        response = get_response(prompt2)
        print("Answer:")
        print(response)
        break

The contents of this url are too long to be processed by GPT-3. Please try another url.
Answer:
False


In [10]:
url

'https://keras.io/examples/vision/mnist_convnet'

Now that we have an resource url containing relevant information we can prompt ChatGPT to extract what we want from that url.

In [11]:
url_contents = scrape_website(url)
prompt3 = f"Extract the relevant code for '{topic}' from the following contents: {url_contents}."
final_output = get_response(prompt3)

final_output

'Sure! The relevant code for \'Image classification\' is as follows:\n\n```python\nimport numpy as np\nfrom tensorflow import keras\nfrom tensorflow.keras import layers\n\n# Model / data parameters\nnum_classes = 10\ninput_shape = (28, 28, 1)\n\n# Load the data and split it between train and test sets\n(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n\n# Scale images to the [0, 1] range\nx_train = x_train.astype("float32") / 255\nx_test = x_test.astype("float32") / 255\n\n# Make sure images have shape (28, 28, 1)\nx_train = np.expand_dims(x_train, -1)\nx_test = np.expand_dims(x_test, -1)\nprint("x_train shape:", x_train.shape)\nprint(x_train.shape[0], "train samples")\nprint(x_test.shape[0], "test samples")\n\n# convert class vectors to binary class matrices\ny_train = keras.utils.to_categorical(y_train, num_classes)\ny_test = keras.utils.to_categorical(y_test, num_classes)\n\nmodel = keras.Sequential(\n    [\n        keras.Input(shape=input_shape),\n        lay

<!-- Final output example:

import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

# Model / data parameters
num_classes = 10
input_shape = (28, 28, 1)

# Load the data and split it between train and test sets
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

# Scale images to the [0, 1] range
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255

# Make sure images have shape (28, 28, 1)
x_train = np.expand_dims(x_train, -1)
x_test = np.expand_dims(x_test, -1)

# Convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

model = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Flatten(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

batch_size = 128
epochs = 15

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1]) -->

First level automation with langchain would be to use PromptTemplates and OutputParsers to organize this code into a more cohesive tool
in this case we would join everything into a chain that we could use to perform the actions we want

In [78]:
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
# Lets run this first prompt inside a chain
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

output_parser = CommaSeparatedListOutputParser()
topic = "Image classification"
#prompt = f"Given the contents of a webpage shown below: {website_contents}, extract all the relevant urls related to this topic: '{topic}'. The output should ONLY be a Python list with the complete urls containing the domain, for example: ['https://domain.com/example/nested/', ....]. Output:"
llm = ChatOpenAI(temperature=0.0, model="gpt-3.5-turbo-16k-0613")
prompt = PromptTemplate(
    input_variables=["website_contents", "topic"],
    template="Given the contents of a webpage shown below: {website_contents}, extract all the relevant urls related to this topic: '{topic}'. The output should ONLY be a Python list with the complete urls containing the domain, for example: ['https://domain.com/example/nested/', ....]. Output:",
    output_parser=output_parser
)

chain1 = LLMChain(llm=llm, prompt=prompt)
# Run the chain only specifying the input variable.
urls = chain1.run({"website_contents": website_contents,"topic": topic})
urls = ast.literal_eval(urls)
print(urls)

['https://keras.io/examples/vision/image_classification_from_scratch', 'https://keras.io/examples/vision/mnist_convnet', 'https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning', 'https://keras.io/examples/vision/image_classification_with_vision_transformer', 'https://keras.io/examples/vision/bit', 'https://keras.io/examples/vision/attention_mil_classification', 'https://keras.io/examples/vision/mlp_image_classification', 'https://keras.io/examples/vision/mobilevit', 'https://keras.io/examples/vision/xray_classification_with_tpus', 'https://keras.io/examples/vision/cct', 'https://keras.io/examples/vision/convmixer', 'https://keras.io/examples/vision/eanet', 'https://keras.io/examples/vision/involution', 'https://keras.io/examples/vision/perceiver_image_classification', 'https://keras.io/examples/vision/reptile', 'https://keras.io/examples/vision/semisupervised_simclr', 'https://keras.io/examples/vision/swin_transformers', 'https://keras.io/examples/vision/vit_sm

Perfect! Now that we seem to have succeessffuly extracted the urls, let's go to the next step.

In [80]:

template = "Given this topic im searching for: '{topic}', is there relevant information in this url: {url} whose contents are below: '''{website_contents}'''? The output should ONLY be 'True' or 'False', nothing else. Output:"

prompt2 = PromptTemplate(
    input_variables=["topic", "url", "website_contents"],
    template=template,
    
    
)
chain2 = LLMChain(llm=llm, prompt=prompt2)
topic = "Image classification"
url = urls[1]
url_contents = scrape_website(url)
output = chain2.run({"topic": topic, "url": url, "website_contents": url_contents})
print(output)

False


In [95]:
template = "Given the contents of a webpage shown below: {website_contents}, extract the relevant formatted code related to this topic: '{topic}'.\
    The output should ONLY be Python code formatted as such:\
        '''\
        <imports>\
            Imports go here\
        <imports>\
        <Data loading and preprocessing>\
            Data code goes here\
        <Data loading and preprocessing>\
        <training and evaluation>\
            training and eval code go here\
        <training and evaluation>\
        <visualization>\
            viz code goes here\
        <visualization>\
        <inference code>\
            inference code goes here\
        <inference code>\
        <model saving code>\
            model saving code goes here\
        <model saving code>\
        '''\
    In the case where the code is not found or irrelevant for specific parts, just leave a blank there.\
    Output:\
        Code:\n"

prompt3 = PromptTemplate(
    input_variables=["website_contents", "topic"],
    template=template,
)
chain3 = LLMChain(llm=llm, prompt=prompt3)
chain3.run({"website_contents": website_contents, "topic": topic})

"''' \n# Imports\nimport numpy as np\nimport pandas as pd\nimport tensorflow as tf\nfrom tensorflow import keras\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\n\n# Data loading and preprocessing\ndata = pd.read_csv('health_data.csv')\nX = data.drop('target', axis=1)\ny = data['target']\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n\n# Model architecture\nmodel = keras.Sequential([\n    keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),\n    keras.layers.Dense(32, activation='relu'),\n    keras.layers.Dense(1, activation='sigmoid')\n])\n\n# Model compilation\nmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n\n# Model training\nmodel.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_data=(X

In [130]:
from langchain.chains.base import Chain
from typing import Any, Dict, List, Optional

class ConcatenateChain(Chain):
    chain_1: LLMChain
    chain_2: LLMChain
    chain_3: LLMChain
    
    @property
    def input_keys(self) -> List[str]:
        all_input_vars = set(self.chain_1.input_keys)
        return list(all_input_vars)
    
    @property
    def output_keys(self) -> List[str]:
        return ['final_output']
    
    def _call(self, inputs: Dict[str, str]) -> Dict[str, str]:
        output1 = self.chain_1.run(inputs)
        output1 = ast.literal_eval(output1)
        assert type(output1)==list
        for url in output1:
            print("Inside loop!")
            print("Processing this url: ", url)
            website_contents = scrape_website(url)
            output2 = self.chain_2.run({"topic": topic, "url": url, "website_contents": website_contents})
            print("Output2: ", output2)
            if "False" in output2:
                continue
            elif "True" in output2 or "Yes" in output2[:4]:
                print("Inside the True part")
                output3 = self.chain_3.run({"website_contents": website_contents, "topic": topic})
                print("Relevant URL: ", url)
                return {"final_output": output3}
            else:
                print("Nor false or true in the output.")
                output3 = ""
        
        return {"final_output": "No relevant urls found for this topic"}    

In [131]:
concat_chain = ConcatenateChain(chain_1=chain1, chain_2=chain2, chain_3=chain3)
topic = "Pneumonia classifier"
url = "https://keras.io/examples/"
url_contents = scrape_website(url)
concat_chain.run({"website_contents": url_contents, "topic": topic, "url": url})
#chain1.run({"website_contents": url_contents,"topic": topic})

Inside loop!
Processing this url:  https://keras.io/examples/vision/xray_classification_with_tpus
Output2:  Yes, there is relevant information in the given URL. The URL provides a code example for building an X-ray image classification model to predict whether an X-ray scan shows the presence of pneumonia.
Inside the True part
Relevant URL:  https://keras.io/examples/vision/xray_classification_with_tpus


'```python\nimport re\nimport os\nimport random\nimport numpy as np\nimport pandas as pd\nimport tensorflow as tf\nimport matplotlib.pyplot as plt\n\ntry:\n    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()\n    print("Device:", tpu.master())\n    strategy = tf.distribute.TPUStrategy(tpu)\nexcept:\n    strategy = tf.distribute.get_strategy()\nprint("Number of replicas:", strategy.num_replicas_in_sync)\n\nAUTOTUNE = tf.data.AUTOTUNE\nBATCH_SIZE = 25 * strategy.num_replicas_in_sync\nIMAGE_SIZE = [180, 180]\nCLASS_NAMES = ["NORMAL", "PNEUMONIA"]\n\ntrain_images = tf.data.TFRecordDataset(\n    "gs://download.tensorflow.org/data/ChestXRay2017/train/images.tfrec"\n)\ntrain_paths = tf.data.TFRecordDataset(\n    "gs://download.tensorflow.org/data/ChestXRay2017/train/paths.tfrec"\n)\n\nds = tf.data.Dataset.zip((train_images, train_paths))\n\n\ndef get_label(file_path):\n    parts = tf.strings.split(file_path, "/")\n    return parts[-2] == "PNEUMONIA"\n\n\ndef decode_img(img):