Acquiring data in text form is the first step in using embeddings. This tutorial creates a new data set by crawling the OpenAI website, a technique that you can also use for your own company or personal website.

Link: https://platform.openai.com/docs/tutorials/web-qa-embeddings

While this crawler is written from scratch, open source packages like Scrapy can also help with these operations.

Most of the code here has more detailed comments than the original tutorial, for better comprehension.

In [None]:
# Mount Google Drive at /content/gdrive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd gdrive/MyDrive/Colab Notebooks/

/content/gdrive/MyDrive/Colab Notebooks


In [None]:
!pip install -r requirements.txt

Collecting aiohttp==3.8.5 (from -r requirements.txt (line 1))
  Downloading aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.0 MB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m0.8/1.0 MB[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting appnope==0.1.3 (from -r requirements.txt (line 3))
  Downloading appnope-0.1.3-py2.py3-none-any.whl (4.4 kB)
Collecting asttokens==2.2.1 (from -r requirements.txt (line 4))
  Downloading asttokens-2.2.1-py2.py3-none-any.whl (26 kB)
Collecting async-timeout==4.0.2 (from -r requirements.txt (line 5))
  Downloading async_timeou

In [None]:
import requests          # For making HTTP requests to a website.
import re               # 're' is for regular expressions, used for text pattern matching.
import urllib.request   # For opening and reading URLs.
from bs4 import BeautifulSoup   # BeautifulSoup, from bs4, is used for parsing HTML and XML documents.
from collections import deque   # 'deque' is a list-like container with fast appends and pops.
from html.parser import HTMLParser  # For parsing HTML.
from urllib.parse import urlparse  # Parses URLs into components.
import os               # Provides a way of using operating system dependent functionality.

In [None]:
# Regex pattern to match an absolute http:// or https:// URL.
# `https?` accepts exactly "http" or "https"; a character class with `*`
# (`http[s]*`) would also accept malformed schemes such as "httpss://".
HTTP_URL_PATTERN = r'^https?://.+'

domain = "openai.com"  # Domain to be crawled.
full_url = "https://openai.com/"  # Full URL of the domain, including http/https.

# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
    """Collect the ``href`` value of every anchor tag fed to the parser.

    After ``feed(html)`` has been called, ``hyperlinks`` holds the raw
    ``href`` strings in the order the anchors were encountered
    (duplicates are kept).
    """

    def __init__(self):
        super().__init__()  # Initialize the base class.
        self.hyperlinks = []  # List to store the hyperlinks found.

    # Override the handle_starttag method of HTMLParser to extract hyperlinks.
    def handle_starttag(self, tag, attrs):
        # Only anchor tags carry the links we are interested in.
        if tag != "a":
            return

        href = dict(attrs).get("href")

        # HTMLParser reports a valueless attribute (``<a href>``) as None.
        # Storing None would break callers that treat every entry as a string.
        if href is not None:
            self.hyperlinks.append(href)


The next function takes a URL as an argument, opens the URL, and reads the HTML content. Then, it returns all the hyperlinks found on that page.

In [None]:
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
    """Return the raw ``href`` values of all anchor tags found at ``url``.

    The page is fetched with ``urllib``. An empty list is returned when the
    response is not ``text/html`` or when fetching/decoding fails; the error
    is printed rather than raised so a single bad page does not stop a crawl.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the caller.
        with urllib.request.urlopen(url, timeout=30) as response:
            headers = response.info()

            # A missing Content-Type header is treated as "not HTML".
            content_type = headers.get('Content-Type') or ""
            if not content_type.startswith("text/html"):
                return []

            # Honour the charset declared by the server and fall back to
            # UTF-8; undecodable bytes are replaced instead of discarding
            # the whole page.
            charset = headers.get_content_charset() or 'utf-8'
            html = response.read().decode(charset, errors='replace')

    except Exception as e:  # Deliberately broad: best-effort fetch.
        print(e)
        return []

    # Parse the HTML and collect the anchors' href values.
    parser = HyperlinkParser()
    parser.feed(html)

    return parser.hyperlinks


The goal is to crawl through and index only the content that lives under the OpenAI domain. For this purpose, a function that calls the get_hyperlinks function but filters out any URLs that are not part of the specified domain is needed.

In [None]:
# Function to get hyperlinks from a URL within the same domain
def get_domain_hyperlinks(local_domain, url):
    """Return the unique links on ``url`` that point into ``local_domain``.

    Absolute http(s) links are kept only when their host equals
    ``local_domain``. Other links are treated as paths relative to the site
    root and prefixed with ``https://<local_domain>/``. A trailing slash is
    removed from every returned link. The order of the result is unspecified.
    """
    # Prefixes that never lead to a crawlable page on the site.
    skipped_prefixes = ("#", "mailto:", "tel:", "javascript:", "data:")

    clean_links = set()
    for link in set(get_hyperlinks(url)):
        # Guard against anchors whose href has no value.
        if link is None:
            continue

        # Protocol-relative links ("//host/path") are absolute URLs that
        # inherit the scheme; give them one so the domain check applies.
        if link.startswith("//"):
            link = "https:" + link

        if re.search(HTTP_URL_PATTERN, link):
            # Absolute URL: keep it only if it stays on the same domain.
            if urlparse(link).netloc != local_domain:
                continue
            clean_link = link
        elif link.startswith(skipped_prefixes):
            # In-page anchors and non-HTTP schemes are not pages to crawl.
            continue
        else:
            # Relative link: drop one leading slash, then anchor it at the root.
            path = link[1:] if link.startswith("/") else link
            clean_link = "https://" + local_domain + "/" + path

        # Normalise so ".../page" and ".../page/" count as the same link.
        if clean_link.endswith("/"):
            clean_link = clean_link[:-1]
        clean_links.add(clean_link)

    return list(clean_links)


The crawl function is the final step in the web scraping task setup. It keeps track of the visited URLs to avoid repeating the same page, which might be linked across multiple pages on a site. It also extracts the raw text from a page without the HTML tags, and writes the text content into a local .txt file specific to the page.

In [None]:
def crawl(url):
    """Crawl every page reachable from ``url`` within its own domain.

    Pages are visited breadth-first. For each page the visible text
    (HTML tags removed) is written to ``text/<domain>/<page>.txt``, where
    ``<page>`` is the URL without its scheme and with ``/`` replaced by ``_``.
    The ``processed`` directory is also created for later steps.

    A page that cannot be fetched is reported and skipped; no file is
    written for it.
    """
    # Extract the domain from the URL
    local_domain = urlparse(url).netloc

    # FIFO queue of pages still to visit, plus the set of URLs already queued
    queue = deque([url])
    seen = {url}

    # Create directories for storing text and processed data
    os.makedirs(f"text/{local_domain}/", exist_ok=True)
    os.makedirs("processed", exist_ok=True)

    while queue:
        current_url = queue.pop()
        print(current_url)  # Output for progress tracking

        # Fetch first, so a failed request does not leave an empty file behind.
        try:
            response = requests.get(current_url, timeout=30)
        except requests.RequestException as e:
            print("Unable to fetch page " + current_url + ": " + str(e))
            continue

        text = BeautifulSoup(response.text, "html.parser").get_text()

        # Pages rendered client-side only expose this placeholder text.
        if "You need to enable JavaScript to run this app." in text:
            print("Unable to parse page " + current_url)

        # Strip the scheme by pattern rather than by a fixed offset so that
        # http:// URLs are named correctly too.
        page_name = re.sub(r'^https?://', '', current_url).replace("/", "_")
        filename = 'text/' + local_domain + '/' + page_name + ".txt"
        with open(filename, "w", encoding="UTF-8") as f:
            f.write(text)

        # Queue links that have not been queued before
        for link in get_domain_hyperlinks(local_domain, current_url):
            if link not in seen:
                queue.appendleft(link)
                seen.add(link)

# Start the crawling process at full_url; this performs network requests and
# writes one text file per visited page.
crawl(full_url)


CSV is a common format for storing embeddings. You can use this format with Python by converting the raw text files (which are in the text directory) into Pandas data frames. Pandas is a popular open source library that helps you work with tabular data (data stored in rows and columns).
Blank empty lines can clutter the text files and make them harder to process. A simple function can remove those lines and tidy up the files.

In [None]:
def remove_newlines(serie):
    """Flatten each string in a pandas Series onto a single line.

    Real newline characters and literal backslash-n sequences are replaced
    by a space, then every run of two or more spaces is collapsed to one.

    Args:
        serie: pandas Series of strings.

    Returns:
        A new Series with the cleaned strings.
    """
    # `regex` is passed explicitly because its default differs between
    # pandas versions, which would change what '\\n' matches.
    serie = serie.str.replace('\n', ' ', regex=False)
    # Literal backslash followed by 'n' (an escaped newline left in the text)
    serie = serie.str.replace('\\n', ' ', regex=False)
    # Collapse any run of spaces in one pass; repeated double-space
    # replacement leaves doubles behind for runs of five or more.
    serie = serie.str.replace(r' {2,}', ' ', regex=True)
    return serie



The provided script converts text files from a specific directory into a CSV file using Pandas, a powerful data manipulation library in Python. The code reads the text files, processes their content, and stores them in a Pandas DataFrame which is then written to a CSV file:

In [None]:
import pandas as pd

texts = []  # (title, text) pairs, one per crawled page

text_dir = "text/" + domain + "/"
# Crawled files are named "<domain>_<page>.txt"; this is the length of the
# "<domain>_" prefix that is stripped to build the title.
prefix_len = len(domain) + 1

# Loop through the files in the specified directory (sorted so the CSV row
# order is reproducible across runs)
for file in sorted(os.listdir(text_dir)):
    # Skip anything that is not a crawled page, e.g. checkpoint folders.
    if not file.endswith(".txt"):
        continue

    with open(text_dir + file, "r", encoding="UTF-8") as f:
        text = f.read()

    # Turn the file name into a readable title: drop the domain prefix and
    # the ".txt" suffix, then replace separators with spaces.
    modified_fname = file[prefix_len:-4].replace('-', ' ').replace('_', ' ').replace('#update', '')
    texts.append((modified_fname, text))

# Create a DataFrame from the list
df = pd.DataFrame(texts, columns=['fname', 'text'])

# Prefix each text with its title and remove new lines and extra spaces
df['text'] = df.fname + ". " + remove_newlines(df['text'])

# Write the DataFrame to a CSV file
df.to_csv('processed/scraped.csv')

# Display the first few rows of the DataFrame
df.head()


---------------------- the same as tutorial ------------------------


In [None]:
import tiktoken

# cl100k_base is the tokenizer designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")

# Reload the scraped pages and give the two columns their final names
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']

# Count the tokens of every page and store the count next to the text
df['n_tokens'] = df['text'].map(lambda page_text: len(tokenizer.encode(page_text)))

# Histogram of tokens per row, to see how many pages need splitting
df['n_tokens'].hist()

In [None]:
max_tokens = 500


def split_into_many(text, max_tokens=max_tokens, encoder=None):
    """Split ``text`` into chunks of at most roughly ``max_tokens`` tokens.

    The text is split into sentences on ``'. '`` and sentences are packed
    greedily into chunks. Each chunk is the sentences re-joined with ``'. '``
    and terminated by a period.

    A single sentence longer than ``max_tokens`` is dropped, and blank
    sentences are ignored.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        encoder: Optional object with an ``encode(str)`` method returning a
            sequence of tokens. Defaults to the module-level ``tokenizer``.

    Returns:
        List of chunk strings, including the final partial chunk.
    """
    enc = tokenizer if encoder is None else encoder

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(enc.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    for sentence, token in zip(sentences, n_tokens):

        # Blank fragments (e.g. after a trailing ". ") carry no content.
        if not sentence.strip():
            continue

        # The current chunk cannot take this sentence: close it and start a
        # new one. An empty chunk is never emitted.
        if tokens_so_far + token > max_tokens:
            if chunk:
                chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # A sentence that alone exceeds the budget cannot fit any chunk.
        if token > max_tokens:
            continue

        chunk.append(sentence)
        # +1 per sentence, presumably to account for the re-inserted separator.
        tokens_so_far += token + 1

    # Emit the sentences still pending after the loop; without this the tail
    # of every text would be silently lost.
    if chunk:
        tail = ". ".join(chunk)
        # The last sentence usually keeps its own period; avoid doubling it.
        chunks.append(tail if tail.endswith(".") else tail + ".")

    return chunks


shortened = []

# Build the list of texts that each fit the token budget
for _, row in df.iterrows():
    text = row['text']

    # Missing text comes back from read_csv as NaN, not None, so an
    # `is None` check would never skip it; pd.isna covers both.
    if pd.isna(text):
        continue

    # Texts over the budget are split into several chunks
    if row['n_tokens'] > max_tokens:
        shortened.extend(split_into_many(text))

    # Texts within the budget are kept as they are
    else:
        shortened.append(text)

In [None]:
# Rebuild the frame from the shortened texts and recount their tokens
df = pd.DataFrame(shortened, columns=['text'])
df['n_tokens'] = df['text'].map(lambda chunk_text: len(tokenizer.encode(chunk_text)))

# Histogram of the token counts after splitting
df['n_tokens'].hist()

In [None]:
from openai import OpenAI

# Client for the OpenAI API; the key is read from the environment.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Request one embedding per text chunk. The client interface takes the model
# via `model=` and returns an object, so the vector is read through
# attributes (`.data[0].embedding`) rather than dict subscripts.
df['embeddings'] = df.text.apply(
    lambda x: client.embeddings.create(input=x, model='text-embedding-ada-002').data[0].embedding
)

# Persist the embeddings so they do not have to be recomputed
df.to_csv('processed/embeddings.csv')
df.head()

In [None]:
import ast

import numpy as np

try:
    from openai.embeddings_utils import distances_from_embeddings
except ImportError:
    # The helper module is not shipped with every openai release; provide an
    # equivalent so the rest of the notebook keeps working.
    def distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine"):
        """Return the distance from ``query_embedding`` to each embedding.

        Args:
            query_embedding: Sequence of floats.
            embeddings: Iterable of equally sized float sequences.
            distance_metric: One of "cosine", "L1", "L2" or "Linf".

        Returns:
            List of floats, one per entry of ``embeddings``.

        Raises:
            ValueError: If ``distance_metric`` is not supported.
        """
        query = np.asarray(query_embedding, dtype=float)
        matrix = np.array([np.asarray(e, dtype=float) for e in embeddings])
        if matrix.size == 0:
            return []
        if distance_metric == "cosine":
            norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query)
            return (1.0 - (matrix @ query) / norms).tolist()
        diff = matrix - query
        if distance_metric == "L1":
            return np.abs(diff).sum(axis=1).tolist()
        if distance_metric == "L2":
            return np.linalg.norm(diff, axis=1).tolist()
        if distance_metric == "Linf":
            return np.abs(diff).max(axis=1).tolist()
        raise ValueError("Unsupported distance metric: " + str(distance_metric))

df = pd.read_csv('processed/embeddings.csv', index_col=0)
# The CSV stores each embedding as the text of a Python list. literal_eval
# parses only literals, so file contents are never executed as code.
df['embeddings'] = df['embeddings'].apply(ast.literal_eval).apply(np.array)

df.head()

In [None]:
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar texts in the dataframe.

    Args:
        question: The question to build a context for.
        df: DataFrame with 'text', 'n_tokens' and 'embeddings' columns.
            A 'distances' column is added to (or overwritten in) it.
        max_len: Token budget for the context; each text counts as its
            'n_tokens' plus 4.
        size: Not used by this function; kept for interface compatibility.

    Returns:
        The selected texts, closest first, joined by "\\n\\n###\\n\\n".
    """

    # Get the embedding for the question. The client takes the model via
    # `model=` and returns an object whose vector is read through attributes.
    q_embeddings = client.embeddings.create(
        input=question, model='text-embedding-ada-002'
    ).data[0].embedding

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')

    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for _, row in df.sort_values('distances', ascending=True).iterrows():

        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4

        # If the context is too long, break
        if cur_len > max_len:
            break

        # Else add it to the text that is being returned
        returns.append(row["text"])

    # Return the context
    return "\n\n###\n\n".join(returns)

In [None]:
def answer_question(
    df,
    model="gpt-3.5-turbo",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts.

    Args:
        df: DataFrame passed to create_context.
        model: Chat model used to generate the answer.
        question: The question to answer.
        max_len: Token budget for the context, passed to create_context.
        size: Passed through to create_context.
        debug: When true, print the context that is sent to the model.
        max_tokens: Maximum number of tokens in the generated answer.
        stop_sequence: Optional stop sequence(s) for the completion.

    Returns:
        The stripped answer text, or "" if the API call fails (the error is printed).
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the context that will be sent to the model
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a chat completion using the question and context
        response = client.chat.completions.create(
            model=model,  # honour the caller's choice
            messages=[
                {"role": "system", "content": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                # The f-prefix belongs on the value so that the context and
                # question are actually interpolated into the prompt.
                {"role": "user", "content": f"Context: {context}\n\n---\n\nQuestion: {question}\nAnswer:"}
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
        )
        # The answer text lives in message.content; the message itself is
        # an object without string methods.
        return (response.choices[0].message.content or "").strip()
    except Exception as e:
        print(e)
        return ""

In [None]:
# Try the question-answering pipeline on a few sample questions.
answer_question(df, question="What day is it?", debug=False)

answer_question(df, question="What is our newest embeddings model?")

answer_question(df, question="What is ChatGPT?")

# Sample outputs (presumably for the three calls above, in order):

# "I don't know."

# 'The newest embeddings model is text-embedding-ada-002.'

# 'ChatGPT is a model trained to interact in a conversational way. It is able to answer followup questions, admit its mistakes, challenge incorrect premises, and reject inappropriate requests.'