## Prepare Data

In [2]:
from langchain.llms import OpenAI
from langchain.docstore.document import Document
import requests
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
import pathlib
import subprocess
import tempfile

import os
from getpass import getpass

# Never commit API keys to a notebook: take the key from the environment and
# only prompt for it (without echoing) when it has not been provided already.
# NOTE(review): the key that used to be hardcoded here is exposed in the
# notebook's history and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [3]:
def get_github_docs(repo_owner, repo_name):
    """Yield one ``Document`` per Markdown file found in a GitHub repository.

    The repository is shallow-cloned into a temporary directory that only
    exists while the generator is being iterated, so documents must be
    consumed during iteration.

    Args:
        repo_owner: GitHub user or organization that owns the repository.
        repo_name: Repository name, without the ``.git`` suffix.

    Yields:
        Document: ``page_content`` is the text of the file and
        ``metadata["source"]`` is a GitHub URL for that file pinned to the
        cloned commit SHA.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    with tempfile.TemporaryDirectory() as d:
        clone_url = f"https://github.com/{repo_owner}/{repo_name}.git"
        # Argument lists (no shell) keep repo_owner/repo_name from ever being
        # interpreted as shell syntax.
        subprocess.check_call(
            ["git", "clone", "--depth", "1", clone_url, "."],
            cwd=d,
        )
        git_sha = (
            subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=d)
            .decode("utf-8")
            .strip()
        )
        repo_path = pathlib.Path(d)
        # These patterns only match files exactly one directory below the
        # repository root; root-level and deeper files are not collected.
        markdown_files = list(repo_path.glob("*/*.md")) + list(
            repo_path.glob("*/*.mdx")
        )
        for markdown_file in markdown_files:
            # as_posix() keeps forward slashes in the URL on every platform.
            relative_path = markdown_file.relative_to(repo_path).as_posix()
            github_url = f"https://github.com/{repo_owner}/{repo_name}/blob/{git_sha}/{relative_path}"
            # Explicit UTF-8 so decoding does not depend on the platform's
            # default encoding; the file is closed before yielding.
            with open(markdown_file, "r", encoding="utf-8") as f:
                page_content = f.read()
            yield Document(page_content=page_content, metadata={"source": github_url})

# Generator of Documents, one per Markdown file in the forked Deno manual.
sources = get_github_docs("yirenlu92", "deno-manual-forked")

# Split each page on spaces with chunk_size=1024 and no overlap. Every chunk
# is wrapped in its own Document and reuses the metadata of the page it came
# from, so the "source" URL stays attached to each piece.
splitter = CharacterTextSplitter(separator=" ", chunk_size=1024, chunk_overlap=0)
source_chunks = [
    Document(page_content=piece, metadata=page.metadata)
    for page in sources
    for piece in splitter.split_text(page.page_content)
]

Cloning into '.'...


## Set Up Vector DB

In [4]:
# Costs tokens: every chunk in source_chunks is embedded with OpenAIEmbeddings
# while the Chroma index is being built.
search_index = Chroma.from_documents(
    source_chunks,
    OpenAIEmbeddings(),
)

Running Chroma using direct local API.
Using DuckDB in-memory for database. Data will be transient.


## Set Up LLM Chain with Custom Prompt

In [5]:
from langchain.chains import LLMChain

# Prompt with two placeholders: {context} receives the text of one retrieved
# chunk and {topic} the subject of the blog post.
prompt_template = "\n".join(
    [
        "Use the context below to write a 400 word blog post about the topic below:",
        "    Context: {context}",
        "    Topic: {topic}",
        "    Blog post:",
    ]
)

PROMPT = PromptTemplate(
    input_variables=["context", "topic"],
    template=prompt_template,
)

# temperature=0 is the only sampling option set here.
# NOTE(review): max_tokens is not set, so the library default applies -
# confirm it leaves enough room for a 400 word completion.
llm = OpenAI(temperature=0)

chain = LLMChain(prompt=PROMPT, llm=llm)

## Generate Text

In [6]:
def generate_blog_post(topic, k=4):
    """Print one generated blog post per retrieved chunk for ``topic``.

    The ``k`` chunks returned by ``search_index.similarity_search`` are each
    sent through ``chain`` as a separate ``{"context", "topic"}`` input, and
    the list returned by ``chain.apply`` is printed.

    Args:
        topic: Subject of the blog post; also used as the similarity query.
        k: Number of chunks to retrieve, and therefore the number of posts
            generated. Defaults to 4.

    Returns:
        None. The results are written to stdout.
    """
    docs = search_index.similarity_search(topic, k=k)
    # One chain input per retrieved chunk: each post is written from a single
    # chunk of context rather than from all chunks combined.
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))

In [8]:
# Generate and print blog posts for a sample topic.
generate_blog_post(topic="environment variables")

[{'text': '\n\nEnvironment variables are a powerful tool for developers, allowing them to store and access data across different processes. They are especially useful when working with command line tools, as they can be used to pass data between different commands.\n\nIn this post, we\'ll look at two types of environment variables: shell variables and pipelines.\n\nShell variables are similar to environment variables, but they won\'t be exported to spawned commands. They are defined with the following syntax:\n\n```sh\nVAR_NAME=value\n```\n\nShell variables can be useful when we want to re-use a value, but don\'t want it available in any spawned processes.\n\nPipelines provide a way to pipe the output of one command to another. For example, the following command pipes the stdout output "Hello" to the stdin of the spawned Deno process:\n\n```sh\necho Hello | deno run main.ts\n```\n\nTo pipe stdout and stderr, use `|&` instead:\n\n```sh\ndeno eval \'console.log(1); console.error(2);\' |&