# Alzabo Recommender
```
User intent + Uqbar docs -> Recommender⭐ => Recommendations
User intent + Recommendations -> Planner => Plan
Plan -> Alzabo => a real action in userspace (sign txn, send message, etc)
```

⭐ You are here.

The Recommender transforms user input (such as "I want to message ~dev and ~rus") into real API calls, which are then passed to the Planner.

## Utilities
This notebook also provides utilities for:

1. Recursively concatenating the Markdown files from multiple directories into size-limited `concatenated-#.md` documents
1. Converting documents into embeddings

### Concatenate and Chunk Markdown

The OpenAI embeddings API has a size limit of 8191 tokens. Chunk all the .md files in the given dirs into `concatenated-#.md` files.

In [1]:
import re
from typing import List
from pathlib import Path
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the tiktoken encoding `encoding_name`."""
    encoder = tiktoken.get_encoding(encoding_name)
    return len(encoder.encode(string))


def split_into_chunks(text: str, max_tokens: int, model: str) -> List[str]:
    """Split `text` into chunks whose summed per-word token counts fit `max_tokens`.

    The text is broken into whitespace-delimited words (newlines are kept as
    their own words) and the words of each chunk are re-joined with single
    spaces, so the original spacing is not preserved.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk, measured as the sum of the token
            counts of the individual words.
        model: Encoding name passed through to `num_tokens_from_string`.

    Returns:
        The chunks in document order. Empty chunks are never emitted, so empty
        or whitespace-only (newline-free) text yields an empty list. A single
        word that by itself exceeds `max_tokens` is returned as its own
        (oversized) chunk because words are never split.
    """
    words = re.findall(r'\S+|\n', text)  # Words, plus each newline as its own word
    chunks = []
    current_chunk = []
    token_count = 0

    for word in words:
        word_tokens = num_tokens_from_string(word, model)

        # Flush only a non-empty chunk: an oversized first word must not
        # produce a leading empty chunk.
        if current_chunk and token_count + word_tokens > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            token_count = 0

        current_chunk.append(word)
        token_count += word_tokens

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


def concatenate_markdown_files(directories, max_tokens_per_file=8191, model="cl100k_base"):
    """Concatenate all Markdown files under `directories` into `concatenated-<n>.md` files.

    Every `*.md` file found recursively is split with `split_into_chunks` and
    the chunks are appended, one per line, to `concatenated-0.md`,
    `concatenated-1.md`, ... in the current working directory. A new output
    file is started whenever adding the next chunk would push the running
    token count past `max_tokens_per_file`.

    Args:
        directories: Iterable of `pathlib.Path` directories to scan.
        max_tokens_per_file: Token budget for each output file.
        model: Encoding name passed to the token-counting helpers.

    Notes:
        - Files inside a `node_modules` directory and files whose name looks
          like one of this function's own outputs are skipped.
        - The running token count is the sum of the counts of the pieces
          written (chunk plus trailing newline); the output file is not
          re-read and re-tokenised for every chunk.
    """
    # Matches any generated output name, not just the one currently being
    # written, so earlier outputs are never fed back in as input.
    generated_name = re.compile(r"concatenated-\d+\.md")

    index = 0
    output_file = f"concatenated-{index}.md"

    with open(output_file, 'w', encoding="utf-8"):  # Create/truncate the first output file
        pass
    current_tokens = 0

    for directory in directories:
        print(f"Starting {directory}")

        for input_file in directory.rglob("*.md"):
            if generated_name.search(input_file.name) or "node_modules" in input_file.parts:
                continue

            with open(input_file, 'r', encoding="utf-8") as md_file:
                input_content = md_file.read()

            for chunk in split_into_chunks(input_content, max_tokens_per_file, model):
                piece = chunk + "\n"
                tokens_to_add = num_tokens_from_string(piece, model)

                # Respect embedding API limit. Never roll over from an empty
                # file: that would leave an empty output behind.
                if current_tokens and current_tokens + tokens_to_add > max_tokens_per_file:
                    index += 1
                    output_file = f"concatenated-{index}.md"
                    # Truncate so a stale file from a previous run is not
                    # appended to.
                    with open(output_file, 'w', encoding="utf-8"):
                        pass
                    current_tokens = 0

                with open(output_file, 'a', encoding="utf-8") as output:
                    output.write(piece)
                current_tokens += tokens_to_add

In [None]:
# Relative paths of the directories to scan for Markdown files.
input_directories = [
    '../uqbar-core',
    '../pongo',
    '../pokur',
    '../dev-suite',
    '../nectar',
]
directories = list(map(Path, input_directories))
print(f"For directories: {directories}")

# 8191 is the per-file token budget handed to the concatenation step.
concatenate_markdown_files(directories, max_tokens_per_file=8191)

### Create Embeddings

For all `concatenated-#.md`, retrieve embeddings and write to `embeddings-{date}.json`.

Writing the embeddings as plain JSON is a known limitation; converting them to another format (OpenAI's examples use `pickle`) is still pending.

In [3]:
# Creates vector embeddings for the text in the concatenated Markdown files.
# The API only accepts 8191 tokens per request, so the input is split across several files beforehand.

import json
import os
import re
import requests
from datetime import datetime
from pathlib import Path

def get_embeddings(text):
    """Request an embedding for `text` from the OpenAI embeddings endpoint.

    Runs of whitespace in `text` (including newlines) are collapsed to single
    spaces before the request is sent. The request payload is also written to
    `payload.txt` in the current working directory.

    Args:
        text: The document text to embed.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        KeyError: If the `OPENAI_API_KEY` environment variable is not set.
        requests.HTTPError: If the API responds with an error status code.
        requests.Timeout: If the API does not respond within the timeout.
    """
    # Normalise whitespace on the raw text. Do not json.dumps() it here:
    # `requests.post(json=...)` already JSON-encodes the payload, so encoding
    # the text first would embed literal quotes and backslash escapes in the
    # input that gets embedded.
    text_clean = ' '.join(text.split())

    payload = {
        "input": text_clean,
        "model": "text-embedding-ada-002"
    }

    # Keep a copy of the last payload on disk for inspection.
    with open("payload.txt", 'w') as f:
        f.write(json.dumps(payload))

    openai_api_key = os.environ['OPENAI_API_KEY']
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(
        "https://api.openai.com/v1/embeddings",
        headers=headers,
        json=payload,
        timeout=60,
    )
    # Surface API errors here instead of returning an error body that has no
    # 'data' entry for the caller to read.
    response.raise_for_status()
    return response.json()

In [None]:
# Embed every `concatenated-<n>.md` file in the current directory and store
# the results in one timestamped output file, one JSON array per line.
concatenated_name = re.compile(r"concatenated-(\d+)\.md")

# iterdir() yields entries in arbitrary order; collect the matches and sort
# them by numeric index so the output lines follow the file numbering.
concatenated_files = []
for file_path in Path(".").iterdir():
    name_match = concatenated_name.fullmatch(file_path.name)
    if name_match and file_path.is_file():
        concatenated_files.append((int(name_match.group(1)), file_path))
concatenated_files.sort()

if concatenated_files:
    # Name and truncate the output once per run. Doing this per input file
    # would wipe the embeddings already written during the same minute.
    output_file = f"embeddings-{datetime.now().strftime('%Y-%m-%d-%H:%M')}.json"

    with open(output_file, 'w') as output:
        for _, file_path in concatenated_files:
            with open(file_path, 'r') as f:
                contents = f.read()

            response = get_embeddings(contents)
            try:
                embeddings = response['data'][0]['embedding']
            except (KeyError, IndexError, TypeError) as err:
                # Typically an API error body; show it instead of failing
                # with a bare lookup error.
                raise RuntimeError(
                    f"No embedding returned for {file_path}: {response!r}"
                ) from err

            output.write(json.dumps(embeddings))
            output.write("\n")

## Recommender

Given some string of user intent ("I want to order a burger on the blockchain"), search our embeddings for the nearest neighbor API capabilities.

### 1. Imports

In [None]:
# imports
import pandas as pd
import pickle
from typing import List  # `typing` exports `List`; `from typing import list` raises ImportError

# NOTE(review): `openai.embeddings_utils` is not shipped with openai-python
# 1.0 and later — confirm the installed openai version still provides it.
from openai.embeddings_utils import (
    get_embedding,
    distances_from_embeddings,
    tsne_components_from_embeddings,
    chart_from_components,
    indices_of_nearest_neighbors_from_distances,
)

# constants
EMBEDDING_MODEL = "text-embedding-ada-002"

### 2. Load data