In [128]:
import os
import re
from time import sleep

from transformers import GPT2TokenizerFast
import numpy as np
import openai
import pandas as pd
import fandom
import nltk
from nltk import word_tokenize

In [2]:
# The API key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in the notebook. A missing variable raises KeyError
# here, at configuration time, rather than on the first API request.
# NOTE: any key that was ever committed in this cell must be treated as leaked
# and revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Model name passed to openai.Completion.create when generating the answer.
COMPLETION_MODEL = 'text-davinci-003'

In [3]:
# Root folder for everything scraped from the wiki; created on first run and
# reused (exist_ok) on later runs.
wiki_path = os.path.join('.', 'valorant_wiki')
os.makedirs(wiki_path, exist_ok=True)

In [4]:
# Select the "valorant" wiki in the fandom client.
fandom.set_wiki("valorant")

# Names accepted by retrieve_agent_page (it asserts membership in this list).
agents_list = [
    'Astra', 'Breach', 'Brimstone', 'Chamber', 'Cypher', 'Fade', 'Gekko',
    'Harbor', 'Jett', 'KAYO', 'Killjoy', 'Neon', 'Omen', 'Phoenix',
    'Raze', 'Reyna', 'Sage', 'Skye', 'Sova', 'Viper', 'Yoru',
]

In [143]:
def _clean_section_text(heading: str, text: str) -> str:
    """
    Normalise the raw text of one wiki section.

    Steps, in order: strip URLs; for the 'Biography' section drop the first
    four lines and everything from the line mentioning "Early life" onwards;
    strip square-bracketed markers; remove a leading line that consists of the
    section heading.
    """
    cleaned = re.sub(
        r'(https?:\/\/(?:www\.|(?!www)))[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',
        '',
        text,
    )
    if heading == 'Biography':
        # NOTE(review): the first four lines are presumably heading/preamble
        # boilerplate on the wiki page; confirm against the page markup.
        cleaned = '\n'.join(cleaned.splitlines()[4:])
        # DOTALL lets ".*" run to the end of the text, so the "Early life"
        # line and everything after it is discarded.
        cleaned = re.sub(r'[^\n]*Early life.*', '', cleaned, flags=re.IGNORECASE | re.DOTALL)
    # Non-greedy on purpose: a greedy ".+" would also delete the real text
    # sitting between the first "[" and the last "]" on a line.
    cleaned = re.sub(r'\[.+?\]', '', cleaned)
    return cleaned.removeprefix(f'{heading}\n')


def retrieve_agent_page(name: str, output_dir: str) -> pd.DataFrame:
    """
    Fetch one agent's wiki page, clean its sections and save them as CSV.

    Parameters:
        name: agent name; must be one of `agents_list`.
        output_dir: directory under which a sub-folder named after the page
            title is created; the CSV is written there as "<name>.csv".

    Returns:
        A DataFrame with one row per section found on the page and the columns
        'title' (the agent name), 'heading', 'content' and 'tokens' (number of
        `word_tokenize` tokens in the cleaned content). Sections the page does
        not have are skipped.

    Raises:
        AssertionError: if `name` is not in `agents_list`.
    """
    assert name in agents_list
    page = fandom.page(name)
    agent_path = os.path.join(output_dir, page.title)
    os.makedirs(agent_path, exist_ok=True)

    needed_sections = ['Biography', 'Personality', 'Appearance', 'Abilities', 'Relations']
    raw_list = []
    for sec in needed_sections:
        # Look the section up once and reuse the result.
        raw_text = page.section(sec)
        if raw_text is None:
            continue
        content = _clean_section_text(sec, raw_text)
        raw_list.append([name, sec, content, len(word_tokenize(content))])

    df = pd.DataFrame(raw_list, columns=['title', 'heading', 'content', 'tokens'])
    df.to_csv(
        os.path.join(agent_path, f'{name}.csv'),
        header=True,
        index=False
    )
    return df


In [144]:
agents_dir = os.path.join(wiki_path, 'agents')
agent_dfs = [retrieve_agent_page(agent, agents_dir) for agent in agents_list]

# Every per-agent frame is numbered from 0, so a plain concat would repeat the
# labels 0, 1, 2, ... once per agent. compute_doc_embeddings keys its dict on
# these labels (later agents would overwrite earlier ones) and construct_prompt
# uses the same keys as row positions, so the rows are renumbered 0..N-1 here.
complete = pd.concat(agent_dfs, axis=0, ignore_index=True)
complete.head()

Unnamed: 0,title,heading,content,tokens
0,Astra,Biography,"Hailing from Accra, Ghana, Efia Danso is a Rad...",449
1,Astra,Personality,"“\n ""You can tell a person's character by thei...",123
2,Astra,Appearance,Astra is of Ghanaian descent and has a dark sk...,161
3,Astra,Abilities,\nPassive\nX – Astral Form\nACTIVATE to enter ...,307
4,Astra,Relations,Astra has been observing many of VALORANT's ag...,110


In [145]:
# Persist the combined table of all agents next to the per-agent folders.
full_csv_path = os.path.join(wiki_path, 'agents', 'full.csv')
complete.to_csv(full_csv_path, header=True, index=False)

In [146]:
# Model family shared by the document-side and query-side embedding models.
# NOTE(review): confirm these first-generation search models are still served.
EMBEDDING_MODEL = 'curie'

# Model used to embed wiki sections.
DOC_EMBEDDING_MODEL = 'text-search-{}-doc-001'.format(EMBEDDING_MODEL)
# Model used to embed the user's question.
QUERY_EMBEDDING_MODEL = 'text-search-{}-query-001'.format(EMBEDDING_MODEL)

In [147]:
def get_embedding(text: str, model: str) -> list[float]:
    """
    Request an embedding for `text` from the given OpenAI model.

    Sleeps for one second before every request, then returns the "embedding"
    field of the first entry in the response's "data" list.
    """
    sleep(1)
    response = openai.Embedding.create(input=text, model=model)
    first_entry = response["data"][0]
    return first_entry["embedding"]

def get_doc_embedding(text: str) -> list[float]:
    """Embed `text` with the document-side model (DOC_EMBEDDING_MODEL)."""
    return get_embedding(text=text, model=DOC_EMBEDDING_MODEL)

def get_query_embedding(text: str) -> list[float]:
    """Embed `text` with the query-side model (QUERY_EMBEDDING_MODEL)."""
    return get_embedding(text=text, model=QUERY_EMBEDDING_MODEL)

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the `content` of every row of `df` with the document embedding model.

    Newlines in the content are replaced by spaces before embedding.

    Returns a dictionary keyed by the row's index label (as yielded by
    `df.iterrows()`) whose values are the embedding vectors. Rows that share
    an index label overwrite one another, so the index should be unique.

    NOTE(review): the return annotation advertises (title, heading) tuples,
    which only holds when `df` is indexed by those two columns.
    """
    embeddings = {}
    for row_label, row in df.iterrows():
        flattened = row.content.replace("\n", " ")
        embeddings[row_label] = get_doc_embedding(flattened)
    return embeddings

In [148]:
def load_embeddings(fname: str) -> dict[tuple[str, str], list[float]]:
    """
    Load previously saved document embeddings from a CSV file.

    The file at `fname` must have a header row with the columns "title" and
    "heading" plus one column per vector component named "0", "1", ... up to
    the last dimension.

    Returns a dictionary mapping (title, heading) to the embedding vector,
    with components ordered by their numeric column name.
    """
    frame = pd.read_csv(fname, header=0)

    key_columns = ("title", "heading")
    dimensions = [int(col) for col in frame.columns if col not in key_columns]
    component_columns = [str(i) for i in range(max(dimensions) + 1)]

    embeddings = {}
    for _, row in frame.iterrows():
        embeddings[(row.title, row.heading)] = [row[col] for col in component_columns]
    return embeddings

In [149]:
# Embed every section of the combined table (one API request per row), then
# dump the resulting dict as text for later inspection.
context_embeddings = compute_doc_embeddings(complete)
with open('embeddings.txt', 'w') as f:
    serialized_embeddings = str(context_embeddings)
    f.write(serialized_embeddings)

In [150]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Score how similar two embedding vectors are.

    The score is the plain dot product of `x` and `y`; no normalisation is
    applied.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

In [151]:
def order_document_sections_by_query_similarity(query: str, contexts: dict[tuple[str, str], np.array]) -> list[tuple[float, tuple[str, str]]]:
    """
    Rank the pre-computed document embeddings against a query.

    The query is embedded with the query embedding model and scored against
    every entry of `contexts` with `vector_similarity`.

    Returns a list of (similarity, key) pairs, highest similarity first; pairs
    with equal similarity are ordered by key, also descending.
    """
    query_embedding = get_query_embedding(query)

    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))

    scored_sections.sort(reverse=True)
    return scored_sections

In [173]:
# GPT-2 tokenizer, used here only to measure the separator.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Budget for the context part of the prompt, compared against the running sum
# of the `tokens` column plus separators in construct_prompt.
# NOTE(review): confirm this budget, the question and the completion's
# max_tokens together fit the completion model's context window.
MAX_SECTION_LEN = 4_000

# Text placed in front of every context section, and its length in tokens.
SEPARATOR = "\n* "
separator_tokens = tokenizer.tokenize(SEPARATOR)
separator_len = len(separator_tokens)

In [174]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the completion prompt for `question` from the most relevant sections.

    Sections are ranked by similarity to the question and appended, best first,
    until adding the next one would push the running total of its `tokens`
    value plus the separator length over MAX_SECTION_LEN.

    Parameters:
        question: the user's question; it is embedded to rank the sections.
        context_embeddings: mapping from a row key to that row's embedding.
            The keys are used as positional row numbers (`df.iloc`).
        df: table with `content` and `tokens` columns for every key.

    Returns:
        The header, the chosen sections (each prefixed with SEPARATOR and with
        newlines replaced by spaces), then the question in "Q: ... A:" form.

    Side effects:
        Prints how many sections were selected and their keys.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    chosen_sections_len = 0
    chosen_sections_indexes = []

    for _, section_index in most_relevant_document_sections:
        # Add contexts until we run out of space.
        document_section = df.iloc[section_index]
        chosen_sections_len += document_section.tokens.sum() + separator_len
        if chosen_sections_len > MAX_SECTION_LEN:
            break

        chosen_sections.append(SEPARATOR + document_section.content.replace("\n", " "))
        chosen_sections_indexes.append(str(section_index))

    # Useful diagnostic information
    print(f"Selected {len(chosen_sections)} document sections:")
    print("\n".join(chosen_sections_indexes))

    header = """Answer the question as creatively as possible using the provided context. \n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"

In [1]:
# The question sent to the model; the context sections are added by
# construct_prompt.
question = """Generate a new Valorant sentinel agent. Print name, role, biography, abilities (C, Q, E, X, be creative, but make them fit into Valorant's meta), appearance, relationships with other agents.
    """
prompt = construct_prompt(question, context_embeddings, complete)
print(len(prompt))

# Request the completion and show the text of the first choice.
completion = openai.Completion.create(
    prompt=prompt,
    model=COMPLETION_MODEL,
    max_tokens=800,
    temperature=0.0,
)
generated_text = completion.choices[0].text
print(generated_text)

NameError: name 'construct_prompt' is not defined