In [1]:
from datasets import load_dataset, Dataset
import re
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.set_default_device('cuda')

# Loading the godot docs and cleaning the .rst files

In [3]:

def clean_rst(content):
    # with open(file_path, 'r', encoding='utf-8') as f:
    #     content = f.read()

    # Remove directives (e.g., .. image::, .. note::)
    content = re.sub(r"\.\. [^\n]*::[^\n]*(\n +[^\n]*)*", "", content)

    # Remove inline roles (e.g., :ref:`label`)
    content = re.sub(r":[a-zA-Z0-9]+:`([^`]*)`", r"\1", content)

    # Remove hyperlinks (e.g., `text <url>`_)
    content = re.sub(r"`([^`]+) <[^>]+>`_", r"\1", content)

    # Remove substitution references (e.g., |substitution|)
    content = re.sub(r"\|[^|]+\|", "", content)

    # Remove comments (e.g., .. this is a comment)
    content = re.sub(r"\.\. .*\n", "", content)
        
    # Remove segmentation symbols
    content = re.sub(r'[~=-]+',"",content)
    
    # Remove excessive blank lines
    content = re.sub(r"\n{3,}", "\n", content)

    return content

In [4]:
from os import listdir
from os.path import isfile, join
mypath = "godot_docs/"
files = [mypath+f for f in listdir(mypath) if isfile(join(mypath, f))]

In [5]:
corpus = []
for file in files:
    with open(file,encoding="utf8") as f:
        doc = f.read()
    cleaned_doc = clean_rst(doc)
    corpus.append(cleaned_doc)
        

In [6]:
filenames = [re.sub(r"godot_docs/","",file) for file in files]

In [7]:
dataset = pd.DataFrame.from_dict({'File':filenames,'document':corpus})

In [8]:
print(dataset['document'][0])


Setting up the project
In this short first part, we'll set up and organize the project.

Launch Godot and create a new project.
When creating the new project, you only need to choose a valid *Project Path*. You can leave the other default settings alone.
    Download dodge_the_creeps_2d_assets.zip.
    The archive contains the images and sounds you'll be using
    to make the game. Extract the archive and move the ``art/``
    and ``fonts/`` directories to your project's directory.

 

    Download dodge_the_creeps_2d_assets.zip.
    The archive contains the images and sounds you'll be using
    to make the game. Extract the archive and move the ``art/``
    and ``fonts/`` directories to your project's directory.

    Ensure that you have the required dependencies to use C# in Godot.
    You need the latest stable .NET SDK, and an editor such as VS Code.
    See doc_c_sharp_setup.

 

    The C++ part of this tutorial wasn't rewritten for the new GDExtension system yet.

Your project 

In [9]:
hf_dataset = Dataset.from_pandas(dataset)

In [10]:
hf_dataset

Dataset({
    features: ['File', 'document'],
    num_rows: 1381
})

# Loading a pre-trained model to get an understanding of the baseline performance.

In [11]:
checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

In [12]:
model = AutoModel.from_pretrained(checkpoint)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
model.to(torch.get_default_device())

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [14]:
def cls_pooling(model_output):
    # print(model_output.last_hidden_state[:,0,:].shape)
    return model_output.last_hidden_state[:, 0]

In [15]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(torch.get_default_device()) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    # print(model_output.last_hidden_state.shape)
    return cls_pooling(model_output)

In [16]:
embeddings_dataset = hf_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["document"]).detach().cpu().numpy()[0]}
)

Map: 100%|██████████| 1381/1381 [00:43<00:00, 31.41 examples/s]


In [17]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|██████████| 2/2 [00:00<00:00, 495.28it/s]


Dataset({
    features: ['File', 'document', 'embeddings'],
    num_rows: 1381
})

In [18]:
question = "What is an array?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [19]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [20]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [21]:
samples_df

Unnamed: 0,File,document,embeddings,scores
4,class_packedvector3array.rst.txt,:github_url: hide\nPackedVector3Array\nA packe...,"[0.26605814695358276, -0.5201044678688049, -0....",29.815216
3,class_packedfloat64array.rst.txt,:github_url: hide\nPackedFloat64Array\nA packe...,"[0.00835494790226221, -0.6012332439422607, -0....",29.756012
2,class_packedint64array.rst.txt,:github_url: hide\nPackedInt64Array\nA packed ...,"[-0.11035095900297165, -0.5851216912269592, -0...",29.539976
1,class_array.rst.txt,:github_url: hide\nArray\nA builtin data struc...,"[-0.05643428489565849, -0.22368930280208588, -...",27.741829
0,class_packedstringarray.rst.txt,:github_url: hide\nPackedStringArray\nA packed...,"[0.03704645112156868, -0.43851298093795776, -0...",27.18614


In [22]:
for _, row in samples_df.iterrows():
    # print(f"DOCUMENT: {row.document}")
    print(f"SCORE: {row.scores}")
    print(f"FILE: {row.File}")
    print("=" * 50)
    print()

SCORE: 29.815216064453125
FILE: class_packedvector3array.rst.txt

SCORE: 29.756011962890625
FILE: class_packedfloat64array.rst.txt

SCORE: 29.539976119995117
FILE: class_packedint64array.rst.txt

SCORE: 27.74182891845703
FILE: class_array.rst.txt

SCORE: 27.186140060424805
FILE: class_packedstringarray.rst.txt



# Finetuning the base bi-encoder using glaiveai/godot-4-docs dataset from HF

In [23]:
from datasets import load_dataset

ds = load_dataset("glaiveai/godot_4_docs")

In [24]:
ds.set_format("torch")

In [25]:
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, SentenceTransformerModelCardData
from sentence_transformers.training_args import BatchSamplers

In [26]:
checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

In [27]:
model = SentenceTransformer(checkpoint)

In [28]:
loss = MultipleNegativesRankingLoss(model)

In [29]:
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/multi-qa-mpnet-base-dot-v1",
    # Optional training parameters:
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="no",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    logging_steps=100,
)

In [None]:
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=ds,
    loss=loss,
)
trainer.train()

In [31]:
# 8. Save the trained model
model.save_pretrained("models/multi-qa-mpnet-glaive-godotdocs-dot/final")



In [None]:
# 9. (Optional) Push it to the Hugging Face Hub
model.push_to_hub("multi-qa-mpnet-glaive-godotdocs-dot")

In [23]:
ft_model = AutoModel.from_pretrained('models/multi-qa-mpnet-glaive-godotdocs-dot/final')


In [25]:
ft_tokenizer = AutoTokenizer.from_pretrained('models/multi-qa-mpnet-glaive-godotdocs-dot/final')

# Performing a semantic search on godot docs with the finetuned docs

In [24]:
def cls_pooling(model_output):
    # print(model_output.last_hidden_state[:,0,:].shape)
    return model_output.last_hidden_state[:, 0]

In [26]:
def get_embeddings_ft(text_list):
    encoded_input = ft_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(torch.get_default_device()) for k, v in encoded_input.items()}
    model_output = ft_model(**encoded_input)
    # print(model_output.last_hidden_state.shape)
    return cls_pooling(model_output)

In [27]:
embeddings_dataset = hf_dataset.map(
    lambda x: {"embeddings": get_embeddings_ft(x["document"]).detach().cpu().numpy()[0]}
)

Map: 100%|██████████| 1381/1381 [00:44<00:00, 30.94 examples/s]


In [28]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|██████████| 2/2 [00:00<00:00, 468.22it/s]


Dataset({
    features: ['File', 'document', 'embeddings'],
    num_rows: 1381
})

In [44]:
question = "How do I write a vertex shader to mimic ps1 look?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [45]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [46]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [47]:
samples_df

Unnamed: 0,File,document,embeddings,scores
4,compute_shaders.rst.txt,\nUsing compute shaders\nThis tutorial will wa...,"[-0.040888749063014984, -0.1411910355091095, -...",53.245289
3,class_visualshader.rst.txt,:github_url: hide\nVisualShader\n**Inherits:**...,"[0.028908975422382355, -0.14371901750564575, -...",52.143013
2,class_color.rst.txt,:github_url: hide\nColor\nA color represented ...,"[0.006059261970221996, -0.21860522031784058, -...",51.480659
1,shaders_style_guide.rst.txt,\nShaders style guide\nThis style guide lists ...,"[0.35104429721832275, 0.028790568932890892, -0...",51.431374
0,class_gradienttexture1d.rst.txt,:github_url: hide\nGradientTexture1D\n**Inheri...,"[0.22399283945560455, -0.2904146909713745, -0....",51.217575


In [48]:
for _, row in samples_df.iterrows():
    # print(f"DOCUMENT: {row.document}")
    print(f"SCORE: {row.scores}")
    print(f"FILE: {row.File}")
    print("=" * 50)
    print()

SCORE: 53.24528884887695
FILE: compute_shaders.rst.txt

SCORE: 52.14301300048828
FILE: class_visualshader.rst.txt

SCORE: 51.48065948486328
FILE: class_color.rst.txt

SCORE: 51.431373596191406
FILE: shaders_style_guide.rst.txt

SCORE: 51.21757507324219
FILE: class_gradienttexture1d.rst.txt

