<a href="https://colab.research.google.com/github/ComponentSoftTeam/Mistral-Kubectl-Instruct/blob/main/notebooks/kubectl_v2_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install transformers peft bitsandbytes torch scipy pinecone-client sentence_transformers tqdm openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.7/174.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m220.5/220.5 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.4/300.4 kB[0m [31m

## Set up semantic search

In [None]:
from getpass import getpass
import pinecone
from openai import OpenAI

# Ask for each API key only when it is not already defined in the notebook
# namespace, so re-running this cell does not prompt again.
if 'OPENAI_API_KEY' not in globals():
    OPENAI_API_KEY = getpass("OpenAI api key:")

if 'PINECONE_API_KEY' not in globals():
    PINECONE_API_KEY = getpass("Pinecone api key:")

# NOTE: `openai` is bound to a client *instance* here (not the module);
# create_embedding relies on this name.
openai = OpenAI(api_key=OPENAI_API_KEY)
pinecone.init(api_key=PINECONE_API_KEY, environment="gcp-starter")

  from tqdm.autonotebook import tqdm


OpenAI api key:··········
Pinecone api key:··········


In [None]:
import os
import pickle
import hashlib
import time
from torch import nn
import numpy as np
from sentence_transformers.cross_encoder import CrossEncoder

# Directory that holds the pickle files written by the `cached` decorator.
CACHE_DIR = './.cache'
# Name of the Pinecone index that is queried for documentation passages.
INDEX_NAME = 'k8s'

# exist_ok=True makes this idempotent without a separate existence check
# (no check-then-create race), and raises if the path exists but is not a
# directory instead of silently continuing.
os.makedirs(CACHE_DIR, exist_ok=True)


def cached(func):
    """Decorator that memoises ``func`` on disk under ``CACHE_DIR``.

    The cache key is the SHA-256 of the function name plus the ``str()`` of
    every positional and keyword argument, so arguments must have a stable,
    distinguishing string form.

    Behaviour:
      * A ``None`` result is never stored, and a stored ``None`` is treated as
        a miss. Functions that signal failure by returning ``None`` are
        therefore retried on the next call instead of replaying the failure.
      * An unreadable (truncated/corrupt) cache file is ignored and recomputed.
      * Cache files are written to a temporary name and renamed into place, so
        an interrupted write cannot leave a half-written entry behind.

    SECURITY NOTE: entries are read with ``pickle.load``; only use a cache
    directory whose contents were written by this notebook.
    """
    import functools  # local import keeps this definition self-contained

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        SEP = '$|$'
        cache_token = (
            f'{func.__name__}{SEP}'
            f'{SEP.join(str(arg) for arg in args)}{SEP}'
            f'{SEP.join(str(key) + SEP * 2 + str(val) for key, val in kwargs.items())}'
        )

        hex_hash = hashlib.sha256(cache_token.encode()).hexdigest()
        cache_filename: str = os.path.join(CACHE_DIR, f"{hex_hash}")

        if os.path.exists(cache_filename):
            try:
                with open(cache_filename, "rb") as cache_file:
                    cached_value = pickle.load(cache_file)
            except (EOFError, pickle.UnpicklingError):
                # Truncated or corrupt entry: fall through and recompute it.
                cached_value = None
            if cached_value is not None:
                return cached_value

        result = func(*args, **kwargs)
        if result is None:
            # Do not persist "no result"; let the next call try again.
            return None

        # The directory may have been removed since the notebook started.
        os.makedirs(CACHE_DIR, exist_ok=True)
        tmp_filename = f"{cache_filename}.{os.getpid()}.tmp"
        with open(tmp_filename, "wb") as cache_file:
            pickle.dump(result, cache_file)
        os.replace(tmp_filename, cache_filename)  # atomic publish

        return result
    return wrapper

@cached
def create_embedding(text: str):
    """Return the ``text-embedding-ada-002`` embedding of ``text``.

    The request is attempted up to four times, pausing three seconds between
    attempts. Any exception raised by the API call is printed and treated as
    a retryable failure.

    Args:
        text: The string to embed.

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` if every attempt failed. Calls go through the ``cached``
        decorator.
    """
    max_attempts = 4
    retry_delay_seconds = 3

    for attempt in range(1, max_attempts + 1):
        try:
            res = openai.embeddings.create(input=text, model="text-embedding-ada-002")
        except Exception as ex:
            print(f"Unknown error: {str(ex)}")
            # Only wait when another attempt will actually follow.
            if attempt < max_attempts:
                time.sleep(retry_delay_seconds)
        else:
            return res.data[0].embedding

    return None


# Handle to the Pinecone index named by INDEX_NAME; query_from_pinecone reads this module-level name.
index = pinecone.Index(INDEX_NAME)
def query_from_pinecone(query, top_k=3):
    """Embed ``query`` and fetch its nearest neighbours from the Pinecone index.

    Args:
        query: Text to search for.
        top_k: Number of matches to request.

    Returns:
        The ``'matches'`` entry of the index response (metadata included),
        or ``None`` when no embedding could be produced for ``query``.
    """
    query_vector = create_embedding(query)
    if query_vector:
        response = index.query(
            vector=query_vector,
            top_k=top_k,
            include_metadata=True,  # the passage text lives in the metadata
        )
        return response.get('matches')

    return None



# Cross-encoder used by get_results_from_pinecone to re-score (query, passage) pairs.
# Instantiated once when this cell runs (the recorded output shows the model files being downloaded).
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
def get_results_from_pinecone(query, top_k=3, re_rank=True, verbose=True):
    """Retrieve matches for ``query`` and optionally re-rank them.

    Args:
        query: Text to search for.
        top_k: Number of matches requested from Pinecone.
        re_rank: When true, order the matches by descending cross-encoder
            score; otherwise keep the order returned by Pinecone.
        verbose: When true, print the query and one tab-separated row per
            match (id, retrieval score, cross-encoder score if re-ranking,
            first 50 characters of the text).

    Returns:
        A list of match objects, or ``[]`` when retrieval returned nothing.
    """
    matches = query_from_pinecone(query, top_k=top_k)
    if not matches:
        return []

    if verbose:
        print("Query:", query)

    # Plain retrieval order: nothing to score, just echo and copy.
    if not re_rank:
        if verbose:
            print('Document ID (Hash)\t\tRetrieval Score\tText')
            for match in matches:
                print(f"{match['id']}\t{match['score']:.2f}\t{match['metadata']['text'][:50]}")
        return list(matches)

    if verbose:
        print('Document ID (Hash)\t\tRetrieval Score\tCE Score\tText')

    # Score every (query, passage) pair with the cross-encoder.
    pairs = [[query, match['metadata']['text']] for match in matches]
    ce_scores = cross_encoder.predict(pairs, activation_fct=nn.Sigmoid())

    # argsort is ascending, so walk it backwards for best-first order.
    ranked = []
    for position in np.argsort(ce_scores)[::-1]:
        match = matches[position]
        ranked.append(match)
        if verbose:
            print(f"{match['id']}\t{match['score']:.2f}\t{ce_scores[position]:.2f}\t{match['metadata']['text'][:50]}")

    return ranked

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
def semantic_search(prompt):
    """Return documentation context for ``prompt`` as a single string.

    Nine candidates are retrieved and re-ranked (verbosely); the text of the
    three best is stripped and joined with blank lines. An empty string is
    returned when nothing was retrieved.
    """
    ranked = get_results_from_pinecone(prompt, top_k=9, re_rank=True, verbose=True)
    if ranked:
        top_passages = [hit['metadata']['text'].strip() for hit in ranked[:3]]
        return '\n\n'.join(top_passages)

    return ""

## Set up Mistral

In [None]:
from transformers.generation.stopping_criteria import StoppingCriteria, StoppingCriteriaList
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch
from sentence_transformers import SentenceTransformer

# Hugging Face id of the base causal LM that is loaded (quantised) further down.
base_model_id = 'mistralai/Mistral-7B-Instruct-v0.1'
# Id of the adapter applied on top of the base model; the tokenizer is also loaded from this id.
lora_model_id = 'ComponentSoft/mistral-kubectl-instruct'

In [None]:
# Load the tokenizer from the adapter repository (lora_model_id), not from the base model.
# Configured for left padding, with BOS insertion requested and EOS insertion disabled.
tokenizer = AutoTokenizer.from_pretrained(
    lora_model_id,
    padding_side="left",
    add_eos_token=False,
    add_bos_token=True,
)
# Reuse the EOS token as the padding token.
# NOTE(review): presumably needed because the tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

In [None]:
# bitsandbytes quantisation settings passed to the base-model load below:
# 4-bit weights, "nf4" quant type, double quantisation enabled, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
# Load the base model with the quantisation settings from bnb_config.
# The recorded output shows two checkpoint shards being downloaded.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    use_cache=True,
    # NOTE(review): trust_remote_code=True permits executing Python code shipped in the
    # model repository. Confirm it is actually required for this model; drop it if not.
    trust_remote_code=True,
)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Wrap the quantised base model with the adapter weights from lora_model_id.
model = PeftModel.from_pretrained(base_model, lora_model_id)
# Switch to evaluation mode. As the cell's last expression, this also displays the
# model repr (see the recorded output below).
model.eval()

adapter_config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

adapter_model.bin:   0%|          | 0.00/170M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear4bit(
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
              )
              (k_proj): Linear4bit(
                (lora_dropout): ModuleDict(

In [None]:
def create_stop_criterion(*args):
    """Build a stopping criterion that fires when the output ends with a given term.

    Each term in ``args`` is tokenised once (without special tokens). During
    generation the criterion returns ``True`` as soon as the token ids of the
    first sequence in the batch end with the token ids of any term.

    The term tensors are kept on CPU and moved lazily to whatever device the
    generated ids live on, so building the criterion does not require a GPU
    and it works regardless of where the model was placed.

    Args:
        *args: Stop strings, e.g. ``"</s>"`` or ``"#End"``. Terms that
            tokenise to nothing are ignored.

    Returns:
        A ``StoppingCriteria`` instance usable in a ``StoppingCriteriaList``.
    """
    term_tokens = [
        torch.tensor(tokenizer.encode(term, add_special_tokens=False), dtype=torch.long)
        for term in args
    ]
    # An empty term can never be a meaningful suffix; drop it up front.
    term_tokens = [tokens for tokens in term_tokens if tokens.numel() > 0]

    # device -> term tensors already moved to that device (filled on first use).
    terms_by_device = {}

    class CustomStopCriterion(StoppingCriteria):
        def __call__(self, input_ids: torch.LongTensor, score: torch.FloatTensor, **kwargs):
            # Only the first sequence of the batch is inspected.
            sequence = input_ids[0]
            terms = terms_by_device.get(sequence.device)
            if terms is None:
                terms = [tokens.to(sequence.device) for tokens in term_tokens]
                terms_by_device[sequence.device] = terms
            # A slice shorter than the term has a different shape, so
            # torch.equal is simply False for too-short sequences.
            return any(torch.equal(tokens, sequence[-len(tokens):]) for tokens in terms)

    return CustomStopCriterion()

# Used for the answer pass: stop on "</s>" or once the "#End" marker has been produced.
eval_stop_criterion = create_stop_criterion("</s>", "#End")
# Used for the classification pass: stop on "</s>" or a newline.
category_stop_criterion = create_stop_criterion("</s>", "\n")

# Markers text_to_text_generation uses to build the prompt and to cut the answer
# out of the decoded model output.
start_template = '### Answer:'
command_template = '# Command:'
end_template = '#End'

In [None]:
def text_to_text_generation(prompt):
    prompt = prompt.strip()

    is_kubectl_prompt = (
        f"You are a helpful assistant who classifies prompts into three categories. [INST] Respond with 0 if it pertains to a 'kubectl' operation. This is an instruction that can be answered with a 'kubectl' action. Look for keywords like 'get', 'list', 'create', 'show', 'view', and other command-like words. This category is an instruction instead of a question. Respond with 1 only if the prompt is a question, and is about a definition related to Kubernetes, or non-action inquiries. Respond with 2 every other scenario, for example if the question is a general question, not related to Kubernetes or 'kubectl'.\n"
        f"So for instance the following:\n"
        f'text: "List all pods in Kubernetes"\n'
        f"Would get a response:\n"
        f'response (0/1/2): 0 [/INST] \n'
        f'text: "{prompt}"'
        f'response (0/1/2): '
    )


    model_input = tokenizer(is_kubectl_prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        response = tokenizer.decode(model.generate(**model_input, max_new_tokens=8, pad_token_id=tokenizer.eos_token_id, repetition_penalty=1.15, stopping_criteria=StoppingCriteriaList([category_stop_criterion]))[0], skip_special_tokens=True)
    response = response[len(is_kubectl_prompt):]

    print(f'{" Query Start ":-^40}')
    print('Classified as: ' + response)

    response_num = 0 if '0' in response else ( 1 if '1' in response else 2 )

    match response_num:
        case 0:
          prompt = f'[INST] {prompt}\n Lets think step by step. [/INST] {start_template}'

        case 1:
          retrieved_results = semantic_search(prompt)
          prompt = f'You are a helpful kubernetes professional.  [INST] Use the following documentation, if it is relevant to answer the question bellow. [/INST]\nDocumentation: {retrieved_results} </s>\n<s> [INST] Answer the following question: {prompt} [/INST]\nAnswer: '

        case _:
          prompt = f'[INST] {prompt} [/INST]'

    print('Query:')
    print(prompt)


    # Generate output
    model_input = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
      response = tokenizer.decode(model.generate(**model_input, max_new_tokens=256, pad_token_id=tokenizer.eos_token_id, repetition_penalty=1.15, stopping_criteria=StoppingCriteriaList([eval_stop_criterion]))[0], skip_special_tokens=True)


    start = response.index(start_template) + len(start_template) if start_template in response else len(prompt)
    start = response.index(command_template) + len(command_template) if command_template in response else start
    end = response.index(end_template) if end_template in response else len(response)

    if response_num == 1:
        start -= 3 + 4 # Accounting for the two special tokens


    true_response = response[start:end].strip()
    print('Returned: ' + true_response)
    print(f'{" QUERY END ":-^40}')


    return true_response.strip()

## Get results

In [None]:
# Example run with a definition-style question; the recorded output below shows it
# classified as 1 and answered with retrieved documentation.
res = text_to_text_generation('What is a stateless service?')
print('Response:\n' + res)

------------- Query Start --------------
Classified as: 1
Query: What is a stateless service?
Document ID (Hash)		Retrieval Score	CE Score	Text
96	0.89	1.00	The Stateless Service pattern describes how to cre
102	0.84	0.99	A complex distributed system is usually composed o
97	0.88	0.98	The microservices architecture style is the domina
113	0.84	0.98	Declares this Service as headless. Stateless Pods 
104	0.84	0.97	We have seen many Kubernetes primitives for creati
103	0.86	0.88	Distributed stateful applications require features
92	0.83	0.08	As the name suggests, this mechanism relies on a m
106	0.83	0.03	Similar to the storage requirements, a distributed
107	0.82	0.00	As you can see from the preceding requirements, cl
Query:
You are a helpful kubernetes professional.  [INST] Use the following documentation, if it is relevant to answer the question bellow. [/INST]
Documentation: The Stateless Service pattern describes how to create and operate applications that are composed of identical e