In [None]:
# Delete any existing local minsearch.py, then download the current version
# from the main branch of the minsearch repository so `import minsearch` works.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-11-06 17:46:41--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-11-06 17:46:41 (71.6 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [None]:
import requests
import minsearch

# Download the course FAQ documents as JSON.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever if the host never answers.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into one list, tagging every document with the
# name of the course it belongs to. Each document is copied rather than
# modified in place, so `documents_raw` stays exactly as downloaded and the
# cell gives the same result when it is re-run.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

# "question", "text" and "section" are registered as text fields; "course" is
# registered as a keyword field (search() filters on it).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7bde957271c0>

In [None]:
def search(query, course='data-engineering-zoomcamp', num_results=3):
    """Look up the FAQ entries that best match ``query`` in the global ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded here.
        num_results: Number of results requested from the index (default 3,
            the value that used to be hard-coded here).

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Per-field boost factors handed to the index: question 3.0, section 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
def rag(query):
    """Answer ``query`` by chaining retrieval, prompt construction and generation.

    The query is passed to ``search``; the query and the retrieved results go
    to ``build_prompt``; the resulting prompt goes to ``llm``, whose return
    value is returned unchanged.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

### Hugging Face login

In [None]:
from google.colab import userdata
# userdata.get('HF_TOKEN')

In [None]:
import os
# Read the HF_TOKEN secret from Colab's userdata store and expose it as an
# environment variable, so the token itself never appears in the notebook.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')

In [None]:
from huggingface_hub import login

In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable.
login(token=os.environ['HF_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Mistral-7B

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U accelerate
!pip install -q -U transformers

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig

In [None]:
# Checkpoint shared by the model and the tokenizer.
checkpoint = "mistralai/Mistral-7B-v0.1"

# Set up the quantization configuration (4-bit loading, double quantization,
# float16 compute dtype).
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the model with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=quantization_config,
    device_map="auto",
)

# Load the tokenizer, configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side="left")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from transformers import pipeline
# Wrap the loaded model and tokenizer in a text-generation pipeline; llm()
# calls this `generator` object to produce answers.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

### How Mistral generates answers

In [None]:
# model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")

In [None]:
# generated_ids = model.generate(**model_inputs)
# tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

### Combine the LLM with RAG

In [None]:
# def build_prompt(query, search_results):
#     prompt_template = """
# You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
# Use only the facts from the CONTEXT when answering the QUESTION.

# QUESTION: {question}

# CONTEXT:
# {context}
# """.strip()

#     context = ""

#     for doc in search_results:
#         context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

#     prompt = prompt_template.format(question=query, context=context).strip()
#     return prompt

# def llm(prompt):
#     model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
#     generated_ids = model.generate(**model_inputs, max_new_tokens=512)
#     result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
#     return result

In [None]:
# rag("I just discovered the course. Can I still join it?")

In [None]:
# print(_)

In [None]:
def build_prompt(query, search_results):
    """Render the prompt text for ``query`` and the retrieved documents.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each providing the keys
            ``'question'`` and ``'text'``.

    Returns:
        The filled-in template with surrounding whitespace stripped. It ends
        with ``ANSWER:`` so the model continues from there.
    """
    # One "question / text" entry per document, each followed by a blank line.
    context = "".join(
        f"{entry['question']}\n{entry['text']}\n\n" for entry in search_results
    )

    template = """
QUESTION: {question}

CONTEXT:
{context}

ANSWER:
""".strip()

    return template.format(question=query, context=context).strip()

def llm(prompt, max_new_tokens=500):
    """Generate a completion for ``prompt`` with the global ``generator`` pipeline.

    Args:
        prompt: Full prompt text, e.g. the output of ``build_prompt``.
        max_new_tokens: Upper bound on the number of tokens generated *after*
            the prompt. Defaults to 500.

    Returns:
        The generated continuation only (the prompt is not included), with
        surrounding whitespace stripped.

    Note:
        Sampling is enabled, so repeated calls with the same prompt can return
        different answers unless a seed is set beforehand.
    """
    outputs = generator(
        prompt,
        # `max_length` would count the prompt tokens as well, so a long RAG
        # prompt could leave no room for the answer; bound only the new tokens.
        max_new_tokens=max_new_tokens,
        # temperature/top_p only take effect when sampling is switched on.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        num_return_sequences=1,
        # Let the pipeline drop the prompt instead of slicing by len(prompt).
        return_full_text=False,
    )
    return outputs[0]['generated_text'].strip()

In [None]:
# End-to-end check: retrieve FAQ entries, build the prompt and generate an answer.
rag("I just discovered the course. Can I still join it?")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'Yes, you can still join the course.'