In [21]:
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Report size / used / available space for each mounted filesystem.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay          32G   19G   12G  62% /
tmpfs            64M     0   64M   0% /dev
shm              64M  4.0K   64M   1% /dev/shm
/dev/root        29G   24G  5.5G  82% /vscode
/dev/loop3       32G   19G   12G  62% /workspaces
/dev/sdb1        44G  8.1G   34G  20% /tmp


In [23]:
import os

# Point the Hugging Face home (cache, saved token) at /tmp/cache/ for this process.
os.environ.update({'HF_HOME': '/tmp/cache/'})

In [24]:
# Load variables from the .env file into the process environment, then read
# the Hugging Face API key from it (None when HF_API_KEY is not set).
from dotenv import load_dotenv

load_dotenv()
hf_token = os.environ.get('HF_API_KEY')

In [25]:
# Log in to the Hugging Face Hub using the token read from HF_API_KEY.
from huggingface_hub import login

login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /tmp/cache/token
Login successful


In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Single source of truth for the checkpoint so the model and tokenizer
# cannot drift apart.
MODEL_ID = "mistralai/Mistral-7B-v0.1"

# torch_dtype="auto" loads the weights in the dtype stored in the checkpoint
# rather than the float32 default, which lowers the memory needed when the
# checkpoint is stored in half precision.
# NOTE(review): the mmap failure recorded for this cell is a host-memory
# limit; confirm the machine has enough RAM/VRAM for the 7B weights.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


Downloading shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:20<00:00, 40.01s/it]
Loading checkpoint shards:   0%|                                                                                                         | 0/2 [00:00<?, ?it/s]


RuntimeError: unable to mmap 9942981696 bytes from file </tmp/cache/hub/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/model-00001-of-00002.safetensors>: Cannot allocate memory (12)

In [6]:
import requests 
import minsearch

# Download the FAQ documents. A timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() surfaces HTTP errors instead of
# attempting to parse an error page as JSON.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course grouping into one list, tagging every document with
# the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# Text fields are searched as free text; "course" is used as a keyword filter.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7c8b481713c0>

In [7]:
def search(query):
    """Look up ``query`` in the module-level ``index`` and return its results.

    The lookup is restricted to the ``data-engineering-zoomcamp`` course,
    boosts the ``question`` field by 3.0 and the ``section`` field by 0.5,
    and requests 5 results.
    """
    search_options = {
        'filter_dict': {'course': 'data-engineering-zoomcamp'},
        'boost_dict': {'question': 3.0, 'section': 0.5},
        'num_results': 5,
    }
    return index.search(query=query, **search_options)

In [8]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for ``query``.

    Each item of ``search_results`` must provide the keys ``section``,
    ``question`` and ``text``; they are rendered as consecutive
    ``section:/question:/answer:`` entries and placed in the CONTEXT slot of
    the template. Leading and trailing whitespace of the final prompt is
    stripped.
    """
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()

    # One formatted entry per retrieved document, concatenated in order.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    return prompt_template.format(question=query, context=context).strip()

In [None]:
def llm(prompt):
    """Generate text for ``prompt`` with the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: The full prompt text to feed to the model.

    Returns:
        str: The decoded text of the single generated sequence, exactly as
        produced by ``tokenizer.batch_decode``.
    """
    # Place the inputs on the device the model already lives on instead of
    # hard-coding "cuda" or moving the model, which was loaded with a device map.
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True)

    # batch_decode yields one string per sequence; return that whole string
    # (indexing into it again would give only its first character).
    return tokenizer.batch_decode(generated_ids)[0]

In [None]:
def rag(query):
    """Answer ``query``: search for documents, build the prompt, and return the LLM output."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [None]:
# Run the search -> prompt -> LLM pipeline on a sample question.
rag("The course has already started, Can I still join in?")