If you're not running in Saturn Cloud, you need to install these libraries:

Make sure you use the latest versions

```
pip install -U transformers accelerate bitsandbytes
```

In [1]:
import os

os.environ['HF_HOME'] = '/run/cache/'

In [2]:
# -- Rodar só uma vez, para baixar
#!rm -f minsearch.py
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [3]:
import requests 
import minsearch

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f92ba79ea90>

In [4]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [5]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [6]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

#### Baixando o llm
###### Link do código/modelo: https://huggingface.co/mistralai/Mistral-7B-v0.1

In [7]:
!df -h # Checa espaços disponíveis para bixar o modelo (~10gb)

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   36G   65G  36% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme0n1p1  100G   36G   65G  36% /run
tmpfs            14G     0   14G   0% /dev/shm
/dev/nvme2n1    2.0G  159M  1.8G   9% /home/jovyan
tmpfs            14G  120K   14G   1% /home/jovyan/.saturn
tmpfs            14G   12K   14G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G  3.0M  7.7G   1% /run/nvidia-persistenced/socket
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


In [8]:
os.environ['HF_HOME']

'/run/cache/'

In [10]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from huggingface_hub import login

# Autenticação:

# hugging face -> log in (meu gmail) -> settings -> access tokens -> create (type=read)

In [11]:
os.environ['HF_TOKEN'] = 'hf_token_copiada_do_site_hf' # Já fiz - pra não vazar- , não rodar para não alterar

In [12]:
login(token=os.environ['HF_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


In [13]:
# Baixando o modelo mistral 7b
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [16]:
# Exemplo - GENÉRICO
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'A list of colors: red, blue, green, yellow, orange, purple, pink,'

In [27]:
## Fluxo COM RAG

# Flucxo COM RAG

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}

ANSWER: 

""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm_mistral_7b(prompt):
    
    model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    generated_ids = model.generate(**model_inputs, max_new_tokens=500)
    result = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return result

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm_mistral_7b(prompt)
    return answer


In [29]:
# Pergunta com RAG
# Esse modelo é meio ruim pra essa tarefa, por ter um prompt longo, que vem "carregado" na resposta
resposta = rag('I just discovered the course. Can I still join it?')
resposta

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: I just discovered the course. Can I still join it?\n\nCONTEXT: \nsection: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - Can I follow the course after it finishes?\nanswer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.\n\nsection: General

In [35]:
prompt = build_prompt('I just discovered the course. Can I still join it?', search('I just discovered the course. Can I still join it?'))
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: I just discovered the course. Can I still join it?\n\nCONTEXT: \nsection: General course-related questions\nquestion: Course - Can I still join the course after the start date?\nanswer: Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.\n\nsection: General course-related questions\nquestion: Course - Can I follow the course after it finishes?\nanswer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.\n\nsection: General

In [37]:
print(resposta)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: I just discovered the course. Can I still join it?

CONTEXT: 
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related q

In [39]:
# TERIA QUE DIMINUIR O CONTEXTO / PROMPT OU AUMENTAR MUITO OS PARÂMETROS DE MAX_LENGTH PARA ENTRADA E/OU SAÍDA
# MODELO NÂO ADEQUADO PARA ESSA TASK

In [10]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

torch.random.manual_seed(0) 

<torch._C.Generator at 0x7fda362628d0>

In [11]:
# Baixa o modelo Phi-3
model = AutoModelForCausalLM.from_pretrained( 
    "microsoft/Phi-3-mini-128k-instruct",  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
) 

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct") 


config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [12]:
# Pipeline do modelo
pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
) 


In [13]:
# Parâmetros da chamada - EXEMPLO
messages = [ 
    {"role": "system", "content": "You are a helpful AI assistant."}, 
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."}, 
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"}, 
] 



generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])


The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


 To solve the equation 2x + 3 = 7, follow these steps:

1. Subtract 3 from both sides of the equation:
   2x + 3 - 3 = 7 - 3
   2x = 4

2. Divide both sides of the equation by 2:
   2x/2 = 4/2
   x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [14]:
# Pergunta do curso - datatalks - sem rag 

messages = [ 
    {"role": "user", "content": "I just discovered the course. Can I still join?"}
] 



generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.0, 
    "do_sample": False, 
} 

# -- RESPOSTA GENÉRICA
output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])


 I'm sorry, but it seems you've missed the opportunity to join the course. However, I can help you find similar courses or provide information on how to enroll in future sessions. Would you like me to assist you with that?


In [17]:
# Flucxo COM RAG

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm_phi3(prompt):
    messages = [ 
    {"role": "user", "content": prompt}
] 



    generation_args = { 
        "max_new_tokens": 500, 
        "return_full_text": False, 
        "temperature": 0.0, 
        "do_sample": False, 
    }
    
    #input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to("cuda")
    output = pipe(messages, **generation_args) 
    result = output[0]['generated_text']

    return result

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm_phi3(prompt)
    return answer


In [18]:
# Resposta COM RAG
rag("I just discovered the course. Can I still join?")

' Yes, you can still join the course even if you discover it after the start date. You are still eligible to submit the homeworks, but remember to meet the deadlines for the final projects.'