## Mistral-7B

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint used throughout this section: Mistral-7B-Instruct-v0.3
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the tokenizer, then the weights in FP16. device_map="auto" leaves device
# placement to the library (the cell output shows some layers offloaded to the CPU).
tokenizer = AutoTokenizer.from_pretrained(model_name)
load_options = {"torch_dtype": torch.float16, "device_map": "auto"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

  _ = torch.tensor([0], device=i)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
# Check: report the installed PyTorch build and the GPU it can see.
import torch
print(torch.__version__)
cuda_ok = torch.cuda.is_available()
print(cuda_ok)
if cuda_ok:
    # Only query the device name when CUDA is usable; the call raises otherwise.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; running on CPU.")

2.9.0.dev20250701+cu128
True
NVIDIA GeForce RTX 5080


In [None]:
# Read the whole prompt file, then tokenize it into PyTorch tensors
# placed on the same device as the model.
with open("prompt.txt", "r", encoding="utf-8") as prompt_file:
    prompt = prompt_file.read()
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to(model.device)

In [None]:
# When True, only the newly generated text is saved (the echoed prompt is dropped).
response_only = True

# Output the answer
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,          # sampling: the output differs from run to run
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)

if response_only:
    # The generated sequence starts with the prompt tokens. Cut at the prompt's
    # token length instead of slicing the decoded string by len(prompt): the
    # decoded text is not guaranteed to reproduce the prompt character-for-character,
    # so a character offset can land in the wrong place.
    prompt_len = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()
else:
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

with open("output.txt", "w", encoding="utf-8") as f:
    f.write(response)
print("Generated and saved.")

Generated and saved.


## LLM with Retrieval-Augmented Generation (RAG)

In [8]:
# Dataset source: https://huggingface.co/datasets/Amod/mental_health_counseling_conversations
# Download it and print the first training example to see what a row looks like.

from datasets import load_dataset

DATASET_ID = "Amod/mental_health_counseling_conversations"
dataset = load_dataset(DATASET_ID)
first_row = dataset["train"][0]
print(first_row)

{'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", 'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is someho

In [9]:
# Show the column names of the training split.
train_split = dataset["train"]
train_split.column_names

['Context', 'Response']

In [10]:
import faiss

# Report the installed FAISS version, then whether this build exposes
# the `StandardGpuResources` attribute.
has_gpu_resources = hasattr(faiss, 'StandardGpuResources')
print(faiss.__version__)
print(has_gpu_resources)

1.11.0
False


In [11]:
import re, torch, faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Embedding model used below: all-MiniLM-L6-v2

# 1. Keep only the rows whose response contains a link
def _has_link(row):
    """Return True when the row's response text contains "http"."""
    return "http" in row["Response"]

ds = load_dataset("Amod/mental_health_counseling_conversations", split="train")
ds = ds.filter(_has_link)

# Fill both lists in a single pass so that questions[i] and answers[i]
# always come from the same row.
questions, answers = [], []
for row in ds:
    questions.append(row["Context"].strip())
    answers.append(row["Response"].strip())

# 2. Embed the questions and index them with FAISS
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
q_vecs = embedder.encode(questions, batch_size=64, normalize_embeddings=True)
embedding_dim = q_vecs.shape[1]
index = faiss.IndexFlatIP(embedding_dim)   # inner-product index over the normalized vectors
index.add(q_vecs)

# 3. LLM
# NOTE(review): this loads the same checkpoint that the first cell of the
# notebook already loaded into `model`; consider reusing it to save memory.
llm_name = "mistralai/Mistral-7B-Instruct-v0.3"
tok = AutoTokenizer.from_pretrained(llm_name)
model = AutoModelForCausalLM.from_pretrained(
    llm_name,
    device_map="auto",
    torch_dtype=torch.float16,
)



  return t.to(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [None]:
def rag_chat(query: str, max_new: int = 256) -> str:
    """Answer `query` with the LLM, using the closest stored answer as context.

    The query is embedded with the module-level `embedder`, the single nearest
    question is looked up in `index`, and the answer stored at the same position
    in `answers` is inserted into the prompt as knowledge. The full prompt is
    also written to ``prompt_rag.txt`` so it can be inspected afterwards.

    Args:
        query: The user's question.
        max_new: Maximum number of new tokens to generate.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace. Sampling is enabled, so repeated calls can
        return different text.
    """
    qv = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    _, idx = index.search(qv, 1)            # top-1
    context = answers[int(idx[0][0])]       # Full answer with the link

    prompt = (
        "You are an empathetic mental-health assistant. Use kind language. No need to repeat the question.\n\n"
        f"Knowledge (you should use the full link it contains):\n{context}\n\n"
        f"User: {query}\nAnswer:"
    )

    with open("prompt_rag.txt", "w", encoding="utf-8") as f:
        f.write(prompt)

    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tok.eos_token_id,
    )
    # The generated sequence begins with the prompt tokens. Drop them by token
    # count rather than slicing the decoded string by len(prompt): decoding is
    # not guaranteed to reproduce the prompt character-for-character, so a
    # character offset can cut into (or leave part of the prompt before) the answer.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()


In [19]:
# 5. Main: read the question from prompt.txt, answer it with rag_chat, write output.txt
# Sample question: Every winter I find myself getting sad because of the weather. How can I fight this?
# Alt: The bright, warm summer always seem to bring my mood down. I am crazy!! I love eating pizza!!!! But how can I push back against that? Can I eat more pizza?

with open("prompt.txt", "r", encoding="utf-8") as question_file:
    raw_question = question_file.read()
user_q = raw_question.strip()

answer = rag_chat(user_q)

with open("output.txt", "w", encoding="utf-8") as answer_file:
    answer_file.write(answer)

print("Done! Check prompt_rag.txt and output.txt.")


Done! Check prompt_rag.txt and output.txt.
