## Mistral-7B

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Hub id of the instruction-tuned Mistral checkpoint used in this section.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Load the tokenizer, then the model with FP16 weights; device_map="auto"
# leaves device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

  _ = torch.tensor([0], device=i)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [2]:
# Environment check: PyTorch build, CUDA availability and (if any) the GPU name.
import torch
print(torch.__version__)
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # get_device_name() raises when no CUDA device is usable, so only query it
    # after the availability check has passed.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")

2.9.0.dev20250707+cu128
True
NVIDIA GeForce RTX 5080


In [2]:
# Read the whole prompt from prompt.txt (UTF-8). The text is used verbatim,
# including any trailing newline, because nothing strips it here.
with open("prompt.txt", "r", encoding="utf-8") as f:
    prompt = f.read()
# Tokenize the prompt and move the encoded inputs to the model's device.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [3]:
# When True, only the newly generated continuation is saved (whitespace
# stripped); when False, the decoded prompt is kept in front of it.
response_only = True

# Output the answer
outputs = model.generate(
    **inputs,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id
)
# The returned sequence starts with the prompt tokens. Drop the prompt at the
# token level rather than slicing the decoded string by len(prompt): decoding
# is not guaranteed to reproduce the prompt text character for character, so a
# character offset can cut into the answer or leave prompt residue behind.
prompt_len = inputs["input_ids"].shape[1] if response_only else 0
response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
if response_only:
    response = response.strip()
with open("output.txt", "w", encoding="utf-8") as f:
    f.write(response)
print("Generated and saved.")

Generated and saved.


## LLM with Retrieval-augmented generation (RAG)

In [1]:
# Download the dataset from https://huggingface.co/datasets/Amod/mental_health_counseling_conversations
# and print the first training example to see what a record looks like.

from datasets import load_dataset

dataset = load_dataset("Amod/mental_health_counseling_conversations")
print(dataset["train"][0])

Using the latest cached version of the dataset since Amod/mental_health_counseling_conversations couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Laplace\.cache\huggingface\datasets\Amod___mental_health_counseling_conversations\default\0.0.0\4672e03c7f1a7b2215eb4302b83ca50449ce2553 (last modified on Wed Jul  2 18:13:28 2025).


{'Context': "I'm going through some things with my feelings and myself. I barely sleep and I do nothing but think about how I'm worthless and how I shouldn't be here.\n   I've never tried or contemplated suicide. I've always wanted to fix my issues, but I never get around to it.\n   How can I change my feeling of being worthless to everyone?", 'Response': "If everyone thinks you're worthless, then maybe you need to find new people to hang out with.Seriously, the social context in which a person lives is a big influence in self-esteem.Otherwise, you can go round and round trying to understand why you're not worthless, then go back to the same crowd and be knocked down again.There are many inspirational messages you can find in social media. \xa0Maybe read some of the ones which state that no person is worthless, and that everyone has a good purpose to their life.Also, since our culture is so saturated with the belief that if someone doesn't feel good about themselves that this is someho

In [2]:
# Show the column names of the training split.
dataset["train"].column_names

['Context', 'Response']

In [3]:
# Report the installed FAISS version and whether this build exposes the GPU
# API (detected by the presence of `StandardGpuResources`).
import faiss
print(faiss.__version__)
print(hasattr(faiss, 'StandardGpuResources'))

1.7.2
True


In [4]:
import re, torch, faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM

# Retrieval stack: all-MiniLM-L6-v2 sentence embeddings + a flat FAISS index.

# Step 1 - load the training split and keep only the rows whose response
# contains a link.
ds = load_dataset("Amod/mental_health_counseling_conversations", split="train").filter(
    lambda row: "http" in row["Response"]
)

# questions[i] and answers[i] come from the same row, so the position of a
# retrieved question is also the position of its answer.
questions = [text.strip() for text in ds["Context"]]
answers = [text.strip() for text in ds["Response"]]

# Step 2 - embed the questions (normalised vectors) and store them in an
# inner-product index sized to the embedding dimension.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
q_vecs = embedder.encode(questions, batch_size=64, normalize_embeddings=True)
index = faiss.IndexFlatIP(q_vecs.shape[1])
index.add(q_vecs)

# Step 3 - the LLM and its tokenizer (FP16 weights, automatic device placement).
tok = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.3", device_map="auto", torch_dtype=torch.float16
)



Using the latest cached version of the dataset since Amod/mental_health_counseling_conversations couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\Laplace\.cache\huggingface\datasets\Amod___mental_health_counseling_conversations\default\0.0.0\4672e03c7f1a7b2215eb4302b83ca50449ce2553 (last modified on Wed Jul  2 18:13:28 2025).


Filter:   0%|          | 0/3512 [00:00<?, ? examples/s]

  return t.to(


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


In [12]:
def rag_chat(query: str, max_new: int = 128) -> str:
    """Answer ``query`` with the LLM, grounded on the best-matching stored answer.

    The query is embedded, the closest stored question is looked up in the
    FAISS index (top-1), and the answer paired with that question is placed in
    the prompt as "Knowledge". The full prompt is also written to
    ``prompt_rag.txt`` so it can be inspected afterwards.

    Args:
        query: The user's question.
        max_new: Maximum number of new tokens to generate.

    Returns:
        Only the generated continuation (prompt removed), with surrounding
        whitespace stripped.
    """
    query_vec = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    _, idx = index.search(query_vec, 1)      # top-1 neighbour
    context = answers[int(idx[0][0])]        # full answer, including its link

    prompt = (
        "You are an empathetic mental-health assistant. Use kind language and complete sentence. No need to repeat the question.\n\n"
        f"Knowledge (you should use the full link it contains):\n{context}\n\n"
        f"User: {query}\nAnswer:"
    )

    with open("prompt_rag.txt", "w", encoding="utf-8") as f:
        f.write(prompt)

    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tok.eos_token_id,
    )
    # The generated sequence begins with the prompt tokens. Cut them off by
    # token count instead of slicing the decoded text by len(prompt): decoding
    # does not have to reproduce the prompt character for character, so a
    # character offset can truncate the answer or leak prompt text into it.
    prompt_len = inputs["input_ids"].shape[1]
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()


In [13]:
# 5. Main: read the question from prompt.txt, answer it with rag_chat, write output.txt
# Example prompt: Every winter I find myself getting sad because of the weather. How can I fight this?
# Alternative prompt: The bright, warm summer always seem to bring my mood down. I am crazy!! I love eating pizza!!!! But how can I push back against that? Can I eat more pizza?

# The question is stripped of surrounding whitespace before retrieval.
with open("prompt.txt", "r", encoding="utf-8") as f:
    user_q = f.read().strip()

# rag_chat also writes the assembled prompt to prompt_rag.txt.
answer = rag_chat(user_q)

with open("output.txt", "w", encoding="utf-8") as f:
    f.write(answer)

print("Done! Check prompt_rag.txt and output.txt.")


Done! Check prompt_rag.txt and output.txt.


In [4]:
import faiss
import torch
import subprocess

# Environment report: FAISS build, PyTorch/CUDA status and the system nvcc.
print("FAISS Version:", faiss.__version__)
print("FAISS has GPU support:", hasattr(faiss, 'StandardGpuResources'))

print("\nPyTorch CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA device name:", torch.cuda.get_device_name(0))
    print("CUDA version (from PyTorch):", torch.version.cuda)

try:
    # Check system CUDA version from nvcc. Undecodable bytes are replaced so a
    # localized toolchain banner cannot abort the report.
    output = subprocess.check_output(['nvcc', '--version']).decode(errors="replace")
    print("\nnvcc version output:\n", output)
except FileNotFoundError:
    print("\n`nvcc` not found in PATH — you may not have installed CUDA toolkit properly.")
except subprocess.CalledProcessError as exc:
    # nvcc exists but exited with a non-zero status; report it instead of
    # crashing this purely informational cell.
    print(f"\n`nvcc --version` failed with exit code {exc.returncode}.")


FAISS Version: 1.7.2
FAISS has GPU support: True

PyTorch CUDA available: True
CUDA device name: NVIDIA GeForce RTX 5080
CUDA version (from PyTorch): 12.8

nvcc version output:
 nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:42:46_Pacific_Standard_Time_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0



## Experiment

In [1]:
# Model & tokenizer (simply same)
from transformers import (AutoTokenizer, AutoModelForCausalLM,
                          TextIteratorStreamer)
import torch, time, threading, sys
from pathlib import Path

# Checkpoint shared by the tokenizer and the model in this section.
MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
tok = AutoTokenizer.from_pretrained(MODEL)
llm = AutoModelForCausalLM.from_pretrained(
    MODEL, device_map="auto", torch_dtype=torch.float16
)
# Warm up with a single-token generation. pad_token_id is passed explicitly,
# like every other generate() call in this notebook, so the library does not
# emit the "Setting `pad_token_id` to `eos_token_id`" notice.
_ = llm.generate(
    **tok("hi", return_tensors="pt").to(llm.device),
    max_new_tokens=1,
    pad_token_id=tok.eos_token_id,
)

  _ = torch.tensor([0], device=i)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [2]:
# Build / load FAISS (LangChain) on questions
from datasets import load_dataset
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain.memory import ConversationBufferWindowMemory


# The same embedding model is needed both to build and to load the index, so
# it is created once up front.
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# save_local() writes qa_faiss.faiss and qa_faiss.pkl, and load_local() needs
# both; rebuild unless the complete pair is present.
if Path("qa_faiss.faiss").exists() and Path("qa_faiss.pkl").exists():
    # NOTE(security): load_local unpickles qa_faiss.pkl. That is acceptable
    # only because the file is produced by this notebook; never point this at
    # an index obtained from an untrusted source.
    vectordb = FAISS.load_local(".", emb, index_name="qa_faiss",
                                allow_dangerous_deserialization=True)
else:
    # Keep only the rows whose response contains a link, index the questions
    # and carry each paired answer along as document metadata.
    ds = load_dataset("Amod/mental_health_counseling_conversations", split="train")
    ds = ds.filter(lambda x: "http" in x["Response"])
    docs = [Document(page_content=q.strip(), metadata={"answer": a.strip()})
            for q, a in zip(ds["Context"], ds["Response"])]
    vectordb = FAISS.from_documents(docs, emb)
    vectordb.save_local(".", index_name="qa_faiss")

# Top-1 retrieval, plus a sliding window over the last 5 conversation turns.
retriever = vectordb.as_retriever(search_kwargs={"k": 1})
memory = ConversationBufferWindowMemory(k=5, ai_prefix="AI", human_prefix="User")


  emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  memory = ConversationBufferWindowMemory(k=5, ai_prefix="AI", human_prefix="User")


In [None]:
# Streaming chat
def stream_chat_from_txt(max_new=64, temp=0.7):
    """Run one streamed RAG chat turn driven by ``prompt.txt``.

    Reads the user message from ``prompt.txt``, retrieves the best-matching
    stored question and uses its paired answer as background knowledge,
    prepends the conversation history held in ``memory``, then streams the
    model's reply to stdout while timing retrieval, first-token latency and
    throughput. The assembled prompt is written to ``prompt_rag.txt``, the
    reply to ``output.txt``, and the turn is saved back into ``memory``.

    Args:
        max_new: Maximum number of new tokens to generate.
        temp: Sampling temperature.

    Raises:
        Exception: Any error raised by ``llm.generate`` in the worker thread
            is re-raised here; nothing is written or saved in that case.
    """
    # read user prompt
    user_q = Path("prompt.txt").read_text(encoding="utf-8").strip()

    # Alternative persona kept for reference (not part of the prompt below):
    #   You are a compassionate and conversational therapist trained in active listening.
    #   You ask thoughtful, open-ended questions and build emotional rapport.
    #   Use empathy, paraphrasing, and gentle nudging.

    history_block = memory.load_memory_variables({}).get("history", "")
    if history_block:
        history_block = "Conversation History:\n" + history_block + "\n\n"

    # RAG & timing (perf_counter is monotonic, unlike wall-clock time.time()).
    t_rag_start = time.perf_counter()
    doc = retriever.get_relevant_documents(user_q)[0]
    ctx = doc.metadata["answer"]
    full_prompt = (
        "You are an empathetic mental-health assistant. "
        "Use kind language and complete sentences. "
        "Do not repeat the question. \n\n"
        f"{history_block}"
        f"Knowledge you can refer to if you need:\n{ctx}\n\n"
        f"User: {user_q}\nAnswer:"
    )
    rag_latency = time.perf_counter() - t_rag_start
    print(f"\nRAG latency: {rag_latency:.2f} s")
    Path("prompt_rag.txt").write_text(full_prompt, encoding="utf-8")

    # generate (stream, first-token timer)
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    inputs = tok(full_prompt, return_tensors="pt").to(llm.device)
    gen_kwargs = dict(**inputs, streamer=streamer,
                      max_new_tokens=max_new,
                      temperature=temp,
                      do_sample=True,
                      pad_token_id=tok.eos_token_id)

    errors = []

    def _generate():
        # Runs in the worker thread. If generation fails, close the stream so
        # the consumer loop below terminates instead of waiting forever, and
        # keep the exception so it can be re-raised in the calling thread.
        try:
            llm.generate(**gen_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    worker = threading.Thread(target=_generate, daemon=True)
    # Start the clock before launching the worker so that first-token latency
    # covers everything that happens after generation is requested.
    t0 = time.perf_counter()
    worker.start()

    first = True
    pieces = []
    for chunk in streamer:
        if not chunk:
            continue
        if first:
            print(f"\nFirst token latency: {time.perf_counter()-t0:.2f} s\n")
            first = False
        sys.stdout.write(chunk)
        sys.stdout.flush()
        pieces.append(chunk)
    gen_time = time.perf_counter() - t0

    worker.join()
    if errors:
        raise errors[0]

    answer = "".join(pieces)
    n_tokens = len(tok.encode(answer, add_special_tokens=False))
    tokens_per_s = n_tokens / gen_time if gen_time > 0 else 0.0
    print(f"\n\nTokens per second: {tokens_per_s:.2f}")

    memory.save_context({"input": user_q}, {"output": answer})
    Path("output.txt").write_text(answer, encoding="utf-8")
    print("Written to output.txt")

In [13]:
# Run a single chat turn with the default settings when this code is executed
# as the top-level module.
if __name__ == "__main__":
    stream_chat_from_txt()


RAG latency: 0.01 s

First token latency: 0.51 s

I don't have personal preferences when it comes to food, as I am a computer program. However, I can appreciate a wide variety of dishes from different cultures, such as sushi, pasta, tacos, and curry. Enjoying food is part of maintaining a balanced and healthy lifestyle.

Tokens per second: 4.78
Written to output.txt
