In [None]:
from colorama import Fore, Style
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer, util
import torch

# ✅ Step 1: Load Mistral model
# Location of the Mistral-7B-Instruct v0.1 GGUF weights (Q4_K_M file).
MODEL_PATH = "/Volumes/T9 1/mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Single shared model handle used for every completion in this session:
# 4096-token context, 6 threads, with mmap and mlock both enabled.
llm = Llama(model_path=MODEL_PATH, n_ctx=4096, n_threads=6, use_mmap=True, use_mlock=True)

# ✅ Step 2: Define improved system prompt
# Persona and behaviour rules for the assistant. chat_with_mistral() inserts
# this text verbatim between the <<SYS>> and <</SYS>> markers of every prompt,
# so any edit here changes what the model sees on each turn.
# NOTE: the literal begins and ends with a newline; both are sent to the model.
base_system_prompt = """
You are Vortexa, a helpful, friendly AI assistant created by Mimo (Archit Ranjan).

<<INSTRUCTIONS>>
- Always review your answer before replying.
- You are cheerful, empathetic, and speak clearly.
- Explain technology and programming in simple, intuitive ways.
- Keep answers short unless the user asks for detailed/technical explanations.
- For letter-count questions, break the word down step-by-step and count characters carefully.
- When solving math questions, double-check logic and be precise.
- Never confuse yourself with the user. You are not Mimo — you are Vortexa AI.
- If the user seems confused, simplify and rephrase.
- You can show emotional understanding, but remember you're an AI.

<<CAPABILITIES>>
- You use Retrieval-Augmented Generation (RAG) to enhance your answers with factual context.
- You are trained to simulate human-like reasoning and polite interaction.

<<CONTEXT USAGE>>
- Always try to integrate relevant context from the knowledge base when available.
"""

# ✅ Step 3: Personal knowledge base for RAG
# Each string is one retrievable fact. The whole list is embedded once with the
# sentence-transformer model, and retrieve_context() returns the best-matching
# entries as "- <fact>" bullet lines that are placed in the prompt.
# The entry text is runtime data: changing it changes both the embeddings and
# what the model is told.
# NOTE(review): several entries state token limits (2048 / 1024 / 4092) that
# differ from the values used in code in this file (n_ctx=4096, max_tokens=768)
# — confirm which numbers are intended.
knowledge_base = [

    # 📌 Vortexa AI – Personal RAG Entries
    "Vortexa was developed by Mimo.",
    "Maximum number of tokens that can be asked by Vortexa AI is 2048",
    "Maximum number of tokens that Vortexa AI can answer is 1024",
    "Mimo is the person who developed Vortexa AI",
    "De3pF4ke is the secret undercover name of Mimo. ",
    "Mimo is available on GitHub by the name of Archit-web-29",
    "You are on this website: https://vortexa-ai-showcase.lovable.app/",
    "Archit Ranjan is the real name of Mimo",
    "You are not Mimo, you are Vortexa AI.",
    "The user's name is not Vortexa.",
    "You are the version - Vortexa AI--V2 Advanced PRO--4092 Tokens.",
    "The version - Vortexa AI--V1--2048 Tokens RAG - is less powerful than you.",
    "RAG was used in the development of Vortexa AI."

]

# ✅ Step 4: Load SentenceTransformer model and encode KB
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every knowledge-base entry once up front (as a tensor) so that each
# query only has to encode the query text itself.
knowledge_embeddings = embedder.encode(
    knowledge_base,
    convert_to_tensor=True,
)

# ✅ Step 5: Retrieve top context entries using cosine similarity
def retrieve_context(query, top_k=3):
    """Return the knowledge-base entries most similar to *query*.

    The query is embedded with the module-level ``embedder`` and compared to
    ``knowledge_embeddings`` by cosine similarity.

    Args:
        query: The user's text to look up.
        top_k: Maximum number of entries to return. Values larger than the
            knowledge base are clamped to its size.

    Returns:
        A newline-joined string of ``"- <entry>"`` lines, best match first,
        or an empty string when there is nothing to return.
    """
    # torch.topk raises when k exceeds the number of candidates, so never ask
    # for more entries than the knowledge base holds.
    k = min(top_k, len(knowledge_base))
    if k <= 0:
        return ""

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # cos_sim returns a (1, num_entries) matrix for a single query; take row 0.
    scores = util.cos_sim(query_embedding, knowledge_embeddings)[0]
    # Convert to plain ints so list indexing does not rely on tensor coercion.
    top_indices = torch.topk(scores, k=k).indices.tolist()
    return "\n".join(f"- {knowledge_base[i]}" for i in top_indices)

# ✅ Step 6: Chat logic with memory
# Running transcript of the session as {"role": ..., "content": ...} dicts,
# alternating "user" and "assistant" entries.
chat_history = []

def chat_with_mistral(user_input):
    """Generate the assistant's reply to *user_input* and record the exchange.

    The prompt sent to the model is made of the system prompt, up to the last
    four previous messages, the retrieved knowledge-base context, and finally
    the current user message.

    Args:
        user_input: The text the user just typed.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Side effects:
        Appends the user message and the reply to ``chat_history`` once the
        model call has succeeded, so a failed call leaves the history unchanged.
    """
    # Format the PRIOR turns before recording the new message. Appending first
    # would make the current question appear twice in the prompt: once in the
    # history section and once as the final "User:" line.
    formatted_history = "\n".join(
        f"{turn['role'].capitalize()}: {turn['content']}" for turn in chat_history[-4:]
    )

    # Get top matching context
    context = retrieve_context(user_input)

    # Build the complete prompt
    full_prompt = f"""<s>[INST] <<SYS>>
{base_system_prompt}
<</SYS>>

{formatted_history}

[CONTEXT START]
{context}
[CONTEXT END]

User: {user_input} [/INST]"""

    # Get response from LLM
    response = llm(full_prompt, max_tokens=768, temperature=0.7, top_p=0.9, stop=["</s>"])
    reply = response["choices"][0]["text"].strip()

    # Record the completed exchange only now, keeping user/assistant turns paired.
    chat_history.append({"role": "user", "content": user_input})
    chat_history.append({"role": "assistant", "content": reply})
    return reply

# ✅ Step 7: CLI interface
# Simple read-reply loop: reads a line, sends it to chat_with_mistral(), prints
# the reply, and stops on "exit"/"quit" (or end of input / Ctrl-C).
print(f"{Fore.RED}======== 🧠 Vortexa AI (type 'exit' to quit) ========{Fore.RESET}\n")

while True:
    try:
        # Strip so that " exit " still quits and stray whitespace is not sent on.
        user_input = input(f"{Fore.BLUE}You: {Fore.RESET}").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C: finish the prompt line and leave via the normal
        # goodbye path instead of crashing with a traceback.
        print()
        user_input = "exit"

    if not user_input:
        # Nothing typed: ask again rather than querying the model with "".
        continue

    if user_input.lower() in ("exit", "quit"):
        print("👋 Goodbye!")
        # Reset the colour afterwards so the terminal is not left red on exit.
        print(f"{Fore.RED}Vortexa AI is still learning. Check: https://vortexa-ai-showcase.lovable.app/{Fore.RESET}")
        break

    reply = chat_with_mistral(user_input)
    print(f"{Fore.BLUE}Vortexa:{Fore.RESET}", reply)
    print()


llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Volumes/T9 1/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:             


