In [1]:
    # @title Setup: Install Core Libraries

    # Upgrade pip first for better resolving
    !pip install --upgrade pip

    # Install only the core requirements
    # Let pip resolve numpy, pydantic, fastapi, websockets etc. in the clean environment
    print("Installing Gradio, CTransformers, Transformers...")
    !pip install -q "gradio>=4.29.0" # Use a recent version, likely compatible with defaults
    !pip install -q "transformers>=4.38.0,<5.0.0"
    !pip install -q ctransformers[cuda]>=0.2.27
    !pip install -q psutil # Keep for RAM display

    # Check key versions after install (optional, for debugging if needed)
    print("\n--- Checking Key Versions After Install ---")
    !pip show gradio transformers ctransformers numpy pydantic fastapi websockets || echo "Some packages not found during check."

    print("\n" + "-" * 50)
    print("Installations complete. Proceed to the next cell.")
    print("(No restart needed after factory reset unless install errors occur).")
    print("-" * 50)

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1
Installing Gradio, CTransformers, Transformers...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m134.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m161.6 MB/s[0m eta [36m0:00:00[0m
[?25h
--- Checking Key Versions After Install ---
Name: gradio
Version: 5.25.2
Summary: Python library for easily interacting with trained machine learning models
Home-page: https://github.com/gradio-app/gradio
Author: 
Author-email: Abubakar Abid <g

In [2]:
# Mount Google Drive to access the GGUF model file
from google.colab import drive
import os

# Drive is mounted right here, in this cell, so that files under
# /content/drive are available to the cells that follow. If the runtime is
# restarted, re-run this cell before running the model-loading cell.

# Attempt the mount; report success only when drive.mount() returned without raising.
try:
    drive.mount('/content/drive')
except Exception as mount_error:
    # Surface the failure and the most likely remedy instead of a raw traceback.
    print(f"Error mounting Google Drive: {mount_error}")
    print("Please ensure you authorize Google Drive access.")
else:
    print("Google Drive mounted successfully.")

Mounted at /content/drive
Google Drive mounted successfully.


In [3]:
    # @title Hugging Face Login
    from huggingface_hub import notebook_login
    import os

    # Option A (disabled): token from Colab Secrets — preferable if the notebook is shared,
    # because no token is typed into or stored in the notebook itself.
    # To use it, store a secret named HF_TOKEN in the Colab Secrets Manager
    # (left sidebar, key icon) and uncomment the block below.
    # NOTE(review): confirm how userdata.get() behaves when the secret is missing
    # before relying on the else-branch fallback.
    # from google.colab import userdata
    # HF_TOKEN = userdata.get('HF_TOKEN')
    # if HF_TOKEN:
    #   print("Using HF Token from Colab Secrets.")
    #   os.environ['HF_TOKEN'] = HF_TOKEN
    # else:
    #   print("HF_TOKEN secret not found. Using notebook_login().")
    #   notebook_login()

    # Option B (active): interactive login via notebook_login() (easier for personal use).
    # NOTE(review): the final message is printed as soon as notebook_login() returns;
    # nothing here checks that authentication actually succeeded.
    print("Attempting Hugging Face login...")
    notebook_login()
    print("Login process initiated.")


Attempting Hugging Face login...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Login process initiated.


In [4]:
# @title Configuration and Model Loading

import torch
import gradio as gr
from transformers import AutoTokenizer
from ctransformers import AutoModelForCausalLM
import psutil

# --- Configuration ---
# Every tunable value used by the later cells lives here so that a fresh
# "Restart & Run All" defines all of them in one place.

# *** IMPORTANT: ADJUST THIS PATH if your GGUF file is located elsewhere in Google Drive ***
GGUF_MODEL_PATH = "/content/drive/MyDrive/mistral_finetuning/mistral-7b-jn-Q4_K_M.gguf"
TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" # Use the original base model tokenizer

# Fallback system prompt. The chat cell reads this both as the initial value of the
# "System Prompt" textbox and as the default when that textbox is left empty; it was
# previously not defined by any cell, which made the chat cell fail on a fresh kernel.
DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant specialized in fitness and health."

# CTransformers configuration for Colab GPU
N_GPU_LAYERS = 50  # Number of layers to offload to GPU. Increase if you have a powerful GPU (e.g., T4, V100). Set to 0 for CPU-only.
N_CTX = 2048      # Context length. Should match the model's capabilities.

# Check for GPU: DEVICE is only used for log messages; N_GPU_LAYERS is what
# actually controls offloading when the model is loaded.
if torch.cuda.is_available():
    print("CUDA (GPU) is available. Offloading layers to GPU.")
    DEVICE = "cuda"
else:
    print("CUDA (GPU) not available. Running on CPU (will be slower).")
    DEVICE = "cpu"
    N_GPU_LAYERS = 0 # Force CPU if no GPU

# --- Verify Model Path ---
# Fail fast with actionable guidance when the GGUF file is not where the
# configuration says it is, rather than letting the loader fail later.
if os.path.exists(GGUF_MODEL_PATH):
    print(f"Found GGUF model file at: {GGUF_MODEL_PATH}")
else:
    _divider = "-" * 50
    _missing_model_report = (
        _divider,
        f"Error: GGUF model file not found at '{GGUF_MODEL_PATH}'",
        "Please ensure:",
        "1. The file exists in your Google Drive at the specified path.",
        "2. Google Drive was mounted successfully in the previous step.",
        "3. You have updated the 'GGUF_MODEL_PATH' variable correctly if needed.",
        _divider,
    )
    for _report_line in _missing_model_report:
        print(_report_line)
    # Abort the cell: nothing below can work without the model file.
    raise FileNotFoundError(f"GGUF file not found: {GGUF_MODEL_PATH}")

# --- Load GGUF Model (ctransformers) ---
print("Loading GGUF model...")

# Loader options gathered in one place. The original author also left two
# optional from_pretrained flags disabled (local_files_only, hf); they stay unset.
_gguf_load_kwargs = dict(
    model_type="mistral",        # the model type must be stated explicitly
    gpu_layers=N_GPU_LAYERS,     # 0 means CPU-only (set by the GPU check above)
    context_length=N_CTX,
)

try:
    model = AutoModelForCausalLM.from_pretrained(GGUF_MODEL_PATH, **_gguf_load_kwargs)
    print(f"GGUF model loaded successfully. Offloaded {model.config.gpu_layers} layers to {DEVICE}.")
except Exception as load_error:
    print(f"Fatal Error: Could not load GGUF model: {load_error}")
    print("Check the GGUF file path, integrity, model_type, and ctransformers installation.")
    # Propagate the failure so the notebook stops here.
    raise load_error

# --- Load Tokenizer (transformers) ---
print(f"Loading tokenizer: {TOKENIZER_NAME}")
try:
    # trust_remote_code is intentionally NOT enabled: it allows Python code shipped
    # in the Hub repository to run locally. If TOKENIZER_NAME is ever pointed at a
    # repository that genuinely needs custom code, opt in explicitly after reviewing it.
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
    if tokenizer.pad_token is None:
        # No dedicated padding token is configured; reuse EOS for padding.
        print("Setting pad_token to eos_token.")
        tokenizer.pad_token = tokenizer.eos_token
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Fatal Error: Could not load tokenizer: {e}")
    raise  # Re-raise the active exception unchanged so the notebook stops here

CUDA (GPU) is available. Offloading layers to GPU.
Found GGUF model file at: /content/drive/MyDrive/mistral_finetuning/mistral-7b-jn-Q4_K_M.gguf
Loading GGUF model...
GGUF model loaded successfully. Offloaded 50 layers to cuda.
Loading tokenizer: mistralai/Mistral-7B-Instruct-v0.3


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Setting pad_token to eos_token.
Tokenizer loaded successfully.


In [16]:
    # @title Inspect Model __call__ Method
    try:
        # help() writes the documentation to the output itself and returns None,
        # so it is called directly; wrapping it in print() appended a stray "None".
        help(model.__call__)
    except NameError:
        # Raised when `model` has not been created in this kernel yet.
        print("Error: The 'model' object does not seem to be loaded yet.")
        print("Please ensure the previous cell successfully loaded the model.")
    except AttributeError:
        print("Error: The loaded 'model' object does not have a '__call__' method (highly unusual).")


Help on method __call__ in module ctransformers.llm:

__call__(prompt: str, *, max_new_tokens: Optional[int] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None, repetition_penalty: Optional[float] = None, last_n_tokens: Optional[int] = None, seed: Optional[int] = None, batch_size: Optional[int] = None, threads: Optional[int] = None, stop: Optional[Sequence[str]] = None, stream: Optional[bool] = None, reset: Optional[bool] = None) -> Union[str, Generator[str, NoneType, NoneType]] method of ctransformers.llm.LLM instance
    Generates text from a prompt.
    
    Args:
        prompt: The prompt to generate text from.
        max_new_tokens: The maximum number of new tokens to generate. Default: `256`
        top_k: The top-k value to use for sampling. Default: `40`
        top_p: The top-p value to use for sampling. Default: `0.95`
        temperature: The temperature to use for sampling. Default: `0.8`
        repetition_penalty: The

In [21]:
# @title Gradio Interface and Chat Function

import gradio as gr
import psutil
import os # Ensure os is imported if not already

# This cell uses `model`, DEFAULT_SYSTEM_PROMPT and N_CTX; all three must already
# be defined (by the configuration / model-loading cell) before it is run.

def get_ram_usage():
    """Return the resident set size of the current Python process as display text.

    The value is the process RSS divided by 1024**3, formatted with two
    decimals, e.g. "RAM Used: 1.50 GB".
    """
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_gb = rss_bytes / (1024**3)
    return f"RAM Used: {rss_gb:.2f} GB"

def _history_to_pairs(history):
    """Normalize Gradio chat history into a list of (user_text, assistant_text) pairs.

    Supports the legacy pair format ``[[user, assistant], ...]`` as well as the
    "messages" format ``[{"role": ..., "content": ...}, ...]``. Missing or ``None``
    texts become empty strings so they never render as the literal text "None".
    """
    pairs = []
    pending_user = None
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content") or ""
            if role == "user":
                if pending_user is not None:
                    # Previous user turn never received an answer.
                    pairs.append((pending_user, ""))
                pending_user = content
            elif role == "assistant":
                pairs.append((pending_user or "", content))
                pending_user = None
            # Any other role is ignored; the system prompt is passed in separately.
        else:
            user_msg, model_answer = entry
            pairs.append((user_msg or "", model_answer or ""))
    if pending_user is not None:
        pairs.append((pending_user, ""))
    return pairs


def _build_mistral_prompt(message, history, system_prompt):
    """Render the conversation in the Mistral instruct format.

    The beginning-of-sequence marker ``<s>`` is emitted exactly once at the start;
    each completed turn is ``[INST] user [/INST] answer</s>``. The system prompt,
    when non-empty, is prepended to the first user message of the conversation.
    """
    segments = ["<s>"]
    is_first_turn = True
    for user_msg, model_answer in _history_to_pairs(history):
        if is_first_turn and system_prompt:
            user_msg = f"{system_prompt}\n{user_msg}"
        segments.append(f"[INST] {user_msg} [/INST] {model_answer}</s>")
        is_first_turn = False

    if is_first_turn and system_prompt:
        message = f"{system_prompt}\n{message}"
    segments.append(f"[INST] {message} [/INST]")
    return "".join(segments)


def chat_stream(message: str, history: list, system_prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.95, top_k: int = 40, repetition_penalty: float = 1.1):
    """Generates a streaming response using the loaded CTransformers GGUF model.

    Args:
        message: The new user message.
        history: Previous turns, either as (user, assistant) pairs or as
            role/content dictionaries.
        system_prompt: Instruction text prepended to the first user message;
            falls back to DEFAULT_SYSTEM_PROMPT when empty.
        max_new_tokens: Upper bound on generated tokens (cast to int).
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff (cast to int).
        repetition_penalty: Penalty applied to repeated tokens.

    Yields:
        The response text accumulated so far, once per streamed chunk. If
        generation fails, a final item with the partial text plus an error
        note is yielded instead of raising.
    """
    if not system_prompt:
        system_prompt = DEFAULT_SYSTEM_PROMPT

    # --- Prompt Formatting (Mistral Instruct) ---
    prompt = _build_mistral_prompt(message, history, system_prompt)

    print(f"\n--- Final Prompt for Model ---\n{prompt}\nLength: {len(prompt)}\n----------------------------\n")

    # --- Generation using model.__call__ with stream=True ---
    buffer = ""
    try:
        token_count = 0
        for token in model(
            prompt,
            # UI sliders may deliver floats; the integer-valued options are cast explicitly.
            max_new_tokens=int(max_new_tokens),
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
            repetition_penalty=repetition_penalty,
            stop=["</s>", "[INST]", "[/INST]"],
            stream=True,
        ):
            buffer += token
            token_count += 1
            yield buffer  # Yield the accumulating buffer for streaming effect

        print(f"\n--- Generated Response ({token_count} tokens) ---")
        print("------------------------------")

    except Exception as e:
        # Deliberate best-effort: show the failure in the chat instead of crashing the UI.
        print(f"Error during CTransformers model streaming: {e}")
        yield buffer + f"\n\n[Error during generation: {type(e).__name__}: {e}]"

# --- Build Gradio UI ---
print("Building Gradio interface...")

# Clickable starter prompts shown by the chat interface (plain data, no UI side effects).
_example_prompts = [
    ["Create a training program for strength gain focused on compound lifts."],
    ["What are some common mistakes people make when trying to bulk up?"],
    ["Explain the concept of progressive overload in simple terms."],
    ["Summarize the key points about nutrient timing for muscle recovery."],
    ["Give me tips for improving sleep quality for better athletic performance."],
]

with gr.Blocks(theme=gr.themes.Base()) as demo:
    # Read-only textbox whose value comes from get_ram_usage (every=10).
    ram_usage_display = gr.Textbox(
        label="Resource Usage",
        value=get_ram_usage,
        interactive=False,
        every=10,
    )

    gr.Markdown(
        """
        # Mistral-7B Instruct (Fine-tuned GGUF on Colab)
        Chat with your model running via CTransformers on a Colab GPU (hopefully!).
        Fine-tuned on YouTube transcripts (fitness & health focus).
        """
    )

    # Extra controls, listed in the same order as chat_stream's parameters after
    # (message, history). They are created here, right before the ChatInterface,
    # so component creation order is unchanged.
    _generation_controls = [
        gr.Textbox(label="System Prompt", value=DEFAULT_SYSTEM_PROMPT, lines=2),
        # The slider's upper bound follows the configured context length.
        gr.Slider(minimum=32, maximum=N_CTX, step=32, value=512, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.7, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, step=0.05, value=0.95, label="Top-P"),
        gr.Slider(minimum=10, maximum=100, step=5, value=40, label="Top-K"),
        gr.Slider(minimum=1.0, maximum=1.5, step=0.05, value=1.1, label="Repetition Penalty"),
    ]

    # ChatInterface wires chat_stream to a ready-made chat layout.
    chatbot = gr.ChatInterface(
        fn=chat_stream,
        additional_inputs=_generation_controls,
        title="Mistral-7B YouTube Finetuned GGUF Chat",
        description="Chat with the Mistral-7B model fine-tuned on YouTube transcripts (GGUF/ctransformers).",
        theme="soft",
        examples=_example_prompts,
        cache_examples=False,
    )

Building Gradio interface...


  self.chatbot = Chatbot(


In [None]:
# @title Launch Gradio App
print("Launching Gradio interface...")
# With debug=True, launch() keeps this Colab cell running until it is interrupted,
# so the user hint is printed BEFORE launching; a print placed after launch()
# would only appear once the server has already stopped.
print("Click the public link below once it appears (if share=True).")
# queue() helps manage multiple users if share=True
# share=True generates a public link (expires). Use False for local Colab access only.
demo.queue().launch(share=True, debug=True) # Enable debug for more detailed errors in console
print("Gradio server stopped.")

Launching Gradio interface...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1b2e7f6efdbf1219dc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



--- Final Prompt for Model ---
<s>[INST] You are a helpful assistant specialized in fitness and health.
Create a training program for strength gain focused on compound lifts. [/INST]
Length: 151
----------------------------


--- Generated Response (512 tokens) ---
------------------------------

--- Final Prompt for Model ---
<s>[INST] You are a helpful assistant specialized in fitness and health.
Create a training program for strength gain focused on compound lifts. [/INST] 

Here is a 12-week training program focused on increasing overall strength through compound movements:

**Week 1 - 4: Hypertrophy Phase**

*Day 1: Lower Body*
1. Barbell Squat (3 sets x 8-12 reps)
2. Leg Press (3 sets x 12-15 reps)
3. Romanian Deadlift (3 sets x 8-12 reps)
4. Hamstring Curl (3 sets x 12-15 reps)
5. Calf Raise (3 sets x 15-20 reps)

*Day 2: Bench Press*
1. Bench Press (3 sets x 8-12 reps)
2. Incline Dumbbell Press (3 sets x 12-15 reps)
3. Cable Flyes (3 sets x 12-15 reps)
4. Push-ups (3 sets x 10