In [1]:
# Install the build toolchain (cmake, compilers) and the Python packages used below.
!apt-get install -y cmake build-essential
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): only llama-cpp-python is pinned; consider pinning the other packages too.
%pip install -q llama-stack transformers sentencepiece gradio llama-cpp-python==0.2.61

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.4/37.4 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m101.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.

In [None]:
# List the models known to the `llama` CLI (optional; purely informational)
!llama model list --show-all

# Download the LLaMA 3.2 3B Instruct checkpoint from Meta
!llama model download --source meta --model-id Llama3.2-3B-Instruct


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1;34mDownloading checklist.chk[0m       [90m━━━━━━━━━━━[0m [35m100.0%[0m [32m209/209    [0m -         [36m0:00:00[0m
                                                   [32mbytes      [0m                  
[1;34mDownloading tokenizer.model[0m     [90m━━━━━━━━━━━[0m [35m100.0%[0m [32m2.2/2.2 MB [0m -         [36m0:00:00[0m
[1;34mDownloading params.json[0m         [90m━━━━━━━━━━━[0m [35m100.0%[0m [32m220/220    [0m -         [36m0:00:00[0m
                                                   [32mbytes      [0m                  
[2K[1A[2K[1A[2K[1A[2K[1A[2K[1A[2K[1;34mDownloading checklist.chk[0m       [90m━━━━━━━━━━━[0m [35m100.0%[0m [32m209/209    [0m -         [36m0:00:00[0m
                                                   [32mbytes      [0m                  
[1;34mDownloading tokenizer.model[0m     [90m━━━━━━━━━━━[0m 

In [None]:
# Copy the downloaded checkpoint files to a more accessible path.
# The originals stay under /root/.llama/checkpoints (this is a copy, not a move);
# cp is called without -r, so only the files matched by the glob are copied.
!mkdir -p /content/models/Llama3.2-3B-Instruct
!cp /root/.llama/checkpoints/Llama3.2-3B-Instruct/* /content/models/Llama3.2-3B-Instruct/


In [None]:
# Fetch and build llama.cpp under /content.
%cd /content
# Clone only when the checkout is missing so the cell can be re-run without errors;
# --depth 1 skips the repository history, which is not needed to build.
!test -d llama.cpp || git clone --depth 1 https://github.com/ggerganov/llama.cpp.git
%cd llama.cpp
# Configure into ./build, then compile using every available core.
!cmake -S . -B build
!cmake --build build --parallel $(nproc)


/content
Cloning into 'llama.cpp'...
remote: Enumerating objects: 52398, done.[K
remote: Counting objects: 100% (445/445), done.[K
remote: Compressing objects: 100% (279/279), done.[K
remote: Total 52398 (delta 336), reused 166 (delta 166), pack-reused 51953 (from 2)[K
Receiving objects: 100% (52398/52398), 125.11 MiB | 17.30 MiB/s, done.
Resolving deltas: 100% (37880/37880), done.
/content/llama.cpp
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_

In [None]:
# Download the Meta -> Hugging Face conversion script.
# -O makes a re-run overwrite the file instead of leaving a stale copy next to a new ".1" file.
!wget https://raw.githubusercontent.com/huggingface/transformers/main/src/transformers/models/llama/convert_llama_weights_to_hf.py -O convert_llama_weights_to_hf.py

# Convert the Meta checkpoint to Hugging Face format.
# --llama_version must match the checkpoint being converted (Llama 3.2 here, not 3).
# NOTE(review): the script may also offer an option for instruct-tuned checkpoints;
# check `python3 convert_llama_weights_to_hf.py --help` before relying on the tokenizer/chat template.
!python3 convert_llama_weights_to_hf.py \
  --input_dir /content/models/Llama3.2-3B-Instruct \
  --model_size 3B \
  --output_dir /content/models/Llama3.2-3B-Instruct-HF \
  --llama_version 3.2


In [None]:
# Use the converter shipped with the llama.cpp checkout cloned earlier in this notebook
# (/content/llama.cpp). It lives at the repository root, not under scripts/, so fetching
# it from a scripts/ URL with `wget -O` would leave an empty file over the real script.
# Running the in-repo copy also keeps it in sync with the bundled gguf-py package.

# Convert the Hugging Face checkpoint to a 16-bit float GGUF file
!python3 /content/llama.cpp/convert_hf_to_gguf.py \
  /content/models/Llama3.2-3B-Instruct-HF \
  --outfile /content/models/Llama3.2-3B-Instruct/llama-3.2b-instruct.gguf \
  --outtype f16


In [None]:
import os

import gradio as gr
from llama_cpp import Llama

# Path to the converted GGUF model
MODEL_PATH = "/content/models/Llama3.2-3B-Instruct/llama-3.2b-instruct.gguf"

# Do not request more CPU threads than the runtime actually has; the previous
# fixed value (8) is kept as the upper bound and as the fallback when the
# core count cannot be determined.
N_THREADS = min(8, os.cpu_count() or 8)

# Load the model, asking for every layer to be offloaded to the GPU (n_gpu_layers=-1).
# NOTE(review): offloading presumably needs a GPU-enabled build of llama-cpp-python —
# confirm the install cell produces one.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=N_THREADS,
    n_gpu_layers=-1,  # <-- use the whole GPU when one is available
)

# Chat callback: produces the assistant's reply to the user's message
def responder(mensaje, historia):
    """Generate the assistant's reply for one user message.

    Designed to be used as the ``fn`` of ``gr.ChatInterface``, which expects the
    callback to return the reply text and keeps the conversation history itself.

    Args:
        mensaje: The text the user just submitted.
        historia: Conversation history supplied by the chat UI. It is accepted
            for interface compatibility but neither read nor modified here.

    Returns:
        The generated reply as a stripped string.
    """
    prompt = f"{mensaje}\nAssistant:"
    # NOTE(review): stopping at the first newline limits replies to a single
    # line — confirm this is intended.
    respuesta = llm(prompt, max_tokens=256, stop=["\n", "</s>"])
    # Return only the reply text; the UI appends it to the history on its own.
    return respuesta["choices"][0]["text"].strip()

# Gradio chat UI wired to the `responder` callback
chatbot = gr.ChatInterface(
    responder,
    chatbot=gr.Chatbot(),
    textbox=gr.Textbox(lines=2, placeholder="Haz una pregunta..."),
    title="Asistente LLaMA 3.2 3B Instruct",
    examples=[
        "¿Qué es la computación cuántica?",
        "Explícame la Segunda Guerra Mundial",
    ],
    cache_examples=False,
)

# Start the app (original note: public mode on Colab)
chatbot.launch()
