#### GPT2

In [None]:
# GPT2 : downloaded via wget (login not needed)
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model from the local folder
model = AutoModelForCausalLM.from_pretrained("../../../models/gpt2")
tokenizer = AutoTokenizer.from_pretrained("../../../models/gpt2")

# Set device for Apple Silicon
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)

# Example text generation
text = "What is the capital of Paris?"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\nGenerated Text:\n", generated_text)

### Mistral

In [None]:
# mistral: download via terminal login
# first download it for offline use 

#Download for offline use
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Download model & tokenizer
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Save them locally
model.save_pretrained("./models/Mistral-7B-Instruct")
tokenizer.save_pretrained("./models/Mistral-7B-Instruct")

print("Model and tokenizer downloaded and saved locally.")

In [None]:
# run the model offline 
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model from local storage
# model = AutoModelForCausalLM.from_pretrained("../../../models/Mistral-7B-Instruct", torch_dtype="auto")
# Load model in 16-bit (FP16) instead of full precision
model = AutoModelForCausalLM.from_pretrained("../../../models/Mistral-7B-Instruct", torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained("../../../models/Mistral-7B-Instruct")

print("step1")
# Set device (M1/M2 Macs use 'mps', fallback to 'cpu' if not available)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model.to(device)

print("step2")
# Example text generation
text = "Tell me a fun fact about space."
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=100)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("step3")
print("\nGenerated Text:\n", generated_text)

##### with GGUF + LLAMA


In [3]:
from llama_cpp import Llama

# model_path = "/Users/firaterman/Documents/fer/models/Mistral-7B-Instruct/GGUF/Mistral-7B-Instruct-v0.3-Q6_K.gguf"
# model_path = "/Users/firaterman/Documents/fer/models/Mistral-7B-Instruct/GGUF/Llama-3.3-70B-Instruct-Q2_K.gguf"
model_path = "/Users/firaterman/Documents/fer/models/Mistral-7B-Instruct/GGUF/Nous-Hermes-13B.Q4_K_M.gguf"
llm = Llama(model_path=model_path)

output = llm("You are a teacher, you need to answer to my question in a short and concise sentence. Respond to this question: what is the capital of Japan?")

response = output["choices"][0]["text"].strip()  # Store the response as a string
print(response)  # Print or use it in your code


llama_model_load_from_file_impl: using device Metal (Apple M1 Pro) - 10922 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from /Users/firaterman/Documents/fer/models/Mistral-7B-Instruct/GGUF/Nous-Hermes-13B.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = nousresearch_nous-hermes-13b
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: 

A: The capital of Japan is Tokyo.


In [2]:
response

'The capital of Japan is Tokyo.'

In [7]:
from llama_cpp import Llama

# model_path = "/Users/firaterman/Documents/fer/models/Mistral-7B-Instruct/GGUF/Mistral-7B-Instruct-v0.3-Q6_K.gguf"
# model_path = "/Users/firaterman/Documents/fer/models/models/Mistral-7B-Instruct/GGUF/Llama-3.3-70B-Instruct-Q2_K.gguf"
model_path = "/Users/firaterman/.lmstudio/models/lmstudio-community/gemma-3-12b-it-GGUF/gemma-3-12b-it-Q4_K_M.gguf"
llm = Llama(model_path=model_path)

output = llm("You are a teacher, you need to answer to my question in a short and concise sentence. Respond to this question: what is the capital of Japan?")

response = output["choices"][0]["text"].strip()  # Store the response as a string
print(response)  # Print or use it in your code


llama_model_load_from_file_impl: using device Metal (Apple M1 Pro) - 10915 MiB free
llama_model_loader: loaded meta data with 40 key-value pairs and 626 tensors from /Users/firaterman/.lmstudio/models/lmstudio-community/gemma-3-12b-it-GGUF/gemma-3-12b-it-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 3 12b It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-3
llama_model_loader: - kv   5:                         general.size_label str              = 12B
llama_model_l

ValueError: Failed to load model from file: /Users/firaterman/.lmstudio/models/lmstudio-community/gemma-3-12b-it-GGUF/gemma-3-12b-it-Q4_K_M.gguf

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Check if CUDA is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model from local storage in FP16 for CUDA
model_path = "../../../models/Mistral-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use FP16 for better performance
    device_map="auto"  # Automatically map model to GPU (if available)
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Step 1: Model Loaded")

# Move model to the appropriate device (if "auto" isn't used)
model.to(device)

print("Step 2: Model moved to", device)

# Example text generation
text = "Tell me a fun fact about space."
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate output
with torch.no_grad():  # Disable gradients for inference
    outputs = model.generate(**inputs, max_length=100)

# Decode and print generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Step 3: Generation complete")
print("\nGenerated Text:\n", generated_text)


In [3]:
import torch
print(torch.cuda.is_available())  # Should print True if CUDA is detected
print(torch.cuda.device_count())  # Number of available GPUs
print(torch.cuda.get_device_name(0))  # Name of the GPU


False
0


AssertionError: Torch not compiled with CUDA enabled

In [None]:

T
# Check if CUDA is available and set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

his step is giving me the error: requires the protobuf library but it was not found in your environment. checkout the instruction on the installation page
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
# Load model from local storage in FP16 for CUDA
model_path = "../../../models/Mistral-7B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use FP16 for better performance
    device_map="auto"  # Automatically map model to GPU (if available)
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load Phi-4 model from Hugging Face
model_name = "microsoft/Phi-4"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use FP16 for better performance
    device_map="auto"  # Automatically map model to GPU (if available)
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Example inference
text = "Tell me a fun fact about space."
inputs = tokenizer(text, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, max_length=100)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\nGenerated Text:\n", generated_text)


config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [2]:
0.43 *0.15 * 0.89

0.057405000000000005