<a href="https://colab.research.google.com/github/DhruvBhatia14/Large-Language-Models/blob/main/ZeroShotLlama3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Required Libraries for Zero-Shot LLM Inference

In [1]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes xformers datasets -q
!pip install triton

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.1/108.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.5/209.5 MB[0m [31m4.7 MB/s[0m eta [36m0:

## Loading and Preparing the Model for Zero-Shot Inference


In [4]:
from unsloth import FastLanguageModel
import torch

# --- Loader settings ---------------------------------------------------------
# Maximum sequence length handed to the loader (can go up to 8K tokens).
max_sequence_length = 2048
# None lets the loader pick the best dtype for the available CPU/GPU.
dtype = None
# Use 4-bit quantization to load a smaller, more efficient version of the model.
load_in_4bit = True

# Load the pre-trained Llama 3 checkpoint (8 billion parameters, quantized to
# 4-bit) together with its tokenizer, using the settings defined above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_sequence_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Prepare the model for zero-shot inference. Because this call is the last
# expression in the cell, the notebook also displays its return value.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.10.0: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-

## Defining the Prompt Template for Zero-Shot Inference

In [6]:
# Prompt template that gives the model a fixed structure to follow.
# It is assembled from four sections separated by a blank line; the three `{}`
# placeholders are filled, in order, with the instruction, the input and the
# response (left empty at inference time so the model writes it).
prompt = "\n\n".join(
    [
        "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
        "### Instruction:\n{}",
        "### Input:\n{}",
        "### Response:\n{}",
    ]
)

## Tokenizing Input and Generating a Response


In [7]:
# Define a clear instruction and the question for the model.
# NOTE: the question is stored in `user_input` rather than `input`, because a
# global named `input` would shadow Python's built-in input() function for the
# rest of the kernel session.
instruction = "You are a highly knowledgeable assistant skilled in providing clear and concise answers."
user_input = "What is Machine Learning?"

# Fill the prompt template (the response slot is left empty for the model to
# complete) and tokenize it.
# return_tensors='pt' returns the tokenized data as PyTorch tensors;
# .to('cuda') moves those tensors to the GPU for faster processing.
inputs = tokenizer([prompt.format(instruction, user_input, "")], return_tensors='pt').to('cuda')

# Generate a response for the prompt.
# max_new_tokens limits the model to at most 100 newly generated tokens.
outputs = model.generate(**inputs, max_new_tokens = 100)

# Decode the generated token IDs into a readable string and take the first
# (and only) sequence in the batch.
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a highly knowledgeable assistant skilled in providing clear and concise answers.

### Input:
What is Machine Learning?

### Response:
Machine learning is a field of computer science that uses statistical techniques to give computers the ability to "learn" (that is, progressively improve performance on a specific task) without being explicitly programmed.

### Explanation:
Machine learning is a field of computer science that uses statistical techniques to give computers the ability to "learn" (that is, progressively improve performance on a specific task) without being explicitly programmed.<|end_of_text|>


In [10]:
# Define a clear instruction and the question for the model.
# NOTE: the question is stored in `user_input` rather than `input`, because a
# global named `input` would shadow Python's built-in input() function for the
# rest of the kernel session.
instruction = "You are a highly knowledgeable assistant skilled in providing clear and concise answers."
user_input = "Which country produces the highest amount of coffee?"

# Fill the prompt template (the response slot is left empty for the model to
# complete) and tokenize it.
# return_tensors='pt' returns the tokenized data as PyTorch tensors;
# .to('cuda') moves those tensors to the GPU for faster processing.
inputs = tokenizer([prompt.format(instruction, user_input, "")], return_tensors='pt').to('cuda')

# Generate a response for the prompt.
# max_new_tokens limits the model to at most 100 newly generated tokens.
outputs = model.generate(**inputs, max_new_tokens = 100)

# Decode the generated token IDs into a readable string and take the first
# (and only) sequence in the batch.
response = tokenizer.batch_decode(outputs)[0]
print(response)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a highly knowledgeable assistant skilled in providing clear and concise answers.

### Input:
Which country produces the highest amount of coffee?

### Response:
The country that produces the most coffee is Brazil.<|end_of_text|>
