In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.26" trl peft accelerate bitsandbytes

from transformers import TextStreamer
from unsloth      import FastLanguageModel

In [4]:
!git clone https://huggingface.co/nehalahmedshaikh/lora_model
!git clone https://github.com/CS-5302/CS-5302-Project-Group-15.git

In [5]:
# --- Loader configuration -------------------------------------------------
# Maximum sequence length; any value works (RoPE scaling is handled internally).
max_seq_length = 2048
# None = auto-detect. Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# 4-bit quantization lowers memory usage; may be set to False.
load_in_4bit = True

# --- Model + tokenizer ----------------------------------------------------
# Gather the loader arguments in one mapping, then unpack them into the call.
load_kwargs = dict(
    model_name="lora_model",  # the model that was used for training
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

# Switch on Unsloth's native (2x faster) inference mode before generating.
FastLanguageModel.for_inference(model)

# Alpaca-style prompt template. The three `{}` slots are filled positionally:
# instruction, input (additional context), response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Build one prompt; the response slot is left empty so the model generates it.
demo_instruction = "i have the following symptoms, what disease do i have?"
demo_input = "hoarse voice, sore throat"
demo_prompt = alpaca_prompt.format(demo_instruction, demo_input, "")

# Tokenize the single-prompt batch as PyTorch tensors and move it to the GPU.
inputs = tokenizer([demo_prompt], return_tensors="pt").to("cuda")

# Generate with the options below, then decode; the decoded list is the last
# expression of the cell and therefore its displayed output.
generation_kwargs = dict(max_new_tokens=64, use_cache=True)
outputs = model.generate(**inputs, **generation_kwargs)
tokenizer.batch_decode(outputs)

In [6]:
# Streamed generation: a TextStreamer built on the tokenizer is handed to
# generate() so the text is emitted while it is being produced. Reuses the
# `inputs` tokenized in the previous cell.
text_streamer = TextStreamer(tokenizer)
stream_kwargs = dict(streamer=text_streamer, max_new_tokens=128)
# The result is bound to `_` so the cell does not also display the raw tensor.
_ = model.generate(**inputs, **stream_kwargs)