In [1]:
from openai import OpenAI
import os 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
import torch
from langchain_huggingface import HuggingFacePipeline

# OpenAI-compatible client aimed at a server listening on localhost:11434.
# NOTE(review): presumably a local Ollama instance, with "ollama" as a
# placeholder key rather than a real credential — confirm.
client = OpenAI(base_url="http://localhost:11434/v1/", api_key="ollama")

In [None]:
# Model and tokenizer setup
# The checkpoint identifier is read from the MODEL_ID environment variable so
# this cell works on a fresh kernel (Restart & Run All) without relying on a
# variable created outside the notebook.
model_id = os.environ.get("MODEL_ID")
if not model_id:
    raise RuntimeError(
        "MODEL_ID is not set. Export MODEL_ID (a Hugging Face model id or a "
        "local checkpoint path) before starting the kernel."
    )

# Configure 8-bit quantization (bitsandbytes) for the model weights.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the tokenizer; use_fast=False explicitly requests the non-fast tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# Load the causal LM with the quantization settings above; device_map='auto'
# leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map='auto',
)

In [None]:
# Text-generation settings, gathered in one place so they are easy to tune.
# NOTE(review): in Hugging Face generation, max_length bounds prompt tokens plus
# generated tokens together — if the intent is to cap only the output, consider
# max_new_tokens instead. Confirm before changing.
generation_settings = {
    "max_length": 256,
    "truncation": True,  # explicitly enable truncation
    "do_sample": True,
    "temperature": 0.6,
    "top_p": 0.95,
    "repetition_penalty": 1.2,
}

# Build the transformers pipeline around the already-loaded model and tokenizer.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the pipeline so it can be used through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Instruction-style prompt, assembled from adjacent string literals.
template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    "\n"
    "### Instruction:\n"
    "Suggest 2 ways to lose my weight.\n"
    "\n"
    "Answer:"
)

# Run the prompt through the LangChain wrapper and print the completion.
completion = llm.invoke(template)
print(completion)

In [None]:
# Two standalone prompts sent to the model in a single batch call.
prompt_1 = "Suggest 2 ways to lose my weight."
prompt_2 = "Tell me a joke"

llm_results = llm.generate([prompt_1, prompt_2])

# Display the text of the first generation at index 1
# (presumably the result for prompt_2 — verify against the LangChain LLMResult docs).
second_prompt_generation = llm_results.generations[1][0]
second_prompt_generation.text

In [None]:
# Call the transformers pipeline directly instead of going through `llm`.
# return_full_text=False is passed so the prompt is not echoed back.
pipeline_output = pipe(template, return_full_text=False)
generated_text = pipeline_output[0]['generated_text']
print("Pipeline Output:", generated_text)