In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from transformers.utils import logging
logging.set_verbosity_info()


In [None]:

model_id = "meta-llama/Llama-3.3-70B-Instruct"
BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    token=""
)



loading configuration file config.json from cache at C:\Users\visionary\.cache\huggingface\hub\models--meta-llama--Llama-3.3-70B-Instruct\snapshots\6f6073b423013f6a7d4d9f39144961bfbfbc386b\config.json
Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.3-70B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 8.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [1]:
import os
import torch
from accelerate import init_empty_weights, infer_auto_device_map
from transformers import AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-3.3-70B-Instruct"

quant_config = BitsAndBytesConfig(
    load_in_8bit=True,  # or load_in_4bit=True
    llm_int8_enable_fp32_cpu_offload=True,  # Only needed for 8-bit
)
config = AutoConfig.from_pretrained(model_id)

# 1) Create an empty model with the config
with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_config(config)

# 2) Decide how much GPU/CPU memory you want to allow
#    Adjust these strings to match your actual resources
max_memory = {
    "cpu": "64GiB",       # If you have enough system RAM to offload large chunks
    0: "24GiB",    # If your GPU has 24GB VRAM
}

# 3) Infer an auto device map based on those memory constraints
device_map = infer_auto_device_map(
    empty_model, 
    max_memory=max_memory, 
    no_split_module_classes=["LlamaDecoderLayer"]  # typical for Llama-based models
)

# 4) Now load the real model with that device map
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    device_map=device_map,
    torch_dtype=torch.bfloat16,  # or float16
    quantization_config=quant_config,
    token="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"
)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 30/30 [00:09<00:00,  3.23it/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "meta-llama/Llama-3.2-1B-Instruct"

# 1) Choose quantization config (8-bit or 4-bit).
#    For 1B, 8-bit is likely fine even on a modest GPU, but 4-bit is also an option.
quant_config = BitsAndBytesConfig(load_in_8bit=True)  # or load_in_4bit=True

# 2) Load the model with the quant config.
#    Using device_map="auto" so it places model on GPU if enough memory is available,
#    else partial offload to CPU. If you only have a small GPU, you can skip offload
#    because 1B is tiny compared to 70B.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,       # or bfloat16 if supported
    quantization_config=quant_config,
    token="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"            # if needed
)


In [21]:

# 3) Load the matching tokenizer.
#    "meta-llama/Llama-3.2-1B-Instruct" might have a built-in tokenizer if they provided it,
#    else you can use a known Llama tokenizer if they share the same vocab.
tokenizer = AutoTokenizer.from_pretrained(model_id, token="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")

# 4) Test generating a short response
prompt = "Explain the benefits of solar energy in simple terms."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Explain the benefits of solar energy in simple terms. Here's how it works:

**Solar Energy: A Simple Explanation**

Solar energy is the power that comes from the sun. It's like a super-efficient power source that uses sunlight to generate electricity. Here's how it works:

1. **Sunlight hits the Earth**: The sun's rays strike the Earth's surface, heating up the atmosphere and oceans.
2. **Solar panels convert sunlight**: Special devices called solar panels absorb the sunlight and convert it into electricity.
3. **Electricity is generated**: The electricity generated by the solar panels is sent through a wire to a device called an inverter, which converts the DC (direct current) electricity into AC (alternating current) electricity.
4. **Electricity is used**: The AC electricity is fed into the electrical grid and used to power homes, businesses, and industries.

**Benefits of Solar Energy:**

1. **Zero Emissions**: Solar energy produces no greenhouse gas emissions, which reduces our re

In [4]:
from transformers import pipeline
from langchain_huggingface import HuggingFacePipeline


# 1) Create a HF Transformers pipeline from your model & tokenizer
generate_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=100,     # Tweak as needed
    temperature=0.7,
    top_p=0.9,
    do_sample=True
)

# 2) Wrap pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=generate_pipeline)

# Now llm is an object you can pass to many LangChain chains


Device set to use cuda:0


In [18]:

from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name=r"D:\Anand\Jstore_Ai\usecase1\output\sbert_finetuned",
    model_kwargs={"device": "cuda"}
)


In [25]:
from langchain_community.vectorstores import FAISS
product_store_path = r"D:\Anand\Jstore_Ai\usecase1\output\product_vector_store"  # Path from your config
faiss_store = FAISS.load_local(
    product_store_path, 
    embeddings,
    allow_dangerous_deserialization=True
)


In [2]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from langchain_huggingface import (
    HuggingFacePipeline,
    HuggingFaceEmbeddings
)
from langchain.chains import RetrievalQA, LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS

#################################
# 1) Load the Llama Model (1B)
#################################
model_id = "meta-llama/Llama-3.2-1B-Instruct"

quant_config = BitsAndBytesConfig(load_in_8bit=True)  # or load_in_4bit=True
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",         # places model on GPU if enough memory, else partial offload
    torch_dtype=torch.float16,  # or torch.bfloat16 if your GPU supports it
    quantization_config=quant_config,
    token="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"
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token="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"
)

# (Optional) Save the downloaded model locally for reuse:
save_dir = "D:/local_llama_1B_instruct"
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Llama model saved locally to: {save_dir}")


  from .autonotebook import tqdm as notebook_tqdm


Llama model saved locally to: D:/local_llama_1B_instruct


In [1]:
save_dir = r"D:\Anand\Jstore_Ai\usecase1\output\BGI-llama"

In [5]:
from loguru import logger 
import os
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from langchain_huggingface import (
    HuggingFacePipeline,
    HuggingFaceEmbeddings
)
from langchain.chains import RetrievalQA

from langchain_community.vectorstores import FAISS


In [6]:
def initialize_llm(save_dir: str):
    """
    Initialize the LangChain LLM by loading the saved Llama model and tokenizer,
    creating a text-generation pipeline, and wrapping it into a HuggingFacePipeline.
    
    Args:
        save_dir (str): Directory where the model and tokenizer are saved.
    
    Returns:
        HuggingFacePipeline: Wrapped LangChain pipeline for text generation.
    """
    try:
        logger.info(f"Initializing LangChain LLM from saved directory: {save_dir}")
        
        if not os.path.exists(save_dir):
            raise FileNotFoundError(f"The directory {save_dir} does not exist.")
        
        # Load the model
        logger.info("Loading the model...")
        model = AutoModelForCausalLM.from_pretrained(
            save_dir,
            device_map="auto",           # Automatically maps the model to available devices
            torch_dtype=torch.float16    # Ensure the dtype matches what was saved
        )
        logger.info("Model loaded successfully.")
        
        # Load the tokenizer
        logger.info("Loading the tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(save_dir)
        logger.info("Tokenizer loaded successfully.")
        
        # Define pipeline parameters within the function
        max_new_tokens = 200
        temperature = 0.7
        top_p = 0.9
        do_sample = True
        
        # Create the text-generation pipeline
        logger.info("Creating text-generation pipeline.")
        llama_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample
        )
        logger.info("Text-generation pipeline created successfully.")
        
        # Wrap the pipeline into LangChain's HuggingFacePipeline
        logger.info("Wrapping pipeline into LangChain HuggingFacePipeline.")
        llm = HuggingFacePipeline(pipeline=llama_pipeline)
        logger.info("LangChain HuggingFacePipeline initialized successfully.")

        # Inspect pad_token_id and eos_token_id
        print(f"Pad Token ID before setting: {tokenizer.pad_token_id}")
        print(f"EOS Token ID: {tokenizer.eos_token_id}")
        
        return llm
    except Exception as e:
        logger.error(f"Failed to initialize LangChain LLM: {e}")
        raise e
    

In [7]:
save_dir = r"D:\Anand\Jstore_Ai\usecase1\output\BGI-llama"
llm = initialize_llm(save_dir)


[32m2025-01-28 17:19:42.059[0m | [1mINFO    [0m | [36m__main__[0m:[36minitialize_llm[0m:[36m13[0m - [1mInitializing LangChain LLM from saved directory: D:\Anand\Jstore_Ai\usecase1\output\BGI-llama[0m
[32m2025-01-28 17:19:42.059[0m | [1mINFO    [0m | [36m__main__[0m:[36minitialize_llm[0m:[36m19[0m - [1mLoading the model...[0m
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
[32m2025-01-28 17:19:43.531[0m | [1mINFO    [0m | [36m__main__[0m:[36minitialize_llm[0m:[36m25[0m - [1mModel loaded successfully.[0m
[32m2025-01-28 17:19:43.532[0m | [1mINFO    [0m | [36m__main__[0m:[36minitialize_llm[0m:[36m28[0m - [1mLoading the tokenizer...[0m
[32m2025-01-28 17:19:43.876[0m | [1mINFO    [0m | [36m__main__[0m:[36minitialize_llm[0m:[36m30[0m - [1mTokenizer loaded successfully.[0m
[32m2025-01-28 17:19:43.876[0m | [1mINFO    

Pad Token ID before setting: None
EOS Token ID: 128001


In [8]:


#################################
# 2) Load SBERT Embeddings
#################################
# This references your local directory with the fine-tuned SBERT
# which has config.json, modules.json, model.safetensors, etc.
sbert_path = r"D:\Anand\Jstore_Ai\usecase1\output\sbert_finetuned"
embeddings = HuggingFaceEmbeddings(
    model_name=sbert_path,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}
)

#################################
# 3) Load FAISS Vector Store
#################################
product_store_path = r"D:\Anand\Jstore_Ai\usecase1\output\product_vector_store"
faiss_store = FAISS.load_local(
    folder_path=product_store_path,
    embeddings=embeddings,
    allow_dangerous_deserialization=True  # because it uses pickle
)



In [9]:
from langchain_core.prompts import PromptTemplate

# Define the map prompt
map_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You have the following chunk of data (could be a product):
{context}

User question: {question}

- Summarize any relevant items here, product id and name. 
- If nothing is relevant, say so.
"""
)

# Define the reduce prompt
reduce_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template="""
We have partial answers from multiple chunks:
{summaries}

Combine them into a single, cohesive answer to: "{question}"

Requirements:
1) Only provide the product IDs of relevant items.
2) If no relevant items are found, say "No relevant items found."
"""
)


def create_qa_chain(llm, vector_store):
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="map_reduce",
        retriever=vector_store.as_retriever(search_kwargs={"k": 20}),
        return_source_documents=True,
        chain_type_kwargs={
            "question_prompt": map_prompt,
            "combine_prompt": reduce_prompt
        }
    )
    logger.info("map_reduce RetrievalQA chain created successfully.")
    return chain

qa_chain = create_qa_chain(llm, faiss_store)


[32m2025-01-28 17:19:51.622[0m | [1mINFO    [0m | [36m__main__[0m:[36mcreate_qa_chain[0m:[36m44[0m - [1mmap_reduce RetrievalQA chain created successfully.[0m


In [10]:


def semantic_search_tool(query):
    """
    Calls qa_chain.invoke(query) to run the map_reduce retrieval QA and returns both the LLM result and product IDs.
    """
    max_retries = 5
    for attempt in range(max_retries):
        try:
            # Run the query through the QA chain
            response = qa_chain.invoke(query)

            # Check if the response is a dictionary
            if isinstance(response, dict):
                llm_result = response.get("result", "No result from LLM.")
                source_docs = response.get("source_documents", [])
            else:
                llm_result = response
                source_docs = []

            # Handle case where no documents are found
            if not source_docs:
                return f"**LLM Result:** {llm_result}\n\nNo relevant product IDs found."

            # Extract product IDs from source documents
            product_ids = [doc.metadata.get("product_id", "N/A") for doc in source_docs]

            # Return both the LLM result and matching product IDs
            return f"**LLM Result:** {llm_result}\n\n**Matching Product IDs:** {', '.join(product_ids)}"

        except Exception as e:
            logger.error(f"Attempt {attempt + 1} - Error: {e}")
            if "429" in str(e):
                sleep_time = 2 ** attempt
                logger.info(f"Rate limited. Retrying in {sleep_time} seconds...")
                time.sleep(sleep_time)
            else:
                return f"An error occurred: {e}"

    return "An error occurred after multiple attempts."


In [11]:
user_query = "show me some products under 2000"
print(f"Query: {user_query}\n")
answer = semantic_search_tool(user_query)
print(answer)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Query: show me some products under 2000



Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

KeyboardInterrupt: 