# LangChain

## Preprocessing

In [1]:
import pandas as pd 

In [2]:
# Read the online retail dataset
data = pd.read_csv('amazon.csv')
df = data[:100].copy()
df.dropna(subset=['rating_count'], inplace=True)

df['sub_category'] = df['category'].astype(str).str.split('|').str[-1]
df['main_category'] = df['category'].astype(str).str.split('|').str[0]

In [3]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link', 'sub_category',
       'main_category'],
      dtype='object')

In [4]:
df1 = df.copy()
df1['product_name'] = df1['product_name'].str.lower() 
df1 = df1.drop_duplicates(subset=['product_name'])    # Remove duplicates based on 'product_name'

In [5]:
print(df.shape)
print(df1.shape)

(100, 18)
(100, 18)


### Check Product Name & Description

In [6]:
df1['product_name'][0]

'wayona nylon braided usb to lightning fast charging and data sync cable compatible for iphone 13, 12,11, x, 8, 7, 6, 5, ipad air, pro, mini (3 ft pack of 1, grey)'

In [7]:
df1['about_product']

0     High Compatibility : Compatible With iPhone 12...
1     Compatible with all Type C enabled devices, be...
2     【 Fast Charger& Data Sync】-With built-in safet...
3     The boAt Deuce USB 300 2 in 1 cable is compati...
4     [CHARGE & SYNC FUNCTION]- This cable comes wit...
                            ...                        
95    Supports 150Mbps Wireless data transmission ra...
96    Compatible with MI Smart TV 4A 32 inch LED TV ...
97    The cable comes with 3 Different pins allowing...
98    Fastest USB 3.0 and Gigabit solution ensure hi...
99    【Power Delivery Fast Charging】: Charge your iP...
Name: about_product, Length: 100, dtype: object

In [8]:
df2 = df1[['product_id','product_name', 'about_product','main_category','sub_category', 'actual_price','discount_percentage','rating','rating_count' ]]

In [9]:
df2.head()

Unnamed: 0,product_id,product_name,about_product,main_category,sub_category,actual_price,discount_percentage,rating,rating_count
0,B07JW9H4J1,wayona nylon braided usb to lightning fast cha...,High Compatibility : Compatible With iPhone 12...,Computers&Accessories,USBCables,"₹1,099",64%,4.2,24269
1,B098NS6PVG,ambrane unbreakable 60w / 3a fast charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers&Accessories,USBCables,₹349,43%,4.0,43994
2,B096MSW6CT,sounce fast phone charging cable & data sync u...,【 Fast Charger& Data Sync】-With built-in safet...,Computers&Accessories,USBCables,"₹1,899",90%,3.9,7928
3,B08HDJ86NZ,boat deuce usb 300 2 in 1 type-c & micro usb s...,The boAt Deuce USB 300 2 in 1 cable is compati...,Computers&Accessories,USBCables,₹699,53%,4.2,94363
4,B08CF3B7N1,portronics konnect l 1.2m fast charging 3a 8 p...,[CHARGE & SYNC FUNCTION]- This cable comes wit...,Computers&Accessories,USBCables,₹399,61%,4.2,16905


In [10]:
df2.to_csv('amazon_rag.csv', index=False)

### Load Document
- Document Loader	Data Type
- CSVLoader	CSV files
- DirectoryLoader	All files in a given directory
- Unstructured	Many file types (see https://docs.unstructured.io/platform/supported-file-types)
- JSONLoader	JSON files
- BSHTMLLoader	HTML files

In [11]:
# from langchain.document_loaders.csv_loader import CSVLoader
# from langchain_community.document_loaders import DirectoryLoader, TextLoader
# from langchain_community.document_loaders import NotionDirectoryLoader, NotionDBLoader

# loader = DirectoryLoader("../", glob="**/*.md", loader_cls=TextLoader)
# docs = loader.load()

In [12]:
from typing import List
from dotenv import load_dotenv
import os

from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain.schema.document import Document


# This will expose your Langchain api token as an environment variable
load_dotenv()

# def read_csv(file_path: str, source_column: str = "about_product") -> List[Document]:
def read_csv(file_path: str, source_column: str = "product_name") -> List[Document]:
    """Reads a CSV file and returns a list of Documents.

    Args:
        file_path (str): The path to the CSV file to read.
        source_column (str, optional): The name of the column in the CSV file that contains the text data. Defaults to "Description".

    Returns:
        List[Document]: A list of Documents, where each Document contains the text data from the corresponding row in the CSV file.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        IOError: If there is an error reading the CSV file.
    """

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    loader = CSVLoader(file_path=file_path, source_column=source_column)
    data = loader.load()

    return data



## Load Embedding Model 

### Access Token
- reference: https://huggingface.co/docs/hub/security-tokens

In [13]:
from langchain_huggingface import HuggingFaceEmbeddings
#reference: https://python.langchain.com/docs/integrations/providers/huggingface/#huggingfaceembeddings
# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = 'jhgan/ko-sroberta-nli'

# Function to load embeddings model
def load_embeddings_model(model_name: str) -> HuggingFaceEmbeddings:
    """Loads a Hugging Face Transformer model and returns an Embeddings object.

    Args:
        model_name (str): The name of the Hugging Face Transformer model to load.

    Returns:
        HuggingFaceEmbeddings: An Embeddings object that can be used to encode text into embeddings.
    """
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name,
        # huggingfacehub_api_token=os.environ["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"],
        model_kwargs={'device':'cpu'},
        encode_kwargs={'normalize_embeddings':True}
    )
    return embedding_function


#### Test Embeddings 

In [14]:
# Load the embeddings

# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = 'sentence-transformers/all-mpnet-base-v2'
# model_name = 'jhgan/ko-sroberta-nli'
embeddings = load_embeddings_model(model_name)

# Test embedding a query
text = "This is a test document."
query_result = embeddings.embed_query(text)
print(query_result[:3])


[-0.04895179346203804, -0.03986209258437157, -0.021562770009040833]


## Vector Database

In [15]:
def vectorize_documents(data: List[Document], embedding_function: HuggingFaceEmbeddings) -> Chroma:
    """Vectorizes a list of Documents using a Hugging Face Transformer model.

    Args:
        data (List[Document]): A list of Documents to vectorize.
        embedding_function (HuggingFaceEmbeddings): An Embeddings object that can be used to encode text into embeddings.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """

    ## Chroma, as a vector database, cosine similarity by default for searches.
    db = Chroma.from_documents(data, embedding=embedding_function, 
                            #    collection_metadata={"hnsw:space": "l2"}
                               collection_metadata={"hnsw:space": "cosine"}
                               )
    return db

#### Store Embeddeing data to Vector Store 

In [16]:
def init_llm():
    """Initializes the LLM by reading the CSV file, loading the embeddings model, and vectorizing the documents.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """
    # Replace 'read_csv' with the appropriate CSV reading logic
    data = read_csv(file_path='amazon_rag.csv', source_column="product_name")
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    # model_name = 'jhgan/ko-sroberta-nli'
    embedding_function = load_embeddings_model(model_name)
    db = vectorize_documents(data, embedding_function)
    return db

In [17]:
db = init_llm()

### Similarity Search

In [20]:
# Query the vector database
query = "iPhone USB charger and adapter"
found_docs = db.similarity_search_with_score(query, k=5)


In [21]:
# Load documents
found_docs

[(Document(metadata={'row': 78, 'source': 'swapkart fast charging cable and data sync usb cable compatible for iphone 6/6s/7/7+/8/8+/10/11, 12, 13 pro max ipad air/mini, ipod and ios devices (white)'}, page_content="product_id: B0B2DJDCPX\nproduct_name: swapkart fast charging cable and data sync usb cable compatible for iphone 6/6s/7/7+/8/8+/10/11, 12, 13 pro max ipad air/mini, ipod and ios devices (white)\nabout_product: [High Compatibility] : This iphone data cable supports with iPhone 6,6s,6 plus,6s plus,7 7 plus ,8 8plus,x,xs,11 pro max,12 mini pro max,13 mini pro max iPad Air, iPad mini, iPod Nano and iPod Touch|[Fast Charge&Data Sync ] : It can charge and sync simultaneously at a rapid speed, Compatible with any charging adaptor, multi-port charging station or power bank ,for fast charging ,fast adapter is must.|😍【Durable Spring Protection】：The easy-to-break connection port is protected by spring, which is a flexible and durable cable.You can use it with confidence.|【 Ultra High 

In [22]:
# Document with Score
document, score = found_docs[-1]
print(document.page_content)
print(f"\nScore: {score}")

product_id: B09Q8HMKZX
product_name: portronics konnect l 20w pd quick charge type-c to 8-pin usb mobile charging cable, 1.2m, tangle resistant, fast data sync(grey)
about_product: [1.2 M LONG DURABLE CABLE] : The Konnect L is about 1.2 M long and this feature allows for maximum convenience of the user. Not only is the cable length ample it also is durably built which ensures that no matter how roughly its handled no harm is done to it!|[20 W PD QUICK CHARGE FOR 8 PIN USB] : The Konnect L cable for 8 pin USB is an efficient cable for Apple smartphones charging and is as powerful as its Apple counterparts. You can plug in your smartphone for lightning speed charging!|[TPE & NYLON BRAIDED CORD] : The cable has been designed for optimal results and keeping this in mind it has been built with TPE and has further been nylon braided to ensure longevity and safe upkeep.|[TANGLE RESISTANT] : Isn’t it frustrating when we are in a hurry to charge our devices, and while pulling out the charger fi

In [24]:
found_docs[-1][0].page_content

'product_id: B09Q8HMKZX\nproduct_name: portronics konnect l 20w pd quick charge type-c to 8-pin usb mobile charging cable, 1.2m, tangle resistant, fast data sync(grey)\nabout_product: [1.2 M LONG DURABLE CABLE] : The Konnect L is about 1.2 M long and this feature allows for maximum convenience of the user. Not only is the cable length ample it also is durably built which ensures that no matter how roughly its handled no harm is done to it!|[20 W PD QUICK CHARGE FOR 8 PIN USB] : The Konnect L cable for 8 pin USB is an efficient cable for Apple smartphones charging and is as powerful as its Apple counterparts. You can plug in your smartphone for lightning speed charging!|[TPE & NYLON BRAIDED CORD] : The cable has been designed for optimal results and keeping this in mind it has been built with TPE and has further been nylon braided to ensure longevity and safe upkeep.|[TANGLE RESISTANT] : Isn’t it frustrating when we are in a hurry to charge our devices, and while pulling out the charger

In [25]:
#Get source 
found_docs[0][0].metadata['source']

'swapkart fast charging cable and data sync usb cable compatible for iphone 6/6s/7/7+/8/8+/10/11, 12, 13 pro max ipad air/mini, ipod and ios devices (white)'

In [26]:
# def run_vector_search(query: str):
#     # print(query)
#     query_vector = db.similarity_search_with_score(query, k=5)
#     # print(query_vector)
#     # document, score = query_vector
    
#     # return query_vector.metadata['source'], document.page_content, score
#     return query_vector

def run_vector_search(query: str, k = 3) -> str:
    """Performs a vector search and returns results in a structured format."""
    query_vector = db.similarity_search_with_score(query, k=k)

    # Extract and structure the search results
    results = []
    for document, score in query_vector:
        # Extract metadata and page content
        metadata = document.metadata
        page_content = {k.strip(): v.strip() for k, v in [line.split(':', 1) for line in document.page_content.split('\n') if ':' in line]}
        
        # Combine all data into a single dictionary
        combined_result = {
            "source": metadata.get("source", "Unknown Source"),
            **page_content,
            "score": score
        }
        results.append(combined_result)

    # Convert to a DataFrame
    df_results = pd.DataFrame(results)
    return df_results

In [27]:
run_vector_search('what is the iphone cable?')

Unnamed: 0,source,product_id,product_name,about_product,main_category,sub_category,actual_price,discount_percentage,rating,rating_count,score
0,duracell usb c to lightning apple certified (m...,B09W5XR9RT,duracell usb c to lightning apple certified (m...,1.2M Tangle Free durable tough braiding sync &...,Computers&Accessories,USBCables,"₹1,999",51%,4.4,184,0.429515
1,"gilary multi charging cable, 3 in 1 nylon brai...",B08LKS3LSP,"gilary multi charging cable, 3 in 1 nylon brai...",The cable comes with 3 Different pins allowing...,Computers&Accessories,USBCables,₹999,65%,3.7,1097,0.430282
2,swapkart fast charging cable and data sync usb...,B0B2DJDCPX,swapkart fast charging cable and data sync usb...,[High Compatibility] : This iphone data cable ...,Computers&Accessories,USBCables,₹499,58%,3.9,536,0.432239


### Text generation model

#### Generate Prompt

In [31]:
pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-1.2.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.2.0-py3-none-any.whl (336 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
import accelerate
print(accelerate.__version__)

1.2.0


In [38]:
pip install 'accelerate>=0.26.0'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [40]:
pip install accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [41]:
pip install --upgrade accelerate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [42]:
python3 -m pip install accelerate

SyntaxError: invalid syntax (1348341046.py, line 1)

In [39]:
from transformers import AutoTokenizer
import transformers 
import torch

# Load model
# model = "nilq/mistral-1L-tiny" 
#huggingface-cli login
#Reference: https://huggingface.co/docs/huggingface_hub/en/guides/cli

model = "ArliAI/Gemma-2-2B-ArliAI-RPMax-v1.1"#'ArliAI/Mistral-Small-22B-ArliAI-RPMax-v1.1'
tokenizer = AutoTokenizer.from_pretrained(
    model,  
    use_auth_token= "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", 
)

# pipeline 
pipeline = transformers.pipeline(
    "text-generation",  
    model=model,    
    torch_dtype=torch.float16,  
    device_map="auto",  
)

ImportError: Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>=0.26.0'`

In [32]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline(pipeline=pipeline)


NameError: name 'pipeline' is not defined

In [33]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import PromptTemplate

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

In [34]:
from langchain_core.prompts import PromptTemplate

template = """Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)
print(prompt)
chain = prompt | hf

question = "What is electroencephalography?"

print(chain.invoke({"question": question}))

input_variables=['question'] input_types={} partial_variables={} template='Question: {question}\nAnswer:'


The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Question: What is electroencephalography?
Answer: Electroencephalography is used to record the electrical activity of the brain through electrodes attached to the scalp.


In [39]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Query definition
query = "suggest cool iPhone USB charger and adapter"
# query = "what is the iphone cable?"
# query = "What is the caracteristic of iPhone USB charger and adapter"

# Perform vector search
doc_context = run_vector_search(query)

# Extract relevant columns
doc = doc_context[['product_name', 'about_product']]
# doc = doc_context[[ 'about_product']]

# print(doc)
# Convert context to string
context = doc.to_string(index=False)

#You are an assistant in customer service. Use the following context to answer the question:

# Define the prompt template
# promptTemplate_fstring = """
# Context:
# {context}
# Question:
# {query}
# Answer:
# """

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

# Initialize the prompt
prompt = PromptTemplate(
    # input_variables=["query", "context"],
    template=promptTemplate_fstring,
)

# print(prompt)
# Create the chain
chain = LLMChain(prompt=prompt, llm=hf)

# Run the chain and get the response
response = chain.run({"query": query, "context": context})

# Print the response
print(response)


You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
                                                                                                                                               product_name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                