# LangChain

## Preprocessing

In [1]:
import pandas as pd 

In [2]:
# Read the online retail dataset
data = pd.read_csv('./data/amazon.csv')
df = data[:100].copy()
df.dropna(subset=['rating_count'], inplace=True)

df['sub_category'] = df['category'].astype(str).str.split('|').str[-1]
df['main_category'] = df['category'].astype(str).str.split('|').str[0]

In [3]:
df.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link', 'sub_category',
       'main_category'],
      dtype='object')

In [4]:
df1 = df.copy()
df1['product_name'] = df1['product_name'].str.lower() 
df1 = df1.drop_duplicates(subset=['product_name'])    # Remove duplicates based on 'product_name'

In [5]:
print(df.shape)
print(df1.shape)

(100, 18)
(100, 18)


### Check Product Name & Description

In [6]:
df1['product_name'][0]

'wayona nylon braided usb to lightning fast charging and data sync cable compatible for iphone 13, 12,11, x, 8, 7, 6, 5, ipad air, pro, mini (3 ft pack of 1, grey)'

In [7]:
df1['about_product']

0     High Compatibility : Compatible With iPhone 12...
1     Compatible with all Type C enabled devices, be...
2     【 Fast Charger& Data Sync】-With built-in safet...
3     The boAt Deuce USB 300 2 in 1 cable is compati...
4     [CHARGE & SYNC FUNCTION]- This cable comes wit...
                            ...                        
95    Supports 150Mbps Wireless data transmission ra...
96    Compatible with MI Smart TV 4A 32 inch LED TV ...
97    The cable comes with 3 Different pins allowing...
98    Fastest USB 3.0 and Gigabit solution ensure hi...
99    【Power Delivery Fast Charging】: Charge your iP...
Name: about_product, Length: 100, dtype: object

In [8]:
df2 = df1[['product_id','product_name', 'about_product','main_category','sub_category', 'actual_price','discount_percentage','rating','rating_count' ]]

In [9]:
df2.head()

Unnamed: 0,product_id,product_name,about_product,main_category,sub_category,actual_price,discount_percentage,rating,rating_count
0,B07JW9H4J1,wayona nylon braided usb to lightning fast cha...,High Compatibility : Compatible With iPhone 12...,Computers&Accessories,USBCables,"₹1,099",64%,4.2,24269
1,B098NS6PVG,ambrane unbreakable 60w / 3a fast charging 1.5...,"Compatible with all Type C enabled devices, be...",Computers&Accessories,USBCables,₹349,43%,4.0,43994
2,B096MSW6CT,sounce fast phone charging cable & data sync u...,【 Fast Charger& Data Sync】-With built-in safet...,Computers&Accessories,USBCables,"₹1,899",90%,3.9,7928
3,B08HDJ86NZ,boat deuce usb 300 2 in 1 type-c & micro usb s...,The boAt Deuce USB 300 2 in 1 cable is compati...,Computers&Accessories,USBCables,₹699,53%,4.2,94363
4,B08CF3B7N1,portronics konnect l 1.2m fast charging 3a 8 p...,[CHARGE & SYNC FUNCTION]- This cable comes wit...,Computers&Accessories,USBCables,₹399,61%,4.2,16905


In [10]:
df2.to_csv('data/amazon_rag.csv', index=False)

### Load Document
- Document Loader	Data Type
- CSVLoader	CSV files
- DirectoryLoader	All files in a given directory
- Unstructured	Many file types (see https://docs.unstructured.io/platform/supported-file-types)
- JSONLoader	JSON files
- BSHTMLLoader	HTML files

In [11]:
# from langchain.document_loaders.csv_loader import CSVLoader
# from langchain_community.document_loaders import DirectoryLoader, TextLoader
# from langchain_community.document_loaders import NotionDirectoryLoader, NotionDBLoader

# loader = DirectoryLoader("../", glob="**/*.md", loader_cls=TextLoader)
# docs = loader.load()

In [13]:
from typing import List
from dotenv import load_dotenv
import os

from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain.schema.document import Document


# This will expose your Langchain api token as an environment variable
load_dotenv()

# def read_csv(file_path: str, source_column: str = "about_product") -> List[Document]:
def read_csv(file_path: str, source_column: str = "product_name") -> List[Document]:
    """Reads a CSV file and returns a list of Documents.

    Args:
        file_path (str): The path to the CSV file to read.
        source_column (str, optional): The name of the column in the CSV file that contains the text data. Defaults to "Description".

    Returns:
        List[Document]: A list of Documents, where each Document contains the text data from the corresponding row in the CSV file.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        IOError: If there is an error reading the CSV file.
    """

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File does not exist: {file_path}")

    loader = CSVLoader(file_path=file_path, source_column=source_column)
    data = loader.load()

    return data

## Load Embedding Model 

### Access Token
- reference: https://huggingface.co/docs/hub/security-tokens

In [14]:
# Set the token
os.environ["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"] = '' #본인의 Hugging Face token 입력

In [17]:
from langchain_huggingface import HuggingFaceEmbeddings
#reference: https://python.langchain.com/docs/integrations/providers/huggingface/#huggingfaceembeddings
# model_name = "intfloat/multilingual-e5-large-instruct"
model_name = 'jhgan/ko-sroberta-nli'

# Function to load embeddings model
def load_embeddings_model(model_name: str) -> HuggingFaceEmbeddings:
    """Loads a Hugging Face Transformer model and returns an Embeddings object.

    Args:
        model_name (str): The name of the Hugging Face Transformer model to load.

    Returns:
        HuggingFaceEmbeddings: An Embeddings object that can be used to encode text into embeddings.
    """
    embedding_function = HuggingFaceEmbeddings(
        model_name=model_name,
        huggingfacehub_api_token=os.environ["HUGGINGFACEHUB_API_TOKEN"],
        model_kwargs={'device':'cpu'},
        encode_kwargs={'normalize_embeddings':True}
    )
    return embedding_function


#### Test Embeddings 

In [20]:
def load_embeddings_model(model_name: str) -> SentenceTransformer:
    """Loads a SentenceTransformer model and returns it."""
    # SentenceTransformer를 사용하여 모델을 로드
    model = SentenceTransformer(model_name)
    return model

# Load the embeddings
model_name = 'sentence-transformers/all-mpnet-base-v2'
embeddings = load_embeddings_model(model_name)

# Test embedding a query
text = "This is a test document."
query_result = embeddings.encode([text])  # SentenceTransformer에서 사용하는 방식
print(query_result[:3])  # 출력되는 벡터의 처음 세 개의 값만 출력

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[[-4.89517488e-02 -3.98619249e-02 -2.15627961e-02  9.90846660e-03
  -3.81040238e-02  1.26843844e-02  4.34945486e-02  7.18339905e-02
   9.74856503e-03 -6.98698079e-03  6.35281429e-02 -3.03226467e-02
   1.38394646e-02  2.58058943e-02 -1.13624346e-03 -1.45636220e-02
   4.16402668e-02  3.62282805e-02 -2.68008839e-02  2.51206737e-02
  -2.49786302e-02 -4.53322427e-03 -2.66672336e-02  4.10072273e-03
  -5.20480089e-02 -9.93037038e-03 -5.20652793e-02  8.99204239e-03
  -3.83005217e-02 -4.40584458e-02 -4.20440501e-03  7.04797134e-02
   5.13389474e-03 -7.16154054e-02  1.69753173e-06 -6.04771404e-03
  -1.10763488e-02  1.75133739e-02 -2.22998373e-02  4.09549549e-02
   3.37901674e-02  5.66503368e-02 -7.11493790e-02  4.09766398e-02
  -5.90610830e-03 -3.29703279e-02 -4.01169434e-02  3.69054787e-02
   1.04445955e-02  7.07238093e-02  1.90010853e-03 -1.20581510e-02
   4.28838059e-02 -4.50334251e-02  1.06468126e-01 -5.94848348e-03
  -1.17971823e-02  2.58937515e-02  3.90455052e-02 -1.23679964e-02
   1.53814

## Vector Database

In [67]:
def vectorize_documents(data: List[Document], embedding_function: HuggingFaceEmbeddings) -> Chroma:
    """Vectorizes a list of Documents using a Hugging Face Transformer model.

    Args:
        data (List[Document]): A list of Documents to vectorize.
        embedding_function (HuggingFaceEmbeddings): An Embeddings object that can be used to encode text into embeddings.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """

    ## Chroma, as a vector database, cosine similarity by default for searches.
    db = Chroma.from_documents(data, embedding=embedding_function, 
                            #    collection_metadata={"hnsw:space": "l2"}
                               collection_metadata={"hnsw:space": "cosine"}
                               )
    return db

#### Store Embeddeing data to Vector Store 

In [68]:
def init_llm():
    """Initializes the LLM by reading the CSV file, loading the embeddings model, and vectorizing the documents.

    Returns:
        Chroma: A Chroma object that contains the vectorized documents.
    """
    # Replace 'read_csv' with the appropriate CSV reading logic
    data = read_csv(file_path='data/amazon_rag.csv', source_column="product_name")
    model_name = 'sentence-transformers/all-mpnet-base-v2'
    # model_name = 'jhgan/ko-sroberta-nli'
    embedding_function = load_embeddings_model(model_name)
    db = vectorize_documents(data, embedding_function)
    return db

In [69]:
db = init_llm()

AttributeError: 'str' object has no attribute 'page_content'

### Similarity Search

In [51]:
# Query the vector database
query = "iPhone USB charger and adapter"
found_docs = db.similarity_search_with_score(query, k=5)


In [52]:
# Load documents
found_docs

[(Document(metadata={}, page_content='portronics konnect l 1.2m fast charging 3a 8 pin usb cable with charge & sync function for iphone, ipad (grey)'),
  0.4407852292060852),
 (Document(metadata={}, page_content='swapkart fast charging cable and data sync usb cable compatible for iphone 6/6s/7/7+/8/8+/10/11, 12, 13 pro max ipad air/mini, ipod and ios devices (white)'),
  0.46232080459594727),
 (Document(metadata={}, page_content='amazonbasics new release nylon usb-a to lightning cable cord, fast charging mfi certified charger for apple iphone, ipad (6-ft, rose gold)'),
  0.47595155239105225),
 (Document(metadata={}, page_content='sounce fast phone charging cable & data sync usb cable compatible for iphone 13, 12,11, x, 8, 7, 6, 5, ipad air, pro, mini & ios devices'),
  0.49668216705322266),
 (Document(metadata={}, page_content='amazonbasics nylon braided usb-c to lightning cable, fast charging mfi certified smartphone, iphone charger (6-foot, dark grey)'),
  0.49811893701553345)]

In [53]:
# Document with Score
document, score = found_docs[-1]
print(document.page_content)
print(f"\nScore: {score}")

amazonbasics nylon braided usb-c to lightning cable, fast charging mfi certified smartphone, iphone charger (6-foot, dark grey)

Score: 0.49811893701553345


In [63]:
found_docs[-1][0].page_content

'amazonbasics nylon braided usb-c to lightning cable, fast charging mfi certified smartphone, iphone charger (6-foot, dark grey)'

In [None]:
#Get source 
found_docs[0][0].metadata['source']

KeyError: 'source'

In [60]:
# def run_vector_search(query: str):
#     # print(query)
#     query_vector = db.similarity_search_with_score(query, k=5)
#     # print(query_vector)
#     # document, score = query_vector
    
#     # return query_vector.metadata['source'], document.page_content, score
#     return query_vector

def run_vector_search(query: str, k = 3) -> str:
    """Performs a vector search and returns results in a structured format."""
    query_vector = db.similarity_search_with_score(query, k=k)

    # Extract and structure the search results
    results = []
    for document, score in query_vector:
        # Extract metadata and page content
        metadata = document.metadata
        page_content = {k.strip(): v.strip() for k, v in [line.split(':', 1) for line in document.page_content.split('\n') if ':' in line]}
        
        # Combine all data into a single dictionary
        combined_result = {
            "source": metadata.get("source", "Unknown Source"),
            **page_content,
            "score": score
        }
        results.append(combined_result)

    # Convert to a DataFrame
    df_results = pd.DataFrame(results)
    return df_results

In [None]:
run_vector_search('what is the iphone cable?')

Unnamed: 0,source,score
0,Unknown Source,0.354823
1,Unknown Source,0.355708
2,Unknown Source,0.35737


### Text generation model

#### Generate Prompt

In [31]:
from transformers import AutoTokenizer
import transformers 
import torch

# Load model
# model = "nilq/mistral-1L-tiny" 
#huggingface-cli login
#Reference: https://huggingface.co/docs/huggingface_hub/en/guides/cli

model = "ArliAI/Gemma-2-2B-ArliAI-RPMax-v1.1"#'ArliAI/Mistral-Small-22B-ArliAI-RPMax-v1.1'
tokenizer = AutoTokenizer.from_pretrained(
    model,  
    use_auth_token= '', #본인의 Token 업데이트, 
)

# pipeline 
pipeline = transformers.pipeline(
    "text-generation",  
    model=model,    
    torch_dtype=torch.float16,  
    device_map="auto",  
)



Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  53%|#####2    | 2.63G/4.99G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs-us-1.hf.co/repos/42/7f/427f46441aabe23286fce919e9893cd9c666f839b80658e8a7a972bc1d646d83/db5ecebda6b1cd7203acc4e9a7eae9bdcf6edd21fdb59ad31b3fb622ed27c19c?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00002.safetensors%3B+filename%3D%22model-00001-of-00002.safetensors%22%3B&Expires=1733705384&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMzcwNTM4NH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzQyLzdmLzQyN2Y0NjQ0MWFhYmUyMzI4NmZjZTkxOWU5ODkzY2Q5YzY2NmY4MzliODA2NThlOGE3YTk3MmJjMWQ2NDZkODMvZGI1ZWNlYmRhNmIxY2Q3MjAzYWNjNGU5YTdlYWU5YmRjZjZlZGQyMWZkYjU5YWQzMWIzZmI2MjJlZDI3YzE5Yz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=ga9ATrvD5CRvQKVMRumo3n40Qz4ATbLg6eFWsO5ZQbNAQaDvBcbKRsGuLT1zcSoVmIC43EO9DZoolQNlU2KYefRW1r6TYGhSPjE3dtYQTFuC4EFSbULlVilcZFme3GCsFaYZf5K4n6YseTiaE4rW%7EI8zNCWfkP1TPoeLcMWr-WWEFp%7EJqPGeddA61X6x%7EqmOBZdxt2YvG1EL95QN6gKm8EfnMCAg3BZu

model-00001-of-00002.safetensors:  81%|########1 | 4.06G/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

--- Logging error ---
Traceback (most recent call last):
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/logging/__init__.py", line 1083, in emit
    msg = self.format(record)
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/logging/__init__.py", line 927, in format
    return fmt.format(record)
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/logging/__init__.py", line 663, in format
    record.message = record.getMessage()
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/logging/__init__.py", line 367, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/envs/torch-env2/lib/python3.9/site-packages/ipykernel_launcher.py", line 18,

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/213 [00:00<?, ?B/s]

Device set to use mps


In [32]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline(pipeline=pipeline)


In [33]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain import PromptTemplate

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

In [34]:
from langchain_core.prompts import PromptTemplate

template = """Question: {question}
Answer:"""

prompt = PromptTemplate.from_template(template)
print(prompt)
chain = prompt | hf

question = "What is electroencephalography?"

print(chain.invoke({"question": question}))

input_variables=['question'] input_types={} partial_variables={} template='Question: {question}\nAnswer:'


The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.


Question: What is electroencephalography?
Answer: Electroencephalography is used to record the electrical activity of the brain through electrodes attached to the scalp.


In [39]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Query definition
query = "suggest cool iPhone USB charger and adapter"
# query = "what is the iphone cable?"
# query = "What is the caracteristic of iPhone USB charger and adapter"

# Perform vector search
doc_context = run_vector_search(query)

# Extract relevant columns
doc = doc_context[['product_name', 'about_product']]
# doc = doc_context[[ 'about_product']]

# print(doc)
# Convert context to string
context = doc.to_string(index=False)

#You are an assistant in customer service. Use the following context to answer the question:

# Define the prompt template
# promptTemplate_fstring = """
# Context:
# {context}
# Question:
# {query}
# Answer:
# """

promptTemplate_fstring = """
You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
{context}
Question:
{query}
Answer:
"""

# Initialize the prompt
prompt = PromptTemplate(
    # input_variables=["query", "context"],
    template=promptTemplate_fstring,
)

# print(prompt)
# Create the chain
chain = LLMChain(prompt=prompt, llm=hf)

# Run the chain and get the response
response = chain.run({"query": query, "context": context})

# Print the response
print(response)


You are a customer service assistant, tasked with providing clear and concise answers based on the given context. 
Context:
                                                                                                                                               product_name                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                