In [20]:
import re
from langchain.embeddings import OpenAIEmbeddings
from chromadb.utils import embedding_functions
import uuid
import os
import openai
from dotenv import load_dotenv
import pandas as pd

credential_path = "../../.credential"
assert os.path.exists(credential_path)
load_dotenv(credential_path)

openai.api_key = os.environ["OPENAI_API_KEY"]
# client = chromadb.Client(chroma_setting)
from chromadb import HttpClient

client = HttpClient()

In [21]:
preprocessed_data_folder = "../preprocessed_data"
chunk_size = 600

In [22]:
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken


def count_tokens(text, model="gpt-3.5-turbo"):
    encoding = tiktoken.encoding_for_model(model)

    return len(encoding.encode(text))


openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY", "missing openai api key"),
    model_name="text-embedding-ada-002",
)

In [23]:

product_name_list = os.listdir(preprocessed_data_folder)
product_name_list.remove("product_a")
product_name_list.sort()
display(product_name_list)
# product_name_list = ["washing_machine_haier","air_purifier_electrolux"]
# remove = ["washing_machine_haier","air_purifier_electrolux"]
# for i in remove:
#     product_name_list.remove(i)
# product_name_list = ["tourist_visa"]
product_name_list = ['floral_embroidered_mini_dress',
 'jaspal_x_kenny_scharf_blobzr_me_t-shirt',]
product_name_list

['air_purifier_electrolux',
 'air_purifier_philips',
 'air_purifier_xiaomi',
 'electric_fan_hatari',
 'electric_fan_sharp',
 'electric_fan_toshiba',
 'floral_embroidered_mini_dress',
 'jaspal_x_kenny_scharf_blobzr_me_t-shirt',
 'refrigerator_haier',
 'refrigerator_samsung',
 'refrigerator_sharp',
 'tourist_visa',
 'tv_lg',
 'tv_samsung',
 'tv_sony',
 'washing_machine_haier',
 'washing_machine_lg',
 'washing_machine_samsung']

['floral_embroidered_mini_dress', 'jaspal_x_kenny_scharf_blobzr_me_t-shirt']

In [31]:

def extract_numeric_part(filename):
    # Use regular expression to extract the numeric part of the filename
    match = re.search(r"\d+", filename)
    if match:
        return int(match.group())
    return None


# Step 4: Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    # chunk_size=300,
    chunk_size=chunk_size,
    chunk_overlap=100,
    length_function=count_tokens,
)
for product_name in product_name_list:
    print(product_name)
    brand = product_name.split("_")[-1]
    product = product_name.replace("_" + brand, "")

    product_path = os.path.join(preprocessed_data_folder, product_name)
    text_file_name_list = os.listdir(product_path)
    # .txt only

    text_file_name_list = [i for i in text_file_name_list if i.endswith(".txt")]

    # print(text_file_name_list)

    # Sort the file names based on the numeric part
    text_file_name_list = sorted(text_file_name_list, key=extract_numeric_part)
    meta_data_list = []
    text_list = []

    for ind, text_file_name in enumerate(text_file_name_list):
        text_path = os.path.join(
            product_path, text_file_name.split(".")[0] + ".txt"
        )

        with open(text_path, "r") as f:
            text = f.read()

        meta_data = {
            "page_index": ind + 1,
            "brand": brand,
            "product": product,
        }
        # meta_data = {"hellow": ind+1}
        text_list.append(text)
        meta_data_list.append(meta_data)
    chunks = text_splitter.create_documents(text_list, meta_data_list)

    new_text_list = []
    new_meta_data_list = []
    for chunk in chunks:
        new_text_list.append(chunk.page_content)
        new_meta_data_list.append(chunk.metadata)
    collection = client.get_or_create_collection(
        name=product_name, embedding_function=openai_ef
    )
    collection.add(
        documents=new_text_list,  # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
        metadatas=new_meta_data_list,  # filter on these!
        ids=[f"doc_{i+1}_{uuid.uuid4}" for i in range(len(new_text_list))],  # unique for each doc
    )

floral_embroidered_mini_dress
jaspal_x_kenny_scharf_blobzr_me_t-shirt


In [7]:
text_list = []
meta_data_list = []
for chunk in chunks:
    text_list.append(chunk.page_content)
    meta_data_list.append(chunk.metadata)

In [28]:
list_collections = client.list_collections()
list_collections

[Collection(name=refrigerator_samsung),
 Collection(name=air_purifier_xiaomi),
 Collection(name=tourist_visa),
 Collection(name=electric_fan_hatari),
 Collection(name=washing_machine_haier),
 Collection(name=floral_embroidered_mini_dress),
 Collection(name=electric_fan_sharp),
 Collection(name=washing_machine_lg),
 Collection(name=tv_samsung),
 Collection(name=refrigerator_haier),
 Collection(name=air_purifier_electrolux),
 Collection(name=air_purifier_philips),
 Collection(name=jaspal_x_kenny_scharf_blobzr_me_t-shirt),
 Collection(name=tv_sony),
 Collection(name=washing_machine_samsung),
 Collection(name=refrigerator_sharp),
 Collection(name=electric_fan_toshiba),
 Collection(name=tv_lg)]

In [29]:
client.get_collection("tourist_visa").get()
client.get_collection("floral_embroidered_mini_dress").get()

{'ids': ['doc_1_<function uuid4 at 0x104d4e290>'],
 'embeddings': None,
 'metadatas': [{'brand': 'dress',
   'page_index': 1,
   'product': 'floral_embroidered_mini'}],
 'documents': ['color: black\ncolor: beige\nsize: XS\nsize: S\nsize: M\nsize: L\nsize: XL\nprice: ฿2,295.00\nproduct_name: \nFloral Embroidered Mini Dress \ndiv_value: Add some beautiful botanicals to your closet with this mini dress. It features floral embroidery, a V-neckline, a sleeveless design, slash pockets, and a back zip fastening. Match it with heels to finish the look. Perfect for wearing to a date or a caf?. Available in black or beige.\nsize_chart_for_recommend: size xs chest 30-32 inch waist 23-25 inch hip 33.5-35.5 inch\nsize_chart_for_recommend: size xs chest 77-81 cm waist 59-63 cm hip 86-90 cm\nsize_chart_for_recommend: size s chest 32-34 inch waist 25-27 inch hip 35.5-37.5 inch\nsize_chart_for_recommend: size s chest 82-86 cm waist 64-68 cm hip 91-95 cm\nsize_chart_for_recommend: size m chest 34-36 inc

In [40]:
raise

RuntimeError: No active exception to reraise

In [30]:
# 'floral_embroidered_mini_dress', 'jaspal_x_kenny_scharf_blobzr_me_t-shirt'
# client.delete_collection("floral_embroidered_mini_dress")
# client.delete_collection("jaspal_x_kenny_scharf_blobzr_me_t-shirt")
# client.delete_collection("tv_sony")

# #set ALLOW_RESET=TRUE 
# import os   
# for collection in list_collections:
#     client.delete_collection(collection.name)

In [17]:
db = client.get_collection("tv_sony")
db

Collection(name=tv_sony)

In [22]:
import chromadb

def get_chromadb_setting():
    setting = chromadb.config.Settings()
    return setting
from langchain.vectorstores import Chroma
embeddings=OpenAIEmbeddings()

collection_name = "tv_sony"
db = Chroma(
        collection_name=collection_name,
        client=chromadb.HttpClient(
            host="localhost", settings=get_chromadb_setting()
        ),
        embedding_function=embeddings,
    )


In [23]:
query = "มีช่อง HDMI กี่ช่อง"
docs = db.similarity_search(query)

In [26]:
for doc in docs:
    print(doc.page_content)
    print()

คมชัดระดับ 4K HDR ให้ภาพสมจริงขึ้นไปอีกระดับ ให้สีสัน ความมืด และความสว่างที่ถูกต้องระบบประมวลผลภาพ 4K X-Reality PRO เพิ่มความคมชัด และ 4K Resolution เพื่อเพิ่มความละเอียดของภาพชิปประมวลผล 4K processor X1 ที่ทำให้ภาพมีคุณภาพและมิติระบบปฏิบัติการ Google TV สามารถออกคำสั่งทีวีด้วยเสียงภาษาไทยโดยไม่จำเป็นต้องใช้รีโมทสามารถเชื่อมเนื้อหาบนโทรศัพท์มือถือทั้ง Android และ IOS ในการรองรับ Chrome cast และ Apple Airplayสั่งงานเครื่องใช้ไฟฟ้าอื่นๆ ในบ้านที่รองรับระบบ IOT โดยผ่าน Apple Homekitจัดระเบียบสายต่าง ๆ ที่เชื่อมต่อกับทีวีให้ดูเป็นระเบียบสวยงามด้วย Cable managementความจุภายใน 16 GB (พื้นที่บางส่วนอาจถูกใช้งานเพื่อข้อมูลในส่วนของระบบปฏิบัติการและแอปพลิเคชั่น)กำลังเสียงของลำโพง 10W+10W และ รองรับ Digital TV (DVB –T2)

- แบรนด์: SONY
- ซีรีส์: X75
- ความสูง: 66.2 ซม.
- ความกว้าง: 112.7 ซม.
- ความลึก: 7.7 ซม.
- น้ำหนัก: 9.8 กก.
- ขนาดหน้าจอ: 50 นิ้ว
- ความละเอียดหน้าจอ: 4K (2160P)
- ชนิดหน้าจอ: LED
- DIGITAL TV BUILT IN: YES
- SMART TV: ANDROID TV
- HDR FORMAT: HDR10
- HDMI ports (ช่อง): 3
- U