In [None]:
!pip install langchain chromadb sentence-transformers
!pip install -U langchain-community
!pip install -U langchain-huggingface
!pip install tf-keras
!pip install langdetect

In [None]:
import pandas as pd
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without storing the secret in the
# notebook: use the HF_TOKEN environment variable when it is set, otherwise
# prompt for the token interactively (the input is not echoed or saved).
import os
from getpass import getpass

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

In [None]:
csv_path = "/content/클렌징_RAG_데이터.csv"  # Path to the review CSV; adjust to where the file lives

In [None]:
# Step 4: load the review data.
# Rows missing any of the required columns (product name, skin type,
# review text, category) are discarded right after reading.
df = pd.read_csv(csv_path).dropna(
    subset=["상품명", "피부타입", "리뷰내용", "품목"]
)

# Metadata is attached to each review when the documents are built.

In [None]:
from textwrap import wrap
from langchain.schema import Document

CHUNK_WIDTH = 400  # maximum number of characters per review chunk


def _clean_text(value):
    """Return ``value`` as a stripped string, mapping missing values to "".

    Plain ``str()`` would turn a missing cell (NaN/None) into the literal
    text "nan"/"None", which would then be stored as real metadata.
    """
    if pd.isna(value):
        return ""
    # A numeric column that contains blanks is parsed as float, so an integral
    # code such as a barcode would otherwise be rendered with a trailing ".0".
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


# Build one Document per review chunk; every chunk carries the metadata of the
# row it came from so results can later be filtered or traced to the review.
documents = []

for idx, row in df.iterrows():
    # Required fields: rows where these are missing were dropped when loading.
    review = str(row["리뷰내용"]).strip()
    product = str(row["상품명"]).strip()
    skin_type = str(row["피부타입"]).strip()
    category = str(row["품목"]).strip()

    # Optional fields: the column may be absent or the cell may be empty.
    ingredients = _clean_text(row.get("성분", ""))
    barcode = _clean_text(row.get("바코드", ""))

    # Map the vegan flag to a label: missing -> "정보 없음", 1 -> "비건",
    # anything else -> "논비건".
    vegan_flag = row.get("비건품목", None)
    if pd.isna(vegan_flag):
        vegan_label = "정보 없음"
    elif int(vegan_flag) == 1:
        vegan_label = "비건"
    else:
        vegan_label = "논비건"

    # textwrap.wrap splits on whitespace into pieces of at most CHUNK_WIDTH
    # characters; a blank review yields no chunks and is therefore skipped.
    for i, chunk in enumerate(wrap(review, CHUNK_WIDTH)):
        documents.append(
            Document(
                page_content=chunk,
                metadata={
                    "product": product,
                    "skin_type": skin_type,
                    "category": category,
                    "ingredients": ingredients,
                    "barcode": barcode,
                    "review_id": f"{product}_{idx}",
                    "chunk_id": i,
                    "vegan": vegan_label,
                },
            )
        )


In [None]:
# Use the maintained langchain_huggingface implementation. Importing the class
# from langchain.embeddings here would rebind the name to the deprecated
# variant and override the langchain_huggingface import made in the first cell.
from langchain_huggingface import HuggingFaceEmbeddings

# Multilingual sentence-embedding model used to embed the review chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
)


In [None]:
from langchain.vectorstores import Chroma

# Embed every chunk and store it in a Chroma collection persisted on disk.
# Each chunk gets a deterministic id (review id + chunk index) instead of a
# random one, so re-running this cell against the same persist_directory
# overwrites the existing entries rather than appending duplicate copies.
# Assumes the DataFrame index is unique, which makes these ids unique.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=[
        f"{doc.metadata['review_id']}_{doc.metadata['chunk_id']}"
        for doc in documents
    ],
    persist_directory="chroma_multilingual",
)
