In [None]:
# --- 1. Install libraries ---
# Notebook shell escape (not plain Python): installs the third-party packages
# listed on the command line into the current runtime.
!pip install fpdf openai langchain langchain-community langchain-openai langgraph gradio pdfplumber chromadb

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... done
Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Collecting langgraph
  Downloading langgraph-0.6.7-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 kB 1.8 MB/s eta 0:00:00
Collecting chromadb
  Downloading chromadb-1.0.21-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting l

In [None]:

# --- 2. 라이브러리 불러오기 ---
import os
import openai
from fpdf import FPDF
import pandas as pd
import pdfplumber

# 🔑 OpenAI credentials.
# Reuse a key that is already present in the environment; only when none is set,
# prompt for it without echoing. This keeps the secret out of the notebook
# source and avoids overwriting an existing key with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ").strip()
openai.api_key = os.environ["OPENAI_API_KEY"]

# --- 3. The 30 original seed questions ---
# Hand-collected CrossFit community questions. They are interpolated into the
# generation prompt and also form the first part of the final question list.
base_questions = [
    "TYR 신발 신어볼 수 있는 곳",
    "테니스 클럽 해보신 분??",
    "크린이 이제 4개월차..횟수로는 30회 정도..",
    "풀업 후 손등",
    "근육 늘리는 법 궁금해요.",
    "식단 어떤가용~",
    "무릎이 아픈데 스쿼트 잘못해서 그럴까요?",
    "운동복 세탁 어떻게 하시나요? 세제질문",
    "손목 부상 질문",
    "다이어트쉐이크 뭐가 좋아요?",
    "핸푸할때 몸이 벽에 붙는 이유",
    "팔굽혀펴기 손바닥 위치.",
    "하체 스트레스 증량",
    "로말레오4 사이즈 팁좀 주세요",
    "실내자전거 어떤가요?",
    "연속 키핑 풀업이 안되는데 팁이 있을까요?",
    "파워스내치 언브로큰 관련 질문드립니다.",
    "줄넘기 코팅 무코팅 차이",
    "크리오로지 무릅보호대 세탁",
    "용인 보정동 근처는 정보가 없네요.",
    "닭가슴살 말고 단백질 뭐드세요?",
    "손바닥 보호대 세탁 어떻게 하시나요?",
    "역도 클린동작 고민과 질문이 있습니다",
    "rpm 줄넘기 똑같은건가요??",
    "역도 클래스",
    "여자 크로스핏 팬츠",
    "스내치 손목",
    "크로스핏은 주에 몇 회 정도가 초보에게 좋을까요??",
    "대구 달서구 추천",
    "다이어트 할수록 역도가 어렵네요 (+팔다리 긴체형의 고충)"
]

# --- 4. Generate 270 additional questions with GPT ---
import re

# Number of extra questions the prompt below asks the model for.
EXTRA_QUESTION_TARGET = 270

# Matches ONE leading list marker ("12. ", "3) ", "- ", "* ", "• ").
# Only the prefix is removed, so digits and periods that belong to the question
# itself (e.g. a trailing model number or a final ".") are preserved.
_LIST_MARKER = re.compile(r"^(?:\d+\s*[.)]|[-*•])\s+")


def _parse_generated_questions(raw_text, exclude=(), limit=None):
    """Turn the model's list-formatted reply into a clean list of questions.

    Args:
        raw_text: The reply text, one candidate question per line.
        exclude: Questions that must not be returned again (e.g. the seeds).
        limit: Maximum number of questions to return; ``None`` for no cap.

    Returns:
        Unique, non-empty questions in the order they appeared.
    """
    seen = set(exclude)
    parsed = []
    for line in raw_text.splitlines():
        candidate = _LIST_MARKER.sub("", line.strip(), count=1)
        # Tolerate a Python-style list reply: drop the trailing comma and quotes.
        candidate = candidate.rstrip(",").strip().strip("\"'").strip()
        # Skip code fences, lone brackets, and lines with no letters or digits.
        if candidate.startswith("```") or not any(ch.isalnum() for ch in candidate):
            continue
        if candidate in seen:
            continue
        seen.add(candidate)
        parsed.append(candidate)
        if limit is not None and len(parsed) >= limit:
            break
    return parsed


prompt = f"""
다음은 크로스핏 관련 질문 30개입니다:
{base_questions}

이 질문들을 참고하여 유사한 스타일의 질문을 270개 더 생성해줘.
중복되지 않고, 다양한 주제(운동 자세, 장비, 식단, 부상, 트렌드)를 반영해야 함.
리스트 형식으로 출력해.
"""

response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role":"user", "content":prompt}],
    max_tokens=6000,
    temperature=0.8
)

# Guard against a missing message body before stripping.
generated_text = (response.choices[0].message.content or "").strip()
extra_questions = _parse_generated_questions(
    generated_text, exclude=base_questions, limit=EXTRA_QUESTION_TARGET
)

# The model may return fewer usable lines than requested (for example when the
# reply is cut off by max_tokens), so report a shortfall instead of assuming 270.
if len(extra_questions) < EXTRA_QUESTION_TARGET:
    print(f"⚠️ 생성된 질문이 목표보다 적습니다: {len(extra_questions)}/{EXTRA_QUESTION_TARGET}")

# --- 5. Combine the original and generated questions ---
all_questions = base_questions + extra_questions
print(f"총 질문 개수: {len(all_questions)}")  # prints the actual total (at most 300)

# --- 6. Build the PDF file (questions only) ---
# Notebook shell escape: installs the fonts-nanum package. The NanumGothic.ttf
# path used below is presumably provided by that package — verify on the runtime.
!apt-get -y install fonts-nanum
pdf = FPDF()
pdf.add_page()
# Register the TrueType font under the family name "Nanum" so Hangul text can
# be written, then select it at 12pt.
pdf.add_font("Nanum", "", "/usr/share/fonts/truetype/nanum/NanumGothic.ttf", uni=True)
pdf.set_font("Nanum", size=12)

# Write one numbered entry per question, numbering from 1.
for i, q in enumerate(all_questions, 1):
    pdf.multi_cell(0, 10, f"{i}. {q}")

# Output path; reused later when the PDF is read back in.
pdf_file = "/content/crossfit_questions.pdf"
pdf.output(pdf_file)
print(f"✅ 질문 목록 PDF 생성 완료: {pdf_file}")

# --- 7. Load the PDF and preprocess its text ---
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in *pdf_path*, each followed by a newline.

    Pages for which ``extract_text()`` yields nothing (``None`` or an empty
    string) are skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = (page.extract_text() for page in document.pages)
        # Consume the generator while the PDF is still open.
        return "".join(f"{chunk}\n" for chunk in page_texts if chunk)

# Read the generated PDF back in and keep every non-blank line, trimmed.
text = extract_text_from_pdf(pdf_file)
paragraphs = [line for line in map(str.strip, text.split("\n")) if line]
# One row per extracted line, in a single column named "내용".
df = pd.DataFrame(paragraphs, columns=["내용"])
print(f"✅ PDF 데이터 불러오기 완료: {len(df)} 개 문단")

# --- 8. Chunking ---
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter configured with chunk_size=500 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
# Every row of the "내용" column is handed to the splitter as its own text.
docs = text_splitter.create_documents(df["내용"].tolist())
print(f"✅ 청킹 완료: {len(docs)} 개 청크 생성")

# --- 9. Vector DB ---
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
persist_dir = "/content/chroma_db"

# Embed the chunks and store them in a Chroma collection backed by persist_dir.
# No explicit persist() call: with Chroma 0.4+ data is written automatically
# when persist_directory is given, and the manual method is deprecated (it only
# produced a deprecation warning here).
# NOTE(review): re-running this cell against the same persist_dir presumably
# adds the same documents again — confirm, and clear the directory if needed.
vectorstore = Chroma.from_documents(docs, embedding=embeddings, persist_directory=persist_dir)
# Similarity search returning the top 3 chunks per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":3})
print("✅ 벡터DB 구축 완료")

# --- 10. LLMs + QA chains ---
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Two chat models at temperature 0: gpt-4o-mini and gpt-4o.
llm_fast = ChatOpenAI(model="gpt-4o-mini", temperature=0, verbose=True)
llm_accurate = ChatOpenAI(model="gpt-4o", temperature=0, verbose=True)

# Both chains share the same retriever and the "stuff" chain type, and are asked
# to return the source documents alongside the answer.
qa_chain_fast = RetrievalQA.from_chain_type(
    llm=llm_fast, retriever=retriever, chain_type="stuff", return_source_documents=True
)
qa_chain_accurate = RetrievalQA.from_chain_type(
    llm=llm_accurate, retriever=retriever, chain_type="stuff", return_source_documents=True
)
print("✅ LangChain QA 체인 준비 완료")

# --- 11. Evaluation function ---
def evaluate_answer(user_input, answer, sources):
    """Score an answer with simple keyword and length heuristics.

    Args:
        user_input: The question text; it is split on whitespace into keywords.
        answer: The answer text to score.
        sources: Retrieved source documents; only their truthiness is used.

    Returns:
        A ``(relevance, consistency, accuracy)`` tuple of integer scores.
    """
    # Count question keywords that appear verbatim in the answer (repeats count).
    keyword_hits = sum(token in answer for token in user_input.split())

    # Relevance: 50 points for having sources, 50 for any keyword overlap.
    relevance = (50 if sources else 0) + (50 if keyword_hits else 0)

    # Consistency: longer answers with more "." characters score higher.
    answer_length = len(answer)
    if answer_length > 300 and answer.count(".") > 3:
        consistency = 100
    elif answer_length > 150:
        consistency = 80
    else:
        consistency = 50

    # Accuracy: 30 with no overlap, otherwise 40 + 20 per hit, capped at 100.
    accuracy = min(40 + 20 * keyword_hits, 100) if keyword_hits else 30

    return relevance, consistency, accuracy

# --- 12. Gradio UI (모델별 비교 + 점수 출력) ---
import gradio as gr

def respond(user_input, chat_history):
    """Answer a question with both QA chains and score each answer.

    Args:
        user_input: The question typed by the user.
        chat_history: Flat list of ``(role, text)`` tuples; it is extended in
            place with the new user and bot entries.

    Returns:
        An 8-tuple: the chat as ``(user_text, bot_text)`` pairs, the updated
        ``chat_history``, then the three gpt-4o-mini scores and the three
        gpt-4o scores, each converted to a string.
    """
    query = {"query": user_input}

    # Ask the mini model first, then the full model, with the same question.
    mini_output = qa_chain_fast.invoke(query)
    full_output = qa_chain_accurate.invoke(query)

    mini_text = mini_output['result']
    full_text = full_output['result']

    # Score each answer against the question and its retrieved sources.
    mini_scores = evaluate_answer(
        user_input, mini_text, mini_output.get("source_documents", [])
    )
    full_scores = evaluate_answer(
        user_input, full_text, full_output.get("source_documents", [])
    )

    # Both answers are shown together in a single bot message.
    combined = f"""
[⚡ gpt-4o-mini]
{mini_text}

---

[🎯 gpt-4o]
{full_text}
"""
    chat_history.extend([("user", user_input), ("bot", combined)])

    # Entries alternate user/bot, so consecutive pairs form one chat turn.
    turns = [
        (chat_history[idx][1], chat_history[idx + 1][1])
        for idx in range(0, len(chat_history), 2)
    ]

    score_labels = [str(mini_scores[pos]) for pos in range(3)]
    score_labels += [str(full_scores[pos]) for pos in range(3)]
    return (turns, chat_history, *score_labels)

# Gradio layout: question box and reset button, the chat window, and two columns
# of score labels (one per model).
with gr.Blocks() as demo:
    gr.Markdown("## 🏋️‍♂️ CrossFit Q&A RAG 챗봇 (모델 비교)")

    with gr.Row():
        msg = gr.Textbox(placeholder="질문 입력", scale=4)
        clear = gr.Button("초기화", scale=1)

    chatbot = gr.Chatbot(label="대화창")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### ⚡ gpt-4o-mini 점수")
            fast_rel = gr.Label(label="관련성")
            fast_con = gr.Label(label="일관성")
            fast_acc = gr.Label(label="정확성")
        with gr.Column():
            gr.Markdown("### 🎯 gpt-4o 점수")
            acc_rel = gr.Label(label="관련성")
            acc_con = gr.Label(label="일관성")
            acc_acc = gr.Label(label="정확성")

    # Flat (role, text) log kept across turns; respond() extends and returns it.
    history = gr.State([])

    # Inputs are passed positionally to respond(user_input, chat_history), so
    # exactly two components are listed. Passing the Chatbot value as a third
    # input made every submit fail with an argument-count TypeError.
    msg.submit(
        respond,
        [msg, history],
        [chatbot, history, fast_rel, fast_con, fast_acc, acc_rel, acc_con, acc_acc]
    )
    # Reset the chat window, the stored history and all six score labels.
    clear.click(lambda: ([], [], "", "", "", "", "", ""), None, [chatbot, history, fast_rel, fast_con, fast_acc, acc_rel, acc_con, acc_acc])

demo.launch(share=True)


총 질문 개수: 302
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 10.3 MB of archives.
After this operation, 34.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 fonts-nanum all 20200506-1 [10.3 MB]
Fetched 10.3 MB in 2s (6,487 kB/s)
Selecting previously unselected package fonts-nanum.
(Reading database ... 126374 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...




✅ 질문 목록 PDF 생성 완료: /content/crossfit_questions.pdf
✅ PDF 데이터 불러오기 완료: 303 개 문단
✅ 청킹 완료: 303 개 청크 생성


  vectorstore.persist()


✅ 벡터DB 구축 완료
✅ LangChain QA 체인 준비 완료


  chatbot = gr.Chatbot(label="대화창")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://578053d994bc8f8a01.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


