In [None]:
!pip install transformers==4.28.0 sentence-transformers==2.2.2 faiss-cpu datasets==2.13.0

In [1]:
import pandas as pd
import numpy as np
from datasets import load_dataset

# 예시: AG News 데이터셋 (4개 카테고리의 뉴스 제목+본문)
dataset = load_dataset("ag_news", split="train[:1000]")  # 샘플로 1000개만 사용
dataset = dataset.to_pandas()  # Pandas로 변환
dataset.head(3)

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2


In [5]:
dataset = dataset.drop_duplicates()
dataset.shape

(1000, 2)

In [12]:
import torch

In [46]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

GENERATOR_MODEL_NAME = "facebook/bart-large"
# GENERATOR_MODEL_NAME = "EleutherAI/gpt-neo-1.3B"

generator_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_NAME)
generator_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_MODEL_NAME)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [47]:
generator_model.to(torch.device("cuda"))

AssertionError: Torch not compiled with CUDA enabled

In [7]:
from sentence_transformers import SentenceTransformer

QUERY_ENCODER_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
query_encoder = SentenceTransformer(QUERY_ENCODER_MODEL_NAME)


In [8]:
def encode_query(query: str):
    # SentenceTransformers 모델로 임베딩 추출
    embedding = query_encoder.encode([query], convert_to_numpy=True)
    # shape: (384, )
    return embedding[0]  


In [11]:
generator_model.device

device(type='cpu')

In [14]:
tensor = encode_query("안녕하세요! 내일 날씨가 어떻습니까?")
tensor.shape

(384,)

## 4. Document Indexing 코드 (FAISS 사용)

In [16]:
documents = dataset["text"].tolist()  # list of strings
doc_embeddings = query_encoder.encode(documents, convert_to_numpy=True)  # shape: (num_docs, dim)
normalized_embeddings = doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)

In [17]:
import faiss
import numpy as np

dim = normalized_embeddings.shape[1]  # 임베딩 차원
index = faiss.IndexFlatL2(dim)  # L2 거리기반
index.add(normalized_embeddings)       # 문서 임베딩 등록

In [18]:
index.ntotal

1000

## 5. Retriever 코드
Retrieval 함수는 쿼리(텍스트)를 입력받아
1. 쿼리 임베딩을 구하고
2. FAISS index에서 유사 문서 Top-K를 찾고
3. 해당 문서들의 텍스트를 반환한다.

In [19]:
def retrieve_top_k_docs(query: str, k=3):
    q_emb = encode_query(query).reshape(1, -1)  # (1, dim)
    distances, indices = index.search(q_emb, k) # (1, k) shape
    # indices: (1, k) 형태, 실제 문서 인덱스
    top_k_docs = [documents[i] for i in indices[0]]
    return top_k_docs, distances[0]


In [20]:
query_example = "Find news about microsoft"
top_docs, dists = retrieve_top_k_docs(query_example, k=5)
for i, (doc, dist) in enumerate(zip(top_docs, dists)):
    print(f"Top {i+1} doc (dist={dist:.4f}):\n{doc}\n")


Top 1 doc (dist=0.7838):
Taking the Microsoft Rorschach test CNET News.com's Charles Cooper asks what it is about Microsoft that pushes so many people straight over the edge?

Top 2 doc (dist=0.8133):
Microsoft to Introduce Cheaper Version of Windows  SEATTLE (Reuters) - Microsoft Corp. &lt;MSFT.O&gt; said it will  begin selling a stripped-down, low-cost version of its Windows  XP operating system in the emerging markets of Indonesia,  Malaysia and Thailand in order to spread the use of computing  and develop technology markets.

Top 3 doc (dist=0.9460):
Microsoft Lists XP SP2 Problems (NewsFactor) NewsFactor - With automatic download of Microsoft's (Nasdaq: MSFT) enormous SP2 security patch to the Windows XP operating system set to begin, the industry still waits to understand its ramifications. Home users that have their preferences set to receive operating-system updates as they are made available by Microsoft may be surprised to learn that some of the software they already run on t

In [36]:
i = 0
for ch1, ch2 in zip(top_docs[0], top_docs[1]):
    if ch1 != ch2:
        print(i, ch1, ch2)
    i = i+1

182 t n
183 h e
184 i x
185 s t


## Generator 코드

In [52]:
def rag_generate_answer(query: str, k=3, max_length=128):
    # 1) Retrieval
    top_docs, _ = retrieve_top_k_docs(query, k)
    
    context_str = ""
    # 2) Context 만들기 (단순 예시)
    for idx, doc in enumerate(top_docs):
        context_str = context_str + f"\n[news no.{idx}]\n{doc}"
    combined_prompt = f"Question: {query}\nContext: {context_str}\nresult:"
    print(combined_prompt)
    
    # 3) Generator로 답 생성
    inputs = generator_tokenizer([combined_prompt], return_tensors="pt", truncation=True)
    # GPU 사용 가능 시 -> inputs = inputs.to('cuda'), model도 .to('cuda') 가능
    with torch.no_grad():
        outputs = generator_model.generate(
            **inputs, 
            max_length=max_length, 
            num_beams=2,
            early_stopping=False,
        )
    answer = generator_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer


In [53]:
# 예시 쿼리
query_sample = "when does google's auction open?"
answer_output = rag_generate_answer(query_sample, k=3)
print("=== RAG-style Generation ===")
print("Query:", query_sample)
print("Answer:", answer_output)

Question: when does google's auction open?
Context: 
[news no.0]
Google auction begins on Friday An auction of shares in Google, the web search engine which could be floated for as much as \$36bn, takes place on Friday.
[news no.1]
Google IPO Bidding Opens Google IPO Bidding Opens\\Google's IPO bidding is officially open. Google and its underwriters expect to open the auction for the shares of Google rsquo;s Class A common stock at 9:00 a.m. EST (press time) on Friday, August 13, 2004. Google bidders must have obtained a bidder ID from ipo.google.com if you ...
[news no.2]
In Google's Auction, It's Not Easy to Tell a Bid From a Bet In a competition combining suspense and strategy, countless brave souls are hoping to buy a small piece of Google in an auction this week.
result:
=== RAG-style Generation ===
Query: when does google's auction open?
Answer: Question: when does google's auction open?=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-==-=-=-=-=-=-=-=-=-=-=-=-==-