In [9]:
# OpenAI API: rank documents against a query by embedding cosine similarity



import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the API key from a local file so the secret never appears in the notebook.
key_file_path = "../../../../keys/chatgpt.txt"
# "utf-8-sig" drops a leading byte-order mark (added by some editors) that
# str.strip() would otherwise leave attached to the key; plain UTF-8/ASCII
# files are read exactly as before.
with open(key_file_path, 'r', encoding='utf-8-sig') as key_file:
    openai.api_key = key_file.read().strip()

# Fail fast with a clear message instead of a confusing authentication error
# on the first API call.
if not openai.api_key:
    raise ValueError(f"API key file is empty: {key_file_path}")


# Fetch an embedding vector through the OpenAI embeddings API (client-style call).
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` and return it.

    Args:
        text: Value forwarded unchanged as the ``input`` argument of
            ``openai.embeddings.create``.
        model: Name of the embedding model to use.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` collection (presumably a sequence of floats; callers in this
        notebook feed it to ``np.array``).
    """
    request_args = {"input": text, "model": model}
    result = openai.embeddings.create(**request_args)
    # Only one input is sent per call, so only the first entry is read.
    first_entry = result.data[0]
    return first_entry.embedding

# Cosine similarity between two single vectors.
def cosine_similarity_vectors(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2`` as a scalar.

    Each input is converted to a NumPy array and reshaped to a single row,
    because ``cosine_similarity`` operates on 2-D (samples x features) input.
    The only cell of the resulting 1x1 matrix is returned.
    """
    row_a = np.array(vec1).reshape(1, -1)
    row_b = np.array(vec2).reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]




In [10]:
# Define the document corpus to search and the query to rank it against
documents = [
    "Machine learning is a field of artificial intelligence.",
    "Artificial intelligence is transforming technology.",
    "Deep learning is a part of machine learning based on neural networks.",
    "Neural networks are a subset of machine learning algorithms."
]

query = "artificial intelligence and machine learning"



In [11]:
# Embed every document, in corpus order
document_embeddings = list(map(get_embedding, documents))

# Embed the query
query_embedding = get_embedding(query)

# Score each document against the query; similarities[i] belongs to documents[i]
similarities = []
for doc_embedding in document_embeddings:
    similarities.append(cosine_similarity_vectors(query_embedding, doc_embedding))

# Rank document indices from most to least similar and keep the best top_k
top_k = 3
ranked_indices = np.argsort(similarities)[::-1]
top_k_indices = ranked_indices[:top_k]

# Print the results (document numbers are 1-based)
print(f"Top-{top_k} most relevant documents for the query '{query}':\n")
for idx in top_k_indices:
    print(f"Document {idx + 1}: {documents[idx]}")
    print(f"Similarity: {similarities[idx]}\n")

Top-3 most relevant documents for the query 'artificial intelligence and machine learning':

Document 1: Machine learning is a field of artificial intelligence.
Similarity: 0.9137438749108253

Document 2: Artificial intelligence is transforming technology.
Similarity: 0.8977128558083001

Document 3: Deep learning is a part of machine learning based on neural networks.
Similarity: 0.8615561835698318

