# Naive RAG

### Çıktıyı kaynak dosya adı, kaynak URL, kosinüs benzerliği ve kaynak soru-cevaplarla birlikte veriyor.

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np
import asyncio

# Load the .env file so the API keys become available as environment variables
load_dotenv()

# API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): LANGCHAIN_API_KEY is read here but not referenced again in this cell
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Chat model definition
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)


#### INDEXING ####

# Root data directory: files under all category sub-folders are loaded from here
base_directory = "rag_data/website/organized_data"

# Use DirectoryLoader to pick up the .txt files from every sub-folder of the directory
loader = DirectoryLoader(base_directory, glob="**/*.txt", loader_cls=TextLoader)

# Load the documents
docs = loader.load()

# Embedding step: build a Chroma vector store from the loaded documents
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)

# Retriever definition
retriever = vectorstore.as_retriever()



#### RETRIEVAL and GENERATION ####

# Prompt: built with ChatPromptTemplate from a single human message.
# The template expects two variables: {question} and {context}.
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine similarity helper
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Args:
        vec1: First vector (any sequence accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero-length vector has no direction; dividing would give nan
        # (plus a RuntimeWarning), so report "no similarity" instead.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Document formatting helper
def format_docs(docs, query_embedding):
    """Render retrieved documents as one text block, one entry per source.

    Documents whose metadata has no truthy ``"source"`` are skipped, and only
    the first document seen for each source is kept. Each kept document is
    embedded and scored against ``query_embedding`` with cosine similarity.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.
        query_embedding: Embedding vector of the user query.

    Returns:
        The formatted entries joined by blank lines ("" when nothing is kept).
    """
    seen_sources = set()
    sections = []

    for document in docs:
        origin = document.metadata.get("source")
        if not origin or origin in seen_sources:
            continue
        seen_sources.add(origin)

        doc_vector = embedding.embed_query(document.page_content)
        score = cosine_similarity(query_embedding, doc_vector)

        body = document.page_content.strip()
        if not body:
            # Placeholder text used when the document has no content
            body = "Bu belge içeriği boş."

        sections.append(
            f"Source document: {origin}\n\nCosine Similarity: {score:.4f}\n\n{body}"
        )

    return "\n\n".join(sections)

# Chain definition
# The chain is invoked with a dict that already carries the "context" and
# "question" keys the prompt template expects, so the dict is fed to the
# prompt directly. Routing it through
# {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
# would hand the *entire* input dict to both prompt variables.
rag_chain = (
    custom_prompt
    | model
    | StrOutputParser()
)

# Return both the answer and the documents used for the query
async def retrieve_and_format_docs(question):
    """Retrieve documents for ``question`` and generate an answer from them.

    Args:
        question: The user question as a string.

    Returns:
        A tuple ``(answer, formatted_docs)`` where ``formatted_docs`` is the
        text produced by ``format_docs`` for the retrieved documents.
    """
    # Embed the query so each document can be scored against it
    query_embedding = embedding.embed_query(question)
    retrieved_docs = await retriever.ainvoke(question)

    # Format the content of the retrieved documents
    formatted_docs = format_docs(retrieved_docs, query_embedding)

    # Use the async entry point. Awaiting the synchronous `invoke` result
    # raises TypeError only after the model call has already completed, so a
    # try/except fallback would run (and bill) the whole chain a second time.
    answer = await rag_chain.ainvoke({"context": formatted_docs, "question": question})

    return answer, formatted_docs

# Async entry point, meant to be awaited on an existing event loop (e.g. in a Jupyter notebook)
async def main():
    """Ask one fixed sample question and print the answer and its sources."""
    question = "Magenta TV nedir?"
    result = await retrieve_and_format_docs(question)
    answer, source_docs = result
    print("Answer:", answer)
    print("\nSources:", source_docs, sep="\n")

# Run the async entry point on the current event loop (top-level await)
await main()

### Aslında chunking yapmıyorum; split işlemi ancak PDF dosyalarını işlerken gerekecek.

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter built through the tiktoken-based constructor
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

# Make splits
# `blog_docs` is not defined anywhere in this notebook (NameError); the
# documents loaded by the DirectoryLoader are held in `docs`.
splits = text_splitter.split_documents(docs)

# Advanced RAG

## Query Translation

## Multi-query

In [8]:
import os
import numpy as np
import asyncio
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

# Sample user question used throughout this cell
question = "Vertrag'imi kündigen yapmak istiyorum, ne yapmaliyim?"

# Load the .env file so the API keys become available as environment variables
load_dotenv()

# API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): LANGCHAIN_API_KEY is read here but not referenced again in this cell
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Chat model definition
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)


#### INDEXING ####

# Root data directory: files under all category sub-folders are loaded from here
base_directory = "rag_data/website/organized_data"

# Use DirectoryLoader to pick up the .txt files from every sub-folder of the directory
loader = DirectoryLoader(base_directory, glob="**/*.txt", loader_cls=TextLoader)

# Load the documents
docs = loader.load()

# Embedding step: build a Chroma vector store from the loaded documents
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)

# Retriever definition
retriever = vectorstore.as_retriever()

####### Multi Query ########

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Prompt -> chat model -> plain string -> list of alternative questions.
# Blank lines in the model output (e.g. questions separated by empty lines)
# are dropped so that no empty query is sent to the retriever.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)
   

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: A list of lists of documents, one inner list per query.

    Returns:
        A flat list of documents with duplicates removed, in first-seen order.
    """
    # Serialize each document to a string so it can be used as a hashable key
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the documents in an order that can change from run to run.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

# Retrieve: generate alternative queries, run the retriever on each of them,
# then merge the per-query results into one de-duplicated list
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# Note: this rebinds `docs`, which previously held the documents loaded from disk
docs = retrieval_chain.invoke({"question":question})
# Bare expression: its value is neither stored nor printed here
len(docs)
print(docs)


#################################

#### RETRIEVAL and GENERATION ####

# Prompt: built with ChatPromptTemplate from a single human message.
# The template expects two variables: {question} and {context}.
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine similarity helper
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Args:
        vec1: First vector (any sequence accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero-length vector has no direction; dividing would give nan
        # (plus a RuntimeWarning), so report "no similarity" instead.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Document formatting helper
def format_docs(docs, query_embedding):
    """Render retrieved documents as one text block, one entry per source.

    Documents whose metadata has no truthy ``"source"`` are skipped, and only
    the first document seen for each source is kept. Each kept document is
    embedded and scored against ``query_embedding`` with cosine similarity.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.
        query_embedding: Embedding vector of the user query.

    Returns:
        The formatted entries joined by blank lines ("" when nothing is kept).
    """
    seen_sources = set()
    sections = []

    for document in docs:
        origin = document.metadata.get("source")
        if not origin or origin in seen_sources:
            continue
        seen_sources.add(origin)

        doc_vector = embedding.embed_query(document.page_content)
        score = cosine_similarity(query_embedding, doc_vector)

        body = document.page_content.strip()
        if not body:
            # Placeholder text used when the document has no content
            body = "Bu belge içeriği boş."

        sections.append(
            f"Source document: {origin}\n\nCosine Similarity: {score:.4f}\n\n{body}"
        )

    return "\n\n".join(sections)

# Chain definition
# The first mapping produces exactly the two variables the prompt needs:
#   context  -> documents returned by the multi-query retrieval chain
#   question -> the "question" entry of the input dict
# It is piped straight into the prompt. A second
# {"context": RunnablePassthrough(), "question": RunnablePassthrough()} step
# would replace both values with the whole intermediate dict.
rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | custom_prompt
    | model
    | StrOutputParser()
)

# Return both the answer and the documents used for the query
async def retrieve_and_format_docs(question):
    """Retrieve documents for ``question`` and generate an answer.

    Args:
        question: The user question as a string.

    Returns:
        A tuple ``(answer, formatted_docs)`` where ``formatted_docs`` is the
        text produced by ``format_docs`` for the documents retrieved here.
    """
    # Embed the query so each document can be scored against it
    query_embedding = embedding.embed_query(question)
    retrieved_docs = await retriever.ainvoke(question)

    # Format the content of the retrieved documents
    formatted_docs = format_docs(retrieved_docs, query_embedding)

    # Use the async entry point. Awaiting the synchronous `invoke` result
    # raises TypeError only after the model call has already completed, so a
    # try/except fallback would run (and bill) the whole chain a second time.
    # NOTE(review): rag_chain in this cell derives its own context from
    # retrieval_chain, so the "context" value passed here appears not to be
    # what reaches the prompt — confirm the intended behaviour.
    answer = await rag_chain.ainvoke({"context": formatted_docs, "question": question})

    return answer, formatted_docs

# Async entry point, meant to be awaited on an existing event loop (e.g. in a Jupyter notebook)
async def main():
    """Answer the module-level ``question`` and print the answer and its sources."""
    result = await retrieve_and_format_docs(question)
    answer, source_docs = result
    print("Answer:", answer)
    print("\nSources:", source_docs, sep="\n")

# Run the async entry point on the current event loop (top-level await)
await main()

  print_generated_queries(question)


[Document(metadata={'source': 'rag_data/website/organized_data/Others/https_www_telekom_de_start.txt'}, page_content='Source URL: https://www.telekom.de/start\n\n1. Question: Ein Eis gefällig?\n   Answer: Verträge verwalten, Rechnung einsehen u.v.m.\xa0- einfach und kostenlos in\xa0der\xa0App.\n\n'), Document(metadata={'source': 'rag_data/website/organized_data/Vertrag & Rechnung/https_www_telekom_de_hilfe_vertrag_rechnung_vertrag_kuendigung_vertrag_kuendigen.txt'}, page_content='Source URL: https://www.telekom.de/hilfe/vertrag-rechnung/vertrag/kuendigung/vertrag-kuendigen\nTelekom > Hilfe & Service > Vertrag & Rechnung > Vertrag > Kündigung > Vertrag\n\n1. Question: Wie kann ich meinen Vertrag kündigen?\n   Answer: Sie können uns Ihren Kündigungswunsch über das Kundencenter, die MeinMagenta App, per Kontaktformular oder per Brief mitteilen.\nKündigung über das KundencenterMelden Sie sich imKundencenteran und wählen Sie in der Kachel "Verträge" den Vertrag aus, den Sie kündigen wollen.

### Alttaki son kod daha iyi çalışıyorsa üsttekini sil!

In [18]:
import os
import numpy as np
import asyncio
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

# Question definition
question = "Hangi modemleri kullanabilirim adsl baglantim icin?"

# Load the API keys
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# NOTE(review): LANGCHAIN_API_KEY is read here but not referenced again in this cell
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Initialise the chat model and the embedding model
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Load the documents and build the vector store
def initialize_vectorstore(directory):
    """Load every ``.txt`` file under ``directory`` and index it in Chroma.

    Args:
        directory: Root directory; sub-folders are searched via ``**/*.txt``.

    Returns:
        A tuple ``(vectorstore, docs)``: the Chroma store built with the
        module-level ``embedding`` and the list of loaded documents.
    """
    loaded = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader).load()
    return Chroma.from_documents(documents=loaded, embedding=embedding), loaded

vectorstore, docs = initialize_vectorstore("rag_data/website/organized_data")
retriever = vectorstore.as_retriever()

# Template for generating alternative questions
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Prompt -> chat model -> plain string -> list of alternative questions.
# Blank lines in the model output (e.g. questions separated by empty lines)
# are dropped so that no empty query is sent to the retriever.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

def get_unique_union(documents):
    """Return the de-duplicated union of several retrieval result lists.

    Args:
        documents: A list of lists of documents, one inner list per query.

    Returns:
        A flat list of documents with duplicates removed, in first-seen order.
    """
    # Serialize each document to a string so it can be used as a hashable key
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the documents in an order that can change from run to run.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

# Retrieve: generate alternative queries, run the retriever on each of them,
# then merge the per-query results into one de-duplicated list
retrieval_chain = generate_queries | retriever.map() | get_unique_union

# Note: this rebinds `docs`, which previously held the documents loaded from disk
docs = retrieval_chain.invoke({"question":question})

# Prompt definition: a single human message.
# The template expects two variables: {question} and {context}.
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine similarity helper
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Args:
        vec1: First vector (any sequence accepted by NumPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero-length vector has no direction; dividing would give nan
        # (plus a RuntimeWarning), so report "no similarity" instead.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Document formatting helper
def format_docs(docs, query_embedding):
    """Render retrieved documents as one text block, one entry per source.

    Documents whose metadata has no truthy ``"source"`` are skipped, and only
    the first document seen for each source is kept. Each kept document is
    embedded and scored against ``query_embedding`` with cosine similarity.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.
        query_embedding: Embedding vector of the user query.

    Returns:
        The formatted entries joined by blank lines ("" when nothing is kept).
    """
    seen_sources = set()
    sections = []

    for document in docs:
        origin = document.metadata.get("source")
        if not origin or origin in seen_sources:
            continue
        seen_sources.add(origin)

        doc_vector = embedding.embed_query(document.page_content)
        score = cosine_similarity(query_embedding, doc_vector)

        body = document.page_content.strip()
        if not body:
            # Placeholder text used when the document has no content
            body = "Bu belge içeriği boş."

        sections.append(
            f"Source document: {origin}\n\nCosine Similarity: {score:.4f}\n\n{body}"
        )

    return "\n\n".join(sections)

# Chain definition
# The leading mapping builds the two prompt variables from the input dict:
#   context  -> output of retrieval_chain (the multi-query retrieval result)
#   question -> the "question" entry of the input dict
# NOTE(review): a "context" key supplied by the caller appears to be unused,
# because the context is always produced by retrieval_chain — confirm intent.
rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")} 
    | custom_prompt
    | model
    | StrOutputParser()
)

async def retrieve_and_format_docs(question):
    """Format the retrieved documents and generate an answer for ``question``.

    Args:
        question: The user question as a string.

    Returns:
        A tuple ``(answer, formatted_docs)`` where ``formatted_docs`` is the
        text produced by ``format_docs``.
    """
    query_embedding = embedding.embed_query(question)
    # NOTE(review): this formats the module-level `docs` (the result of the
    # earlier retrieval_chain call) instead of retrieving for the `question`
    # argument — confirm that reusing that result is intended.
    formatted_docs = format_docs(docs, query_embedding)

    # Use the async entry point. Awaiting the synchronous `invoke` result
    # raises TypeError only after the model call has already completed, so a
    # try/except fallback would run (and bill) the whole chain a second time.
    answer = await rag_chain.ainvoke({"context": formatted_docs, "question": question})

    return answer, formatted_docs

# Async helper that prints the generated questions
async def print_generated_queries(question):
    """Generate the alternative questions for ``question`` and print them numbered from 1."""
    # The chain is called synchronously here, so nothing is awaited
    alternatives = generate_queries.invoke({"question": question})
    print("Generated Questions:")
    for number, text in enumerate(alternatives, start=1):
        print(f"{number}: {text}")

# Main function
async def main():
    """Print the generated queries, then the answer and sources for the module-level ``question``."""
    await print_generated_queries(question)
    result = await retrieve_and_format_docs(question)
    answer, source_docs = result
    print("Answer:", answer)
    print("\nSources:", source_docs, sep="\n")

# Run the async entry point on the current event loop (top-level await)
await main()

Generated Questions:
1: 1. Modemler arasında ADSL bağlantım için hangi seçenekleri kullanabilirim?
2: 2. ADSL bağlantısı için hangi modem markaları tercih edilebilir?
3: 3. ADSL bağlantılar için uygun modem seçenekleri nelerdir?
4: 4. ADSL bağlantımı destekleyen modem modelleri hangileridir?
5: 5. ADSL bağlantılar için modem seçerken nelere dikkat etmeliyim?
Answer: ADSL bağlantınız için şu modemleri kullanabilirsiniz: Speedport W 723V, Speedport Entry 2, NEO, W 921V, Speedport W 724V, W 925 Fiber, W 922V, ve Speedport Smart-Serie (Smart 2, 3, 4 ve 4 Plus), W 925V, Pro, Pro Plus. Bu modemlerin hepsi VDSL ve Glasfaser (Fiber) bağlantıları destekler. Ancak, hangi modemlerin tam olarak sizin ADSL hızınıza uygun olduğunu kontrol etmek için daha fazla ayrıntıya ihtiyaç duyarız.

Sources:
Source document: rag_data/website/organized_data/Geräte & Zubehör/https_www_telekom_de_hilfe_geraete_zubehoer_router_speedport_vdsl_fiber_router.txt

Cosine Similarity: 0.7964

Source URL: https://www.telek

### Part 6: RAG Fusion'da kaldım