# Naive RAG

### kaynak dosya adi, kaynak url, cos similarity ve kaynak soru cevaplarla veriyor ciktiyi..

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np
import asyncio

# .env dosyasını yükleyerek API anahtarlarını getir
load_dotenv()

# API Anahtarları
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Model Tanımlaması
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)


#### INDEXING ####

# Ana veri dizini: Tüm kategoriler altındaki dosyaları yükleme
base_directory = "rag_data/website/organized_data"

# DirectoryLoader ile belirtilen dizindeki tüm alt klasörlerden txt dosyalarını yükle
loader = DirectoryLoader(base_directory, glob="**/*.txt", loader_cls=TextLoader)

# Belgeleri yükle
docs = loader.load()

# Embedding işlemi
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)

# Retriever tanımlaması
retriever = vectorstore.as_retriever()



#### RETRIEVAL and GENERATION ####

# Prompt: ChatPromptTemplate kullanılarak prompt hazırlanır
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine Similarity hesaplama fonksiyonu
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Belgeleri formatlama fonksiyonu
def format_docs(docs, query_embedding):
    unique_sources = set()  # Benzersiz kaynakları saklamak için bir set
    formatted_docs = []
    
    for doc in docs:
        source = doc.metadata.get("source")  # Kaynağı metadata'dan al
        if source and source not in unique_sources:
            unique_sources.add(source)
            document_embedding = embedding.embed_query(doc.page_content)  # Embedding hesapla
            similarity = cosine_similarity(query_embedding, document_embedding)  # Cosine similarity hesapla
            content = doc.page_content.strip() or "Bu belge içeriği boş."  # Belge içeriği
            formatted_docs.append(
                f"Source document: {source}\n\nCosine Similarity: {similarity:.4f}\n\n{content}"
            )
    
    return "\n\n".join(formatted_docs)

# Chain tanımlaması
rag_chain = (
    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    | custom_prompt
    | model
    | StrOutputParser()
)

# Sorgu için kullanılan belgeleri ve cevabı döndüren fonksiyon
async def retrieve_and_format_docs(question):
    # Belgeleri sorgu için al
    query_embedding = embedding.embed_query(question)  # Sorgu için embedding al
    retrieved_docs = await retriever.ainvoke(question)
    
    # Belgelerin içeriğini formatla
    formatted_docs = format_docs(retrieved_docs, query_embedding)
    
    # Cevabı al
    try:
        answer = await rag_chain.invoke({"context": formatted_docs, "question": question})
    except TypeError:
        # Eğer TypeError alınırsa, invoke çağrısının senkron olduğu anlamına gelebilir
        answer = rag_chain.invoke({"context": formatted_docs, "question": question})
    
    return answer, formatted_docs

# Jupyter Notebook gibi bir ortamda, mevcut olay döngüsünü kullanarak asenkron işlevleri çağırma
async def main():
    question = "Magenta TV nedir?"
    answer, source_docs = await retrieve_and_format_docs(question)
    print("Answer:", answer)
    print("\nSources:")
    print(source_docs)

# Mevcut olay döngüsünü kullanarak asenkron işlevleri çalıştırma
await main()

# Advanced RAG

## Query Translation

## Multi-query

In [None]:
import os
import numpy as np
import asyncio
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

question = "Vertrag'imi kündigen yapmak istiyorum, ne yapmaliyim?"

# .env dosyasını yükleyerek API anahtarlarını getir
load_dotenv()

# API Anahtarları
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Model Tanımlaması
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)


#### INDEXING ####

# Ana veri dizini: Tüm kategoriler altındaki dosyaları yükleme
base_directory = "rag_data/website/organized_data"

# DirectoryLoader ile belirtilen dizindeki tüm alt klasörlerden txt dosyalarını yükle
loader = DirectoryLoader(base_directory, glob="**/*.txt", loader_cls=TextLoader)

# Belgeleri yükle
docs = loader.load()

# Embedding işlemi
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)

# Retriever tanımlaması
retriever = vectorstore.as_retriever()

####### Multi Query ########

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)
   

def get_unique_union(documents: list[list]):
    """ Unique union of retrieved docs """
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]

# Retrieve
retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question":question})
len(docs)
print(docs)


#################################

#### RETRIEVAL and GENERATION ####

# Prompt: ChatPromptTemplate kullanılarak prompt hazırlanır
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine Similarity hesaplama fonksiyonu
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Belgeleri formatlama fonksiyonu
def format_docs(docs, query_embedding):
    unique_sources = set()  # Benzersiz kaynakları saklamak için bir set
    formatted_docs = []
    
    for doc in docs:
        source = doc.metadata.get("source")  # Kaynağı metadata'dan al
        if source and source not in unique_sources:
            unique_sources.add(source)
            document_embedding = embedding.embed_query(doc.page_content)  # Embedding hesapla
            similarity = cosine_similarity(query_embedding, document_embedding)  # Cosine similarity hesapla
            content = doc.page_content.strip() or "Bu belge içeriği boş."  # Belge içeriği
            formatted_docs.append(
                f"Source document: {source}\n\nCosine Similarity: {similarity:.4f}\n\n{content}"
            )
    
    return "\n\n".join(formatted_docs)

# Chain tanımlaması
rag_chain = (
    {"context": retrieval_chain, 
     "question": itemgetter("question")} 
    |
    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    | custom_prompt
    | model
    | StrOutputParser()
)

# Sorgu için kullanılan belgeleri ve cevabı döndüren fonksiyon
async def retrieve_and_format_docs(question):
    # Belgeleri sorgu için al
    query_embedding = embedding.embed_query(question)  # Sorgu için embedding al
    retrieved_docs = await retriever.ainvoke(question)
    
    # Belgelerin içeriğini formatla
    formatted_docs = format_docs(retrieved_docs, query_embedding)
    
    # Cevabı al
    try:
        answer = await rag_chain.invoke({"context": formatted_docs, "question": question})
    except TypeError:
        # Eğer TypeError alınırsa, invoke çağrısının senkron olduğu anlamına gelebilir
        answer = rag_chain.invoke({"context": formatted_docs, "question": question})
    
    return answer, formatted_docs

# Jupyter Notebook gibi bir ortamda, mevcut olay döngüsünü kullanarak asenkron işlevleri çağırma
async def main():
    answer, source_docs = await retrieve_and_format_docs(question)
    print("Answer:", answer)
    print("\nSources:")
    print(source_docs)

# Mevcut olay döngüsünü kullanarak asenkron işlevleri çalıştırma
await main()

### Alttaki son kod daha iyi calisiyorsa üsttekini sil!

In [None]:
import os
import numpy as np
import asyncio
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

# Soru Tanımlaması
question = "Vertrag'imi kündigen yapmak istiyorum, ne yapmaliyim?"

# API Anahtarlarını Yükle
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Modeli ve Embedding'i Başlat
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Belgeleri Yükle ve Embedding Oluştur
def initialize_vectorstore(directory):
    loader = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)
    return vectorstore, docs

vectorstore, docs = initialize_vectorstore("rag_data/website/organized_data")
retriever = vectorstore.as_retriever()

# Alternatif Sorular İçin Şablon
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives 
    | ChatOpenAI(temperature=0) 
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

def get_unique_union(documents):
    """ Unique union of retrieved docs """
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]

# Retrieve
retrieval_chain = generate_queries | retriever.map() | get_unique_union

multi_query_docs = retrieval_chain.invoke({"question":question})

# Prompt Tanımlaması
custom_prompt = ChatPromptTemplate.from_messages([
    HumanMessagePromptTemplate.from_template(
        """
        You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers.
        Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        Use three sentences maximum and keep the answer concise.
        Question: {question}
        Context: {context}
        Answer:
        """
    )
])

# Cosine Similarity Hesaplama Fonksiyonu
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# Belgeleri Formatlama
def format_docs(docs, query_embedding):
    unique_sources = set()
    formatted_docs = []
    
    for doc in docs:
        source = doc.metadata.get("source")
        if source and source not in unique_sources:
            unique_sources.add(source)
            document_embedding = embedding.embed_query(doc.page_content)
            similarity = cosine_similarity(query_embedding, document_embedding)
            content = doc.page_content.strip() or "Bu belge içeriği boş."
            formatted_docs.append(
                f"Source document: {source}\n\nCosine Similarity: {similarity:.4f}\n\n{content}"
            )
    
    return "\n\n".join(formatted_docs)

# Chain Tanımlaması
rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")} 
    | custom_prompt
    | model
    | StrOutputParser()
)

async def retrieve_and_format_docs(question):
    query_embedding = embedding.embed_query(question)
    #retrieved_docs = await retriever.ainvoke(question)
    formatted_docs = format_docs(multi_query_docs, query_embedding)
    
    try:
        answer = await rag_chain.invoke({"context": formatted_docs, "question": question})
    except TypeError:
        answer = rag_chain.invoke({"context": formatted_docs, "question": question})
    
    return answer, formatted_docs

# Asenkron Sorular Fonksiyonu
async def print_generated_queries(question):
    queries = generate_queries.invoke({"question": question})  # Burada await kullanmaya gerek yok
    print("Generated Questions:")
    for i, q in enumerate(queries):
        print(f"{i+1}: {q}")

# Ana Fonksiyon
async def main():
    await print_generated_queries(question)
    answer, source_docs = await retrieve_and_format_docs(question)
    print("Answer:", answer)
    print("\nSources:")
    print(source_docs)

await main()

## RAG-Fusion

### Iyi calisiyor gibi ama 4 soru generate ederken bazen dil sorunu olabiliyor türkce girince, ing ve alm olm
### Problem: cevapta mevcut data'da bulamayince mevcut belgelerde bulunamadi gibi dönüyor, bunu prompt a ekle oyle coz, salak salak konusmasin.

In [None]:
import os
import numpy as np
import asyncio
import logging
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

# Configure logging to suppress INFO logs from HTTP requests
logging.basicConfig(level=logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)

# Soru Tanımlaması
question = "esim nedir?"

# API Anahtarlarını Yükle
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Modeli ve Embedding'i Başlat
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Belgeleri Yükle ve Embedding Oluştur
def initialize_vectorstore(directory):
    loader = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)
    return vectorstore, docs

vectorstore, docs = initialize_vectorstore("rag_data/website/organized_data")
retriever = vectorstore.as_retriever()

############ RAG-Fusion #############

# RAG-Fusion: Related
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

# Generate four queries
generate_queries = (
    prompt_rag_fusion 
    | ChatOpenAI(temperature=0)
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

# Function for reciprocal rank fusion
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal rank fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

# Chain for the retrieval process
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
fusion_docs = retrieval_chain_rag_fusion.invoke({"question": question})

# Cosine Similarity Calculation Function
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if (norm_vec1 and norm_vec2) else 0.0

# Function to get embeddings for a document's content
async def get_document_embeddings(doc):
    return embedding.embed_query(doc.page_content)

# Function to format fusion_docs as a readable string with similarity scores
async def format_fusion_docs_with_similarity(fusion_docs):
    formatted_docs = []
    question_embedding = embedding.embed_query(question)
    
    for doc, score in fusion_docs:
        doc_embedding = await get_document_embeddings(doc)
        similarity = cosine_similarity(question_embedding, doc_embedding)
        source = doc.metadata.get("source", "No source")
        content = doc.page_content
        formatted_docs.append(f"Source: {source}\nFusion Score: {score:.4f}\nCosine Similarity: {similarity:.4f}\nContent: {content}\n")
    
    return "\n".join(formatted_docs)

####################

# Prompt Definition
telekom_template = """You are a Telekom-Hilfe assistant for question-answering tasks, providing answers to Telekom customers or potential customers. 
Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt_telekom = ChatPromptTemplate.from_template(telekom_template)

# Chain Definition
rag_chain = (
    {"context": retrieval_chain_rag_fusion, "question": itemgetter("question")} 
    | prompt_telekom
    | model
    | StrOutputParser()
)

async def retrieve_and_format_docs(question):
    formatted_docs = await format_fusion_docs_with_similarity(fusion_docs)
    
    try:
        # Attempt to get the answer asynchronously
        answer = await rag_chain.invoke({"context": formatted_docs, "question": question})
    except TypeError:
        # Fallback to synchronous invocation if asynchronous fails
        answer = rag_chain.invoke({"context": formatted_docs, "question": question})
    
    return answer, formatted_docs

# Asynchronous function to print generated queries
async def print_generated_queries(question):
    queries = generate_queries.invoke({"question": question})
    print("Generated Questions:")
    for i, q in enumerate(queries):
        print(f"{i+1}: {q}")

# Main Function
async def main():
    await print_generated_queries(question)
    answer, formatted_docs = await retrieve_and_format_docs(question)
    print("Answer:", answer)
    print("\nSources:")
    print(formatted_docs)  # Print the formatted version of fusion_docs with similarity scores

# Run the main function
await main()

# !!Decomposition
## Calismadi olmadi maalesef, asnwer sadece 3. sorunun cevabini veriyor, stratch den baska kaynakalara bakip cözüm bulmak lazim.

In [73]:
import os
import numpy as np
import asyncio
import logging
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.load import dumps, loads
from operator import itemgetter

question = "Was ist Mobilfunk?"

# Configure logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)

# Load API keys
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")

# Initialize model and embeddings
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

def initialize_vectorstore(directory):
    loader = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)
    return vectorstore, docs

vectorstore, docs = initialize_vectorstore("rag_data/website/organized_data")
retriever = vectorstore.as_retriever()

# Define prompts and chains
template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answered in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)

# Chain
generate_queries_decomposition = ( prompt_decomposition | model | StrOutputParser() | (lambda x: x.split("\n")))

# Run
questions = generate_queries_decomposition.invoke({"question":question})

# Answer recursion
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""
decomposition_prompt = ChatPromptTemplate.from_template(template)

def format_qa_pair(question, answer):
    """Format Q and A pair"""
    
    formatted_string = ""
    formatted_string += f"Question: {question}\nAnswer: {answer}\n\n"
    return formatted_string.strip()


q_a_pairs = ""
for q in questions:
    
    rag_chain = (
    {"context": itemgetter("question") | retriever, 
     "question": itemgetter("question"),
     "q_a_pairs": itemgetter("q_a_pairs")} 
    | decomposition_prompt
    | model
    | StrOutputParser())

    
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n"+  q_a_pair
    answer = rag_chain.invoke({"question":q,"q_a_pairs":q_a_pairs})

2024-08-31 19:03:30,511 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-31 19:03:33,289 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-31 19:03:36,762 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-31 19:03:37,067 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-31 19:03:41,522 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-31 19:03:41,878 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-31 19:03:50,274 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-08-31 19:03:50,654 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-08-31 19:03:59,931 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [74]:
questions

['1. "Definition of Mobilfunk"',
 '2. "History of Mobilfunk"',
 '3. "How does Mobilfunk work?"']

In [75]:
answer

"Mobilfunk, or mobile communication, works by transmitting signals between mobile devices and network towers. It supports various types of communication such as voice calls, text messaging, video calls, and data transfer. The communication process happens through a mobile network which is divided into cells, each covered by a base station. When a mobile device makes a call or sends a message, the information is sent to the nearest base station, which then transmits the information to the recipient's nearest base station and finally to their device. \n\nFor data transfer, when a user browses the internet or uses an app, the request is sent to the base station, then to the mobile network's servers, which retrieve the data from the internet and send it back to the device through the base station. \n\nThe ability to move seamlessly between base stations while maintaining the call or data session is what gives Mobilfunk its mobility. However, the effectiveness of Mobilfunk can be influenced

## Step Back
### cosine similarity eksik sadece calisiyor suan.

In [3]:
import os
import logging
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai.embeddings import OpenAIEmbeddings

question = "Glasfaser baglantisina sahibim, bilmeme gereken en önemli noktalar nelerdir?"

# Configure logging
logging.basicConfig(level=logging.WARNING)
logging.getLogger('openai').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)

# Load API keys
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize model and embeddings
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

def initialize_vectorstore(directory):
    loader = DirectoryLoader(directory, glob="**/*.txt", loader_cls=TextLoader)
    docs = loader.load()
    vectorstore = Chroma.from_documents(documents=docs, embedding=embedding)
    return vectorstore

# Initialize vector store and retriever
vectorstore = initialize_vectorstore("rag_data/website/organized_data")
retriever = vectorstore.as_retriever()

# Few Shot Examples
examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]

# Transform examples into example messages
example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples=examples,
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",
        ),
        few_shot_prompt,
        ("user", "{question}"),
    ]
)

# Generate step-back queries
generate_queries_step_back = prompt | model | StrOutputParser()
step_back_question = generate_queries_step_back.invoke({"question": question})

print(f"Original Question: {question}")
print(f"Step-Back Question: {step_back_question}")

# Response prompt template
response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# Normal Context:
{normal_context}

# Step-Back Context:
{step_back_context}

# Original Question: {question}

# Answer:
"""
response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

def get_retrieved_content(retrieved_documents):
    """Format retrieved documents as a string with source information."""
    seen_sources = set()
    content_list = []
    for doc in retrieved_documents:
        source = doc.metadata.get('source', 'Unknown')
        if source not in seen_sources:
            seen_sources.add(source)
            content = (
                f"Source: {source}\n"
                f"Content:\n{doc.page_content}\n"
                "------------------------------\n"
            )
            content_list.append(content)
    return "\n".join(content_list)

def format_retrieved_context(query):
    """Retrieve and format context for the given query."""
    # Retrieve documents using the 'invoke' method
    retrieved_docs = retriever.invoke(query)
    return get_retrieved_content(retrieved_docs)

# Construct the chain to retrieve and generate the response
chain = (
    {
        "normal_context": lambda x: format_retrieved_context(x["question"]),
        "step_back_context": lambda x: format_retrieved_context(x["step_back_question"]),
        "question": lambda x: x["question"],
    }
    | response_prompt
    | model
    | StrOutputParser()
)

# Execute the chain
result = chain.invoke({"question": question, "step_back_question": step_back_question})

# Display the final response
print("\nNormal Context:\n", format_retrieved_context(question))
print("\nStep-Back Context:\n", format_retrieved_context(step_back_question))
print("\nFinal Answer:\n", result)

Original Question: Glasfaser baglantisina sahibim, bilmeme gereken en önemli noktalar nelerdir?
Step-Back Question: Glasfaser bağlantısı hakkında genel bilgiye sahip olmam gerekiyor mu?

Normal Context:
 Source: rag_data/website/organized_data/Others/https_www_telekom_de_netz_glasfaser_neubauprojekte.txt
Content:
Source URL: https://www.telekom.de/netz/glasfaser/neubauprojekte

Question: Warum Telekom Glasfaser für Ihr privates Eigentum?
Answer: Mit einem Telekom Glasfaser-Anschluss sind Sie nicht an uns gebunden und können auch Produkte von anderen Anbietern nutzen.
Ein Glasfaser-Anschluss ist der neue Standard für die digitale Versorgung und steigert schon heute den Wert Ihrer Immobilie.
Profitieren Sie von der Erfahrung und Zuverlässigkeit der Telekom als Partner. Wir stehen Ihnen jederzeit zur Seite und sorgen für einen reibungslosen Ablauf.

Question: Worüber möchten Sie sich informieren?
Answer: Wer baut, muss rechtzeitig planen. In allen Fragen zum modernen Hausanschluss berät u