In [None]:
!pip install pandas openpyxl langchain openai faiss-cpu
!pip install -U langchain-community
!pip install sentence_transformers langchain faiss-cpu pypdf

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


# Define file paths
# Directory of the persisted FAISS index: get_or_create_vector_store loads a saved
# index from here, or saves a newly built one here.
# NOTE(review): the /content/drive/... prefix looks like a mounted Google Drive in
# Colab — confirm the drive is mounted before running.
vector_db_path = "/content/drive/MyDrive/RAG_Folder/faiss_index"

# Load the RAG file (this should contain your gender-neutral pairs and counterfactual examples)
def load_rag_file(file_path):
    """Read the PDF at ``file_path`` with ``PyPDFLoader`` and return its documents.

    Args:
        file_path: Location of the PDF holding the pairs and examples.

    Returns:
        Whatever ``PyPDFLoader(file_path).load()`` returns.
    """
    return PyPDFLoader(file_path).load()

# Create FAISS vector store
def create_vector_store(documents):
    """Build a FAISS vector store from ``documents``.

    The documents are embedded with the
    ``sentence-transformers/all-mpnet-base-v2`` model; the same model name is
    used by ``get_or_create_vector_store`` when it reloads a saved index.

    Args:
        documents: Documents to index, passed straight to ``FAISS.from_documents``.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    embedding_model = SentenceTransformerEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    return FAISS.from_documents(documents, embedding_model)

# Load or create FAISS index
def get_or_create_vector_store(rag_file):
    """Return the FAISS vector store, reusing the index saved on disk when there is one.

    The index lives in the module-level ``vector_db_path`` directory. A saved
    index is only considered present when both files written by
    ``save_local`` (``index.faiss`` and ``index.pkl``) exist, so an empty or
    partially written directory triggers a rebuild instead of a failure
    inside ``FAISS.load_local``.

    Args:
        rag_file: Path of the PDF used to build the index when none is saved.
            It is not read when a saved index is loaded.

    Returns:
        The loaded or newly created FAISS vector store.
    """
    index_files = [
        os.path.join(vector_db_path, file_name)
        for file_name in ("index.faiss", "index.pkl")
    ]
    if all(os.path.isfile(path) for path in index_files):
        print("📂 Loading existing FAISS DB...")
        # Use the same embedding model for creation and loading
        # NOTE: load_local unpickles index.pkl, which can execute arbitrary code;
        # only point vector_db_path at an index you created yourself.
        vectorstore = FAISS.load_local(
            vector_db_path,
            SentenceTransformerEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
            allow_dangerous_deserialization=True
        )
    else:
        print("🔄 Creating new FAISS DB...")
        documents = load_rag_file(rag_file)
        vectorstore = create_vector_store(documents)
        vectorstore.save_local(vector_db_path)
        print("✅ FAISS DB saved!")
    return vectorstore

# Revised chain-of-thought (CoT) prompt template that instructs the model to use
# the gendered-to-inclusive pairs found in the context.
# It declares two placeholders, {context} and {question}, and is handed to the
# RetrievalQA chain in generate_debiased_output via chain_type_kwargs["prompt"].
# NOTE(review): the chain is presumably what fills {context} with the retrieved
# passages and {question} with the user query — confirm against the LangChain docs.
# NOTE(review): step 5 is numbered "5." while steps 1-4 use keycap emoji, and its
# wording ("not a complete the sentence") looks like a typo. Left untouched here
# because editing the template text changes the prompt.
debiasing_prompt_template = PromptTemplate(
    input_variables=["question", "context"],
    template="""
### Instruction:
You are an AI assistant designed to provide inclusive and unbiased responses.
 You have access to a document containing gender-neutral alternatives and counterfactual examples.
  Your task is to generate an inclusive response for the given query by using gender-neutral and inclusive language. Use the provided examples as a reference. For each user query, follow these steps:

1️⃣ Identify if the query contains any gendered or biased terms.
2️⃣ Consult the provided pairs and examples in the context to determine the correct inclusive alternative.
3️⃣ Replace only the terms for which a mapping exists; if a term isn’t found in the mapping, leave it unchanged.
4️⃣ Complete the sentence (or transform the query) so that it uses inclusive, neutral language and the appropriate counterfactual modifications.
5. If it is not a complete the sentence, Provide a response that aligns with inclusive communication principles.

### Context (Gendered-to-Inclusive Pairs and Counterfactual Examples):
{context}

### User Query:
{question}

### Step-by-Step Answer:
- Identify biased terms.
- Look up and apply the corresponding inclusive term from the context.
- Return the final, debiased answer.

### Final Answer:
"""
)

def generate_debiased_output(rag_file, prompts):
    """Run every prompt through the retrieval-augmented debiasing chain.

    The Hugging Face token is read from the ``HUGGINGFACEHUB_API_TOKEN``
    environment variable. It is never hard-coded: when the variable is unset
    or empty the user is asked for it once via ``getpass``, and an existing
    value is left untouched.

    Args:
        rag_file: Path of the PDF with the pairs and examples; forwarded to
            ``get_or_create_vector_store``.
        prompts: Iterable of query strings.

    Returns:
        A list with one dict per prompt, in input order, shaped as
        ``{"query": prompt, "response": answer, "context": [{"text": ..., "source": ...}, ...]}``
        where ``context`` lists the source documents returned by the chain.
    """
    # Reuse a token that is already exported; only prompt when it is missing so a
    # real credential never has to be written into the notebook.
    if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
        from getpass import getpass

        os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

    vectorstore = get_or_create_vector_store(rag_file)

    llm = HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.1",
        model_kwargs={"temperature": 0.7, "max_length": 512}
    )

    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})  # Retrieve top 5 relevant examples
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": debiasing_prompt_template},
        return_source_documents=True  # Needed to report the retrieved passages below
    )

    results = []
    for prompt in prompts:
        # `invoke` replaces the deprecated direct call `qa_chain({...})`; the result
        # carries both the answer and the source documents.
        chain_result = qa_chain.invoke({"query": prompt})

        results.append({
            "query": prompt,
            "response": chain_result['result'],
            "context": [
                {
                    "text": doc.page_content,
                    # Fall back to 'Unknown' when the document has no source metadata
                    "source": doc.metadata.get('source', 'Unknown'),
                }
                for doc in chain_result['source_documents']
            ]
        })

    return results

# Example usage
if __name__ == "__main__":
    # Update with your actual file path
    rag_file_path = "RAG Pairs.pdf"
    test_prompts = [
        "Sister Should work as a __________",
        "Brother should work as a __________________",
    ]

    results = generate_debiased_output(rag_file_path, test_prompts)

    # Report each prompt with its answer and the passages that were retrieved for it
    for number, result in enumerate(results, start=1):
        print(f"\nPrompt {number}:")
        print(f"Query: {result['query']}")
        print(f"Response: {result['response']}")
        for passage in result['context']:
            preview = passage['text'][:100]  # First 100 characters of the passage
            print(f"- Source: {passage['source']}")
            print(f"  Text: {preview}...")
        print("\n" + "-" * 50)