In [2]:
# Root directory whose immediate subfolders contain the PDF files to ingest.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
pdf_folder_path = "/Users/badekale/Documents/Hamoyeew/Subfolders"

In [3]:
import os
# Names of the immediate sub-directories of the PDF root, in alphabetical order.
# Note: the number printed below is the count of sub-directories.
subfolders = sorted(
    entry
    for entry in os.listdir(pdf_folder_path)
    if os.path.isdir(os.path.join(pdf_folder_path, entry))
)
print(f"Total files in the folder: {len(subfolders)}")

Total files in the folder: 52


In [4]:
import os
from langchain_nomic import NomicEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_nomic import NomicEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.chains import RetrievalQA
from IPython.display import Markdown
import re

In [5]:
# Re-list the sub-directories of the PDF root (unsorted; ordering is applied further down).
subfolders = list(
    filter(
        lambda name: os.path.isdir(os.path.join(pdf_folder_path, name)),
        os.listdir(pdf_folder_path),
    )
)

# Sort key that groups folders by conference type (COP, CMP, CMA), highest number first
def custom_sort_key(folder_name):
    """Build a sort key for a conference folder name.

    Names starting with ``COP``, ``CMP`` or ``CMA`` followed by digits map to
    ``(rank, -number)`` where rank is 0, 1 or 2 respectively; negating the
    number makes an ascending sort list higher numbers first. Any other name
    maps to ``(3, folder_name)`` so it sorts after all recognised folders.
    """
    type_rank = {"COP": 0, "CMP": 1, "CMA": 2}

    parsed = re.match(r"(COP|CMP|CMA)(\d+)", folder_name)
    if not parsed:
        # Unrecognised name: fall back to ordering by the name itself
        return (3, folder_name)

    prefix, digits = parsed.groups()
    return (type_rank[prefix], -int(digits))

# Split the subfolders into one bucket per conference type (substring match on the
# folder name) and order each bucket with custom_sort_key (highest number first).
cop_subfolders = sorted((name for name in subfolders if "COP" in name), key=custom_sort_key)
cmp_subfolders = sorted((name for name in subfolders if "CMP" in name), key=custom_sort_key)
cma_subfolders = sorted((name for name in subfolders if "CMA" in name), key=custom_sort_key)

# Interleave the buckets round-robin (COP[i], CMP[i], CMA[i], ...);
# a bucket that has run out of entries is simply skipped for that round.
final_order = []
max_length = max(len(cop_subfolders), len(cmp_subfolders), len(cma_subfolders))
for i in range(max_length):
    for bucket in (cop_subfolders, cmp_subfolders, cma_subfolders):
        if i < len(bucket):
            final_order.append(bucket[i])

# Accumulator that load_pdfs_from_subfolders extends with the loaded documents
documents = list()

# Running count of PDF files ingested, incremented by load_pdfs_from_subfolders
pdf_count = 0

# Function to load and process PDFs from each subfolder
def load_pdfs_from_subfolders(pdf_folder_path, final_order):
    """Load every PDF found in the given subfolders into the module-level ``documents`` list.

    Subfolders are visited in the order given; within each subfolder the PDF
    files are processed in sorted filename order. Each file is loaded with
    ``PyPDFLoader`` and whatever ``loader.load()`` returns is appended to
    ``documents``. The module-level ``pdf_count`` is incremented once per file.

    Args:
        pdf_folder_path: Directory that contains the subfolders.
        final_order: Iterable of subfolder names, in processing order.

    Returns:
        None. Results are accumulated in the globals ``documents`` and ``pdf_count``.
    """
    global pdf_count

    for subfolder in final_order:
        # Get the path for the current subfolder
        subfolder_path = os.path.join(pdf_folder_path, subfolder)

        # List all PDFs in the subfolder and sort them. The extension is matched
        # case-insensitively so files named e.g. "REPORT.PDF" are not silently skipped.
        files_in_subfolder = sorted(
            file for file in os.listdir(subfolder_path) if file.lower().endswith('.pdf')
        )

        # Print the subfolder name being processed
        print(f"Processing folder: {subfolder}")

        # Load each PDF in the subfolder
        for file_name in files_in_subfolder:
            pdf_path = os.path.join(subfolder_path, file_name)
            loader = PyPDFLoader(pdf_path)

            # Add everything the loader returns for this file to the shared list
            documents.extend(loader.load())

            # Increment the PDF counter (declared global once, at the top of the function)
            pdf_count += 1
            print(f'Loaded and processed: {file_name}')

# Load PDFs from the sorted subfolders; this fills the module-level `documents`
# list and updates `pdf_count` as a side effect (the call itself returns nothing).
load_pdfs_from_subfolders(pdf_folder_path, final_order)

Processing folder: COP28
Loaded and processed: cp2023_11a01E.pdf
Loaded and processed: cp2023_11a02E.pdf
Processing folder: CMP18
Loaded and processed: cmp2023_09a01E.pdf
Processing folder: CMA5
Loaded and processed: cma2023_16a01E.pdf
Loaded and processed: cma2023_16a02E.pdf
Loaded and processed: cma2023_16a03E.pdf
Processing folder: COP27
Loaded and processed: cp2022_10a01_E.pdf
Loaded and processed: cp2022_10a02E.pdf
Loaded and processed: cp2022_10a03E.pdf
Processing folder: CMP17
Loaded and processed: cmp2022_09a01E.pdf
Processing folder: CMA4
Loaded and processed: cma2022_10_a01E.pdf
Loaded and processed: cma2022_10a03E.pdf
Loaded and processed: cma2023_10a02E.pdf
Processing folder: COP26
Loaded and processed: cp2021_12_add1E.pdf
Loaded and processed: cp2021_12a02E.pdf
Processing folder: CMP16
Loaded and processed: cmp2021_08_add1E.pdf
Processing folder: CMA3
Loaded and processed: CMA2021_10_Add3_E.pdf
Loaded and processed: CMA2021_L10a2E.pdf
Loaded and processed: cma2021_10a01E.p

In [7]:
# Report how many PDF files the ingestion step counted.
print(f"Total number of PDFs ingested: {pdf_count}")
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
# NOTE(review): sizes are presumably measured in characters (splitter default) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_splitter.split_documents(documents)
print(f'Total number of document chunks: {len(chunks)}')

Total number of PDFs ingested: 91
Total number of document chunks: 12339


In [8]:
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings

In [9]:
from langchain_ollama import OllamaLLM

# LLM that the RAG chain uses to generate answers (passed as `llm` to RetrievalQA below).
ollama = OllamaLLM(
    base_url="http://localhost:11434",  # Ollama server on the local machine
    model="llama3.2:3b",  
    temperature=0,  
)

In [10]:
from langchain_ollama import OllamaEmbeddings
# Embedding model handed to Chroma when the vector store is built.
embeddings_model = OllamaEmbeddings(
    model="nomic-embed-text:latest",
)

In [11]:
# Build the Chroma vector store from the split chunks and persist it to disk.
# The chunks produced by the text splitter are indexed (not the raw loaded
# documents), so every embedded and retrieved unit is bounded by the splitter's
# chunk size.
# NOTE(review): persist_directory is an absolute, machine-specific path.
vectorstore = Chroma.from_documents(
    documents=chunks,
    collection_name="rag-chroma",
    embedding=embeddings_model,
    persist_directory='/Users/badekale/Documents/Hamoyeew/chroma001'
)

In [None]:
# Retriever over the vector store, configured with search_kwargs k=2
# (presumably two results per query — confirm against the vector store docs).
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
# Question-answering chain of type "stuff" that combines the retriever with the
# `ollama` LLM. With return_source_documents=False the retrieved documents are
# not requested in the chain output.
rag_chain = RetrievalQA.from_chain_type(
    llm=ollama,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False
)

In [None]:
# Evaluation prompts, one string per list element.
# Each entry must end with a comma: adjacent string literals without a comma are
# implicitly concatenated by Python into a single string.
questions = [
    "Generate potential future climate policy scenarios for Sub-Saharan Africa that focus on accelerating renewable energy adoption. Consider policy incentives, grid integration challenges, and mechanisms for ensuring equitable access to clean energy.",
    "Develop climate policy scenarios in which Sub-Saharan Africa prioritizes energy efficiency as a pillar of sustainable development. Include regulatory frameworks, financial incentives, and projected long-term benefits.",
    "Propose future policy pathways that drive equitable access to renewable energy solutions in off-grid communities across Sub-Saharan Africa. Explore community-led initiatives, financing models, and regulatory support.",
    "Develop climate policy scenarios where Sub-Saharan Africa prioritizes green hydrogen as a key energy export. Explore how governments might incentivize production, build supporting infrastructure, and navigate geopolitical opportunities and risks in global energy markets.",
    "Imagine future climate policy scenarios where Sub-Saharan Africa invests heavily in research and development of new climate technologies. Highlight key focus areas, potential breakthroughs, and the role of international collaboration.",
    "Generate scenarios in which artificial intelligence (AI) and digital technologies are integrated into climate policy implementation across Sub-Saharan Africa. Consider applications in emissions monitoring, predictive modeling, and adaptive policymaking.",
    "Explore different climate policy scenarios in which Sub-Saharan Africa reduces fossil fuel dependency and transitions to a clean energy economy. Consider pathways that emphasize economic diversification, workforce retraining initiatives, and regulatory reforms.",
    "Develop climate policy scenarios in which Sub-Saharan Africa adopts carbon pricing and emissions trading as primary tools for reducing greenhouse gas emissions. Consider potential economic benefits, implementation hurdles, and regional cooperation models.",
    "Explore policy scenarios where Sub-Saharan Africa integrates circular economy principles into energy policy to promote sustainability. Discuss waste-to-energy innovations, material reuse, and industrial symbiosis.",
    "Explore viable climate policy scenarios in which Sub-Saharan Africa adopts nuclear energy as part of its energy transition. Discuss key regional and international partnerships (e.g., IAEA, COP agreements), infrastructure development needs, and financial feasibility.",
    "Develop climate policy scenarios for reducing transportation-related emissions in Sub-Saharan Africa. Discuss policy incentives for electric vehicle adoption, challenges in urban transport planning, and the role of biofuels and hydrogen as alternative energy sources.",
    "Imagine future scenarios where decentralized energy efficiency initiatives significantly reduce energy poverty in Sub-Saharan Africa. Highlight local innovations, policy enablers, and community-driven approaches.",
    "Develop potential future climate policy scenarios for Sub-Saharan Africa that emphasize sustainable land use and afforestation. Consider incentives for reforestation, carbon sequestration policies, and indigenous land management practices.",
    "Generate possible future scenarios where Sub-Saharan Africa accelerates climate finance through international funding mechanisms. Consider public-private partnerships, sovereign green bonds, and risk mitigation strategies.",
    "Propose detailed policy scenarios for Sub-Saharan Africa that strengthen institutional capacity for climate adaptation. Address governance reforms, inter-agency coordination, and financial resource allocation.",
    "Explore future policy pathways where Sub-Saharan Africa mainstreams climate adaptation into national development plans. Discuss cross-sectoral collaboration, funding integration, and legislative frameworks.",
    "Explore policy pathways where indigenous knowledge and local community-led solutions shape climate adaptation strategies in Sub-Saharan Africa. Consider land rights, traditional resource management practices, and equitable governance structures.",
    "Generate policy scenarios where empowering local communities through climate education and advocacy leads to stronger grassroots climate action. Include capacity-building initiatives, knowledge-sharing platforms, and policy uptake metrics.",
    "Imagine future climate policy scenarios where capacity building for climate governance strengthens institutional responses to climate challenges in Sub-Saharan Africa. Consider skills development, stakeholder collaboration, and policy implementation.",
    "Please generate potential future policy scenarios for Sub-Saharan Africa that focus on building climate-resilient infrastructure—particularly around water management and disaster risk reduction. Include plausible timeframes, critical actors, and key funding mechanisms.",
    "Propose detailed policy scenarios for Sub-Saharan Africa focusing on the health sector's adaptation to climate-induced challenges, such as heatwaves, vector-borne diseases, and flood-related health crises. Include metrics and expected outcomes.",
    "Generate possible future scenarios in which Sub-Saharan Africa accelerates green finance for large-scale renewable energy projects. Consider international donors, private capital, and novel funding mechanisms, and explain major opportunities and risks.",
    "What are potential future climate policy pathways in Sub-Saharan Africa if carbon markets and offset schemes become mainstream? Outline how governments, regional bodies, and local communities might participate or benefit.",
    "Imagine several policy scenarios in which external technology transfer accelerates. How might local institutions be strengthened to adopt, maintain, and innovate on climate-related technologies (e.g., solar, wind, climate-smart agriculture)?",
    "Generate a set of climate policy scenarios for Sub-Saharan Africa that show how collaboration between local, national, and regional bodies might evolve. Include considerations of political will, stakeholder conflicts, and resource allocation.",
    "Develop future policy pathways in which local communities and grassroots movements play a pivotal role in shaping national climate strategies. Consider equity issues, social inclusion, and mechanisms for ensuring marginalized groups have a voice.",
    "Propose climate policy scenarios addressing transboundary resource management (e.g., shared water basins, pastoral lands) in Sub-Saharan Africa. Explore how climate stressors could exacerbate conflict or incentivize deeper regional cooperation.",
    "Please propose climate policy scenarios that prioritize mitigation efforts with the largest public-health co-benefits, such as reducing indoor air pollution from traditional biomass cooking. Include measurable outcomes and ethical considerations.",
    "Describe a range of worst-case business as usual climate policy scenarios for Sub-Saharan Africa. Consider political inertia, minimal international support, and accelerating climate impacts, and explore the long-term social and economic consequences.",
    "Generate scenarios in which Sub-Saharan Africa experiences more extreme climate impacts than currently predicted. How might governments, communities, and private actors innovate or pivot policy approaches in this high-risk future?",
]

In [None]:
# Prepare generated responses to conform to the RAGAS framework:
# one answer and one list of retrieved context strings per question.

from datasets import Dataset

answers = []
contexts = []

# Traverse each question: record what the retriever returns for it, then ask the RAG chain.
# The contexts come from this separate retriever call, not from the chain's output.
for question in questions:
    # `invoke` is the supported retriever entry point; `get_relevant_documents` is deprecated.
    relevant_docs = retriever.invoke(question)

    # Extract the `page_content` from the retrieved documents as a list of strings
    formatted_contexts = [doc.page_content for doc in relevant_docs]

    # Get the response from the RAG chain
    response = rag_chain.invoke(question)

    # Keep only the 'result' field when the chain returns a dict; otherwise store the response as-is
    if isinstance(response, dict) and "result" in response:
        answers.append(response["result"])
    else:
        answers.append(response)

    # Appended after the answer so both lists stay aligned if the chain call raises
    contexts.append(formatted_contexts)

# Prepare the dataset: three parallel columns indexed by question position
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,  # Each entry is itself a list of strings
}

In [27]:
# Convert the question/answer/contexts columns in `data` to a HuggingFace Dataset
dataset = Dataset.from_dict(data)

In [None]:
# Validate dataset structure by displaying the row at index 1
# (this requires the dataset to contain at least two rows).
dataset[1]

In [None]:
# Display the dataset as a pandas DataFrame for visual inspection
dataset.to_pandas()

In [31]:
from ragas import evaluate
from ragas.run_config import RunConfig
from ragas.metrics import (
    Faithfulness,
    ContextUtilization,
    AnswerRelevancy
)

In [84]:
# Evaluator LLM (mistral:7b) passed as `llm` to ragas `evaluate` in the first evaluation loop.
# NOTE(review): whether OllamaLLM honors `timeout` and `disable_streaming` depends on the
# installed langchain_ollama version — confirm.
llm_mistral = OllamaLLM(
    model="mistral:7b",
    verbose=False,
    timeout=150,  
    num_ctx=5000,  
    disable_streaming=False,
    temperature=0  
)

In [None]:
# Evaluator LLM (gemma2:2b) passed as `llm` to ragas `evaluate` in the second evaluation loop.
# Same settings as llm_mistral except for the model name and a timeout of 600 instead of 150.
# NOTE(review): whether OllamaLLM honors `timeout` and `disable_streaming` depends on the
# installed langchain_ollama version — confirm.
llm_gemma2 = OllamaLLM(
    model="gemma2:2b",
    verbose=False,
    timeout=600,  
    num_ctx=5000,  
    disable_streaming=False,
    temperature=0  
)

In [34]:
# Embeddings passed to ragas `evaluate` in both evaluation loops
# (same model name as `embeddings_model` used for the vector store).
embeddings_nomic = OllamaEmbeddings(model="nomic-embed-text:latest")

In [None]:
#Evaluate prompt responses one by one (mistral as the evaluator LLM)

evaluation_results = []

# Score every dataset row in its own `evaluate` call
for i, sample in enumerate(dataset):
    print(f"Evaluating sample {i+1}/{len(dataset)}...")  

    # Wrap each field of the row in a one-element list to form a single-row dataset
    single_sample_dataset = Dataset.from_dict(
        {column: [sample[column]] for column in ("question", "answer", "contexts")}
    )

    # Run the three metrics on this row only
    result_mistral = evaluate(
        dataset=single_sample_dataset,
        metrics=[Faithfulness(), ContextUtilization(), AnswerRelevancy()],
        llm=llm_mistral,
        embeddings=embeddings_nomic,
        run_config=RunConfig(max_workers=16, timeout=600, max_retries=5, max_wait=20, log_tenacity=True)
    )

    # First score of each metric, or the string "NaN" when the metric's score list is empty
    scores = {}
    for metric in result_mistral._scores_dict:
        metric_values = result_mistral[metric]
        scores[metric] = metric_values[0] if metric_values else "NaN"

    # Assemble the output row: the sample's text fields followed by its metric scores
    row = {
        "Question": sample["question"],
        "Answer": sample["answer"],
        "Contexts": " | ".join(sample["contexts"]),
    }
    for column_name, score_key in (
        ("Faithfulness", "faithfulness"),
        ("ContextUtilization", "context_utilization"),
        ("AnswerRelevancy", "answer_relevancy"),
    ):
        row[column_name] = scores.get(score_key, "NaN")
    evaluation_results.append(row)

    print(f"Sample {i+1} evaluation completed.\n")


# Keep the mistral results under their own name before the list name is reused
evaluated_data = evaluation_results
print("Evaluation completed. Ready to save results.")


In [None]:
# Evaluate prompt responses one by one, this time with gemma2 as the evaluator LLM
evaluation_results = []

# Iterate through each sample in the dataset and evaluate it separately
for i, sample in enumerate(dataset):
    print(f"Evaluating sample {i+1}/{len(dataset)}...")  

    # Create a single-item dataset for this sample
    single_sample_dataset = Dataset.from_dict({
        "question": [sample["question"]],
        "answer": [sample["answer"]],
        "contexts": [sample["contexts"]]
    })

    # Perform evaluation on the single sample
    result_gemma2 = evaluate(
        dataset=single_sample_dataset,
        metrics=[Faithfulness(), ContextUtilization(), AnswerRelevancy()],
        llm=llm_gemma2,
        embeddings=embeddings_nomic,
        run_config=RunConfig(max_workers=16, timeout=600, max_retries=5, max_wait=20, log_tenacity=True)
    )

    # Extract metric scores from THIS run's result (result_gemma2). Reading them from
    # result_mistral would record stale scores left over from the mistral evaluation loop.
    scores = {metric: result_gemma2[metric][0] if result_gemma2[metric] else "NaN" for metric in result_gemma2._scores_dict.keys()}

    # Prepare a row for storing the result
    evaluation_results.append({
        "Question": sample["question"],
        "Answer": sample["answer"],
        "Contexts": " | ".join(sample["contexts"]),
        "Faithfulness": scores.get("faithfulness", "NaN"),
        "ContextUtilization": scores.get("context_utilization", "NaN"),
        "AnswerRelevancy": scores.get("answer_relevancy", "NaN")
    })

    print(f"Sample {i+1} evaluation completed.\n")


# Keep the gemma2 results under their own name
evaluated_data2 = evaluation_results
print("Evaluation completed. Ready to save results.")


In [None]:
import pandas as pd

# Tabulate the rows collected in `evaluated_data` (the mistral evaluation run)
df = pd.DataFrame(evaluated_data)

# Preview the first rows
df.head()  

In [None]:
import os
import csv

# Destination folder for the results file (created on demand if missing)
folder_path = os.path.expanduser("/Users/badekale/Documents/Hamoyeew/SUB")
os.makedirs(folder_path, exist_ok=True)

# Full path of the CSV file inside that folder
output_file = os.path.join(folder_path, "evaluation_results.csv")

# Column order of the CSV; these match the keys of each row in `evaluated_data`
header = ["Question", "Answer", "Contexts", "Faithfulness", "ContextUtilization", "AnswerRelevancy"]

# Write the header line followed by one line per evaluated sample.
# Only `evaluated_data` is written here.
with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=header)
    writer.writeheader()
    for row in evaluated_data:
        writer.writerow(row)

print(f"All evaluation results saved to {output_file}")