In [None]:
import getpass
import os

from langchain.document_loaders import DirectoryLoader, PDFPlumberLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA


# 2. Configure your OpenAI key
# Never embed the secret in the notebook: reuse OPENAI_API_KEY when it is already
# exported, otherwise prompt for it without echoing it into the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# 3. Load & parse PDF(s)
# The report folder can be overridden through PDF_REPORTS_DIR so the notebook is
# not tied to a single machine; the default is the original location.
REPORTS_DIR = os.environ.get(
    "PDF_REPORTS_DIR",
    r"C:\Users\ambyb\Desktop\Budgeting\Performance_Reports",
)
loader = DirectoryLoader(
    REPORTS_DIR,
    glob="**/*.pdf",           # recursively grab every .pdf
    loader_cls=PDFPlumberLoader,
)
all_documents = loader.load()

# 4. Chunk the documents
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200, separator="\n")
# Split the documents loaded above (the loader result is bound to `all_documents`).
chunks = splitter.split_documents(all_documents)
if not chunks:
    # Fail here with a readable message rather than inside the vector-store build.
    raise ValueError(f"No text could be extracted from the PDFs under {REPORTS_DIR!r}")

# 5. Create embeddings + vector index
embeddings = OpenAIEmbeddings()
vector_index = FAISS.from_documents(chunks, embeddings)

# 6. Build the Retrieval-QA chain
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    # Retrieve the 4 most similar chunks for every question.
    retriever=vector_index.as_retriever(search_kwargs={"k": 4}),
)

# 7. Simple chat loop
def ask_pdf(question: str) -> str:
    """Answer a question using the Retrieval-QA chain built over the PDF index.

    Args:
        question: Natural-language question forwarded to the chain as its
            ``"query"`` input.

    Returns:
        The answer text, i.e. the ``"result"`` entry of the chain output.
    """
    # `Chain.run` is deprecated in LangChain; `invoke` is the supported entry
    # point and returns a dict rather than a bare string.
    return qa.invoke({"query": question})["result"]

if __name__ == "__main__":
    # Interactive loop: read a question, print the chain's answer, repeat until
    # the user types "exit"/"quit" or closes/interrupts the input stream.
    print("PDF-Chatbot ready! (type 'exit' to quit)")
    while True:
        try:
            # Strip so that "exit " or " quit" are still recognised.
            q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or interrupted kernel: stop cleanly, no traceback.
            print()
            break
        if q.lower() in ("exit", "quit"):
            break
        if not q:
            # Nothing to ask; skip instead of sending an empty query to the API.
            continue
        print("Bot:", ask_pdf(q))


In [None]:
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# 1. Load Wikipedia content
loader = WikipediaLoader(query="Philosophy of life", load_max_docs=3)
documents = loader.load()

# 2. Split documents into chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_docs = splitter.split_documents(documents)

# 3. Create vectorstore from documents
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = FAISS.from_documents(split_docs, embedding)
retriever = vectorstore.as_retriever()

# 4. Define prompt template
# {context} receives the retrieved chunks and {question} the user query.
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# 5. Initialize the LLM and QA chain
# temperature=0 keeps the answer as repeatable as possible across re-runs and
# matches the settings used for the PDF chatbot chain.
llm = ChatOpenAI(model="gpt-4.1-nano", temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,   # needed for the source listing in step 7
    chain_type_kwargs={"prompt": PROMPT}
)

# 6. Ask your question
query = "What is life?"
result = qa_chain.invoke({"query": query})

# 7. Print the answer
print("Answer:\n", result["result"])
print("\nSource documents:")
# Several retrieved chunks may come from the same article, so list each title
# once, in retrieval order (dict keys preserve insertion order).
source_titles = dict.fromkeys(
    doc.metadata.get("title", "Unknown") for doc in result["source_documents"]
)
for title in source_titles:
    print(f"- {title}")

In [7]:
## Methodology

This section describes the systematic process by which the AI-driven budget optimization model is developed and evaluated. We proceed in five stages:  
1. Data Collection & Preprocessing  
2. Regression Analysis for Lever Identification  
3. Markov Decision Process (MDP) Formulation  
4. Policy Computation  
5. Monte Carlo Stress-Testing

---

### 1. Data Collection & Preprocessing

1. **Scope & Sources**  
   - Annual budget records for the past 10 years:  
     - Approved Budget $B^{\text{app}}_{t}$  
     - Released Budget $B^{\text{rel}}_{t}$  
     - Actual Expenditure $B^{\text{act}}_{t}$  
   - Macroeconomic and sector indicators: GDP ($G_{t}$), inflation ($\pi_{t}$), and sector performance measures (e.g., enrolment, health outcomes).
2. **Cleaning & Imputation**  
   - Detect gaps in $B^{\text{rel}}_{t}$ or $B^{\text{act}}_{t}$.  
   - Impute missing values via time-series methods (linear interpolation, Kalman filter).  
   - Flag imputed points with the indicator $\delta^{\text{imp}}_{t}\in\{0,1\}$.  
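3. **Feature Engineering**  
   - Compute derived metrics and normalize:  
     $$
       E_{t} = \frac{B_{t}^{\text{act}}}{B_{t}^{\text{app}}}, 
       \quad
       E^{\text{GDP}}_{t} = \frac{B_{t}^{\text{act}}}{G_{t}},
       \quad
       \Delta G_{t} = \frac{G_{t}-G_{t-1}}{G_{t-1}}.
     $$  
     Here $E_{t}$ is the share of the approved budget actually spent, $E^{\text{GDP}}_{t}$ is actual expenditure as a share of GDP, and $\Delta G_{t}$ is the GDP growth rate.  
   - Scale every regressor in $X_{t}$ (defined in Section 2) to zero mean and unit variance.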

---

### 2. Regression Analysis for Lever Identification

1. **Model Specification**  
   We set  
   $$
     Y_{t} = E^{\text{GDP}}_{t}, 
     \quad 
     X_{t} = [\Delta G_{t},\,E_{t-1},\,\delta^{\text{imp}}_{t},\,\dots],
   $$  
   and fit  
   $$
     Y_{t} = \beta_{0} + \sum_{i=1}^{k}\beta_{i}\,X_{t,i} + \varepsilon_{t}, 
     \quad \varepsilon_{t}\sim\mathcal{N}(0,\sigma^{2}).
   $$
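2. **Estimation & Selection**  
   - Use an Elastic Net (L1 + L2 penalties), where $\beta = (\beta_{1},\dots,\beta_{k})^\top$ excludes the intercept $\beta_{0}$:  
     $$
       \min_{\beta_{0},\,\beta}\;\frac{1}{T}\sum_{t=1}^{T}\bigl(Y_{t}-\beta_{0}-\beta^\top X_{t}\bigr)^{2}
       + \lambda_{1}\|\beta\|_{1} + \lambda_{2}\|\beta\|_{2}^{2}.
     $$  
   - Select the regressors with nonzero coefficients $\beta_{i}$ as decision **levers** $a_{i}$.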

---

### 3. Markov Decision Process Formulation

Define the MDP as the tuple $\langle S,A,T,R,\gamma\rangle$:

- **States** $S$: Discrete fiscal states (Under-Spend, On-Track, Over-Commit).
- **Actions** $A$: Budget moves derived from $\{\beta_i\}$, e.g. “Increase Sector $i$ by 5%.”
- **Transition** $T(s'\!\mid s,a)$:  
  Estimate the next outcome $\hat Y_{t+1}$ via the regression, then map it into state $s'$.  
- **Reward** $R(s,a)$:  
  $$
    R(s,a) = w_{1}\,E_{t+1} \;-\; w_{2}\,\mathbf{1}\{E_{t+1}<\alpha\},
  $$  
  with weights $w_{1}, w_{2} \ge 0$ and threshold $\alpha$.  
- **Discount** $\gamma$: e.g. 0.95.

---

### 4. Policy Computation

Solve for the optimal policy $\pi^{*}:S\to A$ via **Value Iteration**:

```text
Initialize V₀(s) = 0  for all s ∈ S
Repeat until ‖Vₖ₊₁ - Vₖ‖∞ < ε:
  For each s ∈ S:
    Vₖ₊₁(s) = maxₐ [ R(s,a) + γ ∑ₛ' T(s'|s,a) Vₖ(s') ]
V* ← Vₖ₊₁   (converged value function)
π*(s) = argmaxₐ [ R(s,a) + γ ∑ₛ' T(s'|s,a) V*(s') ]
```

SyntaxError: invalid syntax (1801370208.py, line 4)